diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ec3e33ad9fe..36e6ed8afa7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,6 @@ set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name set(LLVM_ALL_TARGETS AArch64 - ARM64 ARM CppBackend Hexagon @@ -144,7 +143,7 @@ set(LLVM_ALL_TARGETS ) # List of targets with JIT support: -set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM64 ARM Mips SystemZ) +set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ) set(LLVM_TARGETS_TO_BUILD "all" CACHE STRING "Semicolon-separated list of targets to build, or \"all\".") @@ -235,7 +234,7 @@ option(LLVM_USE_OPROFILE # If enabled, verify we are on a platform that supports oprofile. if( LLVM_USE_OPROFILE ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) - message(FATAL_ERROR "OProfile support is available on Linux only.") + message(FATAL_ERROR "OProfile support is available on Linux only.") endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) endif( LLVM_USE_OPROFILE ) @@ -245,6 +244,9 @@ set(LLVM_USE_SANITIZER "" CACHE STRING option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm." OFF) +option(WITH_POLLY "Build LLVM with Polly" ON) +option(LINK_POLLY_INTO_TOOLS "Static link Polly into tools" OFF) + # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) @@ -516,9 +518,6 @@ endif() add_subdirectory(projects) -option(WITH_POLLY "Build LLVM with Polly" ON) -option(LINK_POLLY_INTO_TOOLS "Static link Polly into tools" OFF) - if(WITH_POLLY) if(NOT EXISTS ${LLVM_MAIN_SRC_DIR}/tools/polly/CMakeLists.txt) set(WITH_POLLY OFF) @@ -584,6 +583,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) PATTERN "*.inc" # Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def" PATTERN "CMakeFiles" EXCLUDE + PATTERN "config.h" EXCLUDE PATTERN ".svn" EXCLUDE ) endif() diff --git a/CREDITS.TXT b/CREDITS.TXT index 311a661a7546..0447c40e381b 100644 --- a/CREDITS.TXT +++ b/CREDITS.TXT @@ -107,6 +107,10 @@ N: Rafael Avila de Espindola E: rafael.espindola@gmail.com D: The ARM backend +N: Dave Estes +E: cestes@codeaurora.org +D: AArch64 machine description for Cortex-A53 + N: Alkis Evlogimenos E: alkis@evlogimenos.com D: Linear scan register allocator, many codegen improvements, Java frontend @@ -162,10 +166,12 @@ D: Improvements for space efficiency N: James Grosbach E: grosbach@apple.com +I: grosbach D: SjLj exception handling support D: General fixes and improvements for the ARM back-end D: MCJIT D: ARM integrated assembler and assembly parser +D: Led effort for the backend formerly known as ARM64 N: Lang Hames E: lhames@gmail.com @@ -339,6 +345,10 @@ D: LTO tool, PassManager rewrite, Loop Pass Manager, Loop Rotate D: GCC PCH Integration (llvm-gcc), llvm-gcc improvements D: Optimizer improvements, Loop Index Split +N: Ana Pazos +E: apazos@codeaurora.org +D: Fixes and improvements to the AArch64 backend + N: Wesley Peck E: peckw@wesleypeck.com W: http://wesleypeck.com/ @@ -368,8 +378,10 @@ D: ARM calling conventions rewrite, hard float support N: Chad Rosier E: mcrosier@codeaurora.org -D: ARM fast-isel improvements -D: Performance monitoring +I: mcrosier +D: AArch64 fast instruction selection pass +D: Fixes and improvements to the ARM fast-isel pass +D: Fixes and improvements to the AArch64 backend N: Nadav Rotem E: nrotem@apple.com diff --git a/Makefile.rules b/Makefile.rules index 941797107c78..ff0a3e3f8191 100644 --- a/Makefile.rules +++ b/Makefile.rules @@ -377,6 +377,7 @@ ifeq 
($(ENABLE_COVERAGE),1) BuildMode := $(BuildMode)+Coverage CXX.Flags += -ftest-coverage -fprofile-arcs C.Flags += -ftest-coverage -fprofile-arcs + LD.Flags += -ftest-coverage -fprofile-arcs endif # If DISABLE_ASSERTIONS=1 is specified (make command line or configured), diff --git a/autoconf/configure.ac b/autoconf/configure.ac index 6b9c17ae4054..a1c2ac52e202 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -419,7 +419,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch], amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;; sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; - arm64*-*) llvm_cv_target_arch="ARM64" ;; + arm64*-*) llvm_cv_target_arch="AArch64" ;; arm*-*) llvm_cv_target_arch="ARM" ;; aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; @@ -455,7 +455,7 @@ case $host in amd64-* | x86_64-*) host_arch="x86_64" ;; sparc*-*) host_arch="Sparc" ;; powerpc*-*) host_arch="PowerPC" ;; - arm64*-*) host_arch="ARM64" ;; + arm64*-*) host_arch="AArch64" ;; arm*-*) host_arch="ARM" ;; aarch64*-*) host_arch="AArch64" ;; mips-* | mips64-*) host_arch="Mips" ;; @@ -786,7 +786,6 @@ else PowerPC) AC_SUBST(TARGET_HAS_JIT,1) ;; x86_64) AC_SUBST(TARGET_HAS_JIT,1) ;; ARM) AC_SUBST(TARGET_HAS_JIT,1) ;; - AArch64) AC_SUBST(TARGET_HAS_JIT,0) ;; Mips) AC_SUBST(TARGET_HAS_JIT,1) ;; XCore) AC_SUBST(TARGET_HAS_JIT,0) ;; MSP430) AC_SUBST(TARGET_HAS_JIT,0) ;; @@ -797,7 +796,7 @@ else esac fi -TARGETS_WITH_JIT="AArch64 ARM ARM64 Mips PowerPC SystemZ X86" +TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86" AC_SUBST(TARGETS_WITH_JIT,$TARGETS_WITH_JIT) dnl Allow enablement of building and installing docs @@ -950,7 +949,7 @@ if test "$llvm_cv_enable_crash_overrides" = "yes" ; then fi dnl List all possible targets -ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" AC_SUBST(ALL_TARGETS,$ALL_TARGETS) dnl Allow specific targets to be specified for building (or not) @@ -972,7 +971,7 @@ case "$enableval" in sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; @@ -1283,16 +1282,6 @@ AC_PATH_PROG(TAR, [tar], [gtar]) AC_PATH_PROG(BINPWD,[pwd], [pwd]) dnl Looking for misc. 
graph plotting software -AC_PATH_PROG(GRAPHVIZ, [Graphviz], [echo Graphviz]) -if test "$GRAPHVIZ" != "echo Graphviz" ; then - AC_DEFINE([HAVE_GRAPHVIZ],[1],[Define if the Graphviz program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - GRAPHVIZ=`echo $GRAPHVIZ | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_GRAPHVIZ],"$GRAPHVIZ${EXEEXT}", - [Define to path to Graphviz program if found or 'echo Graphviz' otherwise]) -fi AC_PATH_PROG(DOT, [dot], [echo dot]) if test "$DOT" != "echo dot" ; then AC_DEFINE([HAVE_DOT],[1],[Define if the dot program is available]) @@ -1303,76 +1292,6 @@ if test "$DOT" != "echo dot" ; then AC_DEFINE_UNQUOTED([LLVM_PATH_DOT],"$DOT${EXEEXT}", [Define to path to dot program if found or 'echo dot' otherwise]) fi -AC_PATH_PROG(FDP, [fdp], [echo fdp]) -if test "$FDP" != "echo fdp" ; then - AC_DEFINE([HAVE_FDP],[1],[Define if the neat program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - FDP=`echo $FDP | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_FDP],"$FDP${EXEEXT}", - [Define to path to fdp program if found or 'echo fdp' otherwise]) -fi -AC_PATH_PROG(NEATO, [neato], [echo neato]) -if test "$NEATO" != "echo neato" ; then - AC_DEFINE([HAVE_NEATO],[1],[Define if the neat program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - NEATO=`echo $NEATO | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_NEATO],"$NEATO${EXEEXT}", - [Define to path to neato program if found or 'echo neato' otherwise]) -fi -AC_PATH_PROG(TWOPI, [twopi], [echo twopi]) -if test "$TWOPI" != "echo twopi" ; then - AC_DEFINE([HAVE_TWOPI],[1],[Define if the neat program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - TWOPI=`echo $TWOPI | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_TWOPI],"$TWOPI${EXEEXT}", - [Define to path to twopi program if found or 'echo twopi' otherwise]) -fi -AC_PATH_PROG(CIRCO, [circo], [echo circo]) -if test "$CIRCO" != "echo circo" ; then - AC_DEFINE([HAVE_CIRCO],[1],[Define if the neat program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - CIRCO=`echo $CIRCO | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_CIRCO],"$CIRCO${EXEEXT}", - [Define to path to circo program if found or 'echo circo' otherwise]) -fi -AC_PATH_PROGS(GV, [gv gsview32], [echo gv]) -if test "$GV" != "echo gv" ; then - AC_DEFINE([HAVE_GV],[1],[Define if the gv program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - GV=`echo $GV | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_GV],"$GV${EXEEXT}", - [Define to path to gv program if found or 'echo gv' otherwise]) -fi -AC_PATH_PROG(DOTTY, [dotty], [echo dotty]) -if test "$DOTTY" != "echo dotty" ; then - AC_DEFINE([HAVE_DOTTY],[1],[Define if the dotty program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - DOTTY=`echo $DOTTY | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - 
AC_DEFINE_UNQUOTED([LLVM_PATH_DOTTY],"$DOTTY${EXEEXT}", - [Define to path to dotty program if found or 'echo dotty' otherwise]) -fi -AC_PATH_PROGS(XDOT, [xdot xdot.py], [echo xdot]) -if test "$XDOT" != "echo xdot" ; then - AC_DEFINE([HAVE_XDOT],[1],[Define if the xdot program is available]) - dnl If we're targeting for mingw we should emit windows paths, not msys - if test "$llvm_cv_os_type" = "MingW" ; then - XDOT=`echo $XDOT | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' ` - fi - AC_DEFINE_UNQUOTED([LLVM_PATH_XDOT],"$XDOT${EXEEXT}", - [Define to path to xdot program if found or 'echo xdot' otherwise]) -fi dnl Find the install program AC_PROG_INSTALL diff --git a/autoconf/m4/link_options.m4 b/autoconf/m4/link_options.m4 index b58d61745f97..abf6596f7c6f 100644 --- a/autoconf/m4/link_options.m4 +++ b/autoconf/m4/link_options.m4 @@ -6,7 +6,7 @@ AC_DEFUN([AC_LINK_GET_VERSION], [AC_CACHE_CHECK([for linker version],[llvm_cv_link_version], [ - version_string="$(ld -v 2>&1 | head -1)" + version_string="$(${LD:-ld} -v 2>&1 | head -1)" # Check for ld64. if (echo "$version_string" | grep -q "ld64"); then diff --git a/autoconf/m4/path_tclsh.m4 b/autoconf/m4/path_tclsh.m4 deleted file mode 100644 index 85433de71cc5..000000000000 --- a/autoconf/m4/path_tclsh.m4 +++ /dev/null @@ -1,39 +0,0 @@ -dnl This macro checks for tclsh which is required to run dejagnu. On some -dnl platforms (notably FreeBSD), tclsh is named tclshX.Y - this handles -dnl that for us so we can get the latest installed tclsh version. -dnl -AC_DEFUN([DJ_AC_PATH_TCLSH], [ -no_itcl=true -AC_MSG_CHECKING(for the tclsh program in tclinclude directory) -AC_ARG_WITH(tclinclude, - AS_HELP_STRING([--with-tclinclude], - [directory where tcl headers are]), - [with_tclinclude=${withval}],[with_tclinclude='']) -AC_CACHE_VAL(ac_cv_path_tclsh,[ -dnl first check to see if --with-itclinclude was specified -if test x"${with_tclinclude}" != x ; then - if test -f ${with_tclinclude}/tclsh ; then - ac_cv_path_tclsh=`(cd ${with_tclinclude}; pwd)` - elif test -f ${with_tclinclude}/src/tclsh ; then - ac_cv_path_tclsh=`(cd ${with_tclinclude}/src; pwd)` - else - AC_MSG_ERROR([${with_tclinclude} directory doesn't contain tclsh]) - fi -fi]) - -dnl see if one is installed -if test x"${ac_cv_path_tclsh}" = x ; then - AC_MSG_RESULT(none) - AC_PATH_PROGS([TCLSH],[tclsh8.4 tclsh8.4.8 tclsh8.4.7 tclsh8.4.6 tclsh8.4.5 tclsh8.4.4 tclsh8.4.3 tclsh8.4.2 tclsh8.4.1 tclsh8.4.0 tclsh8.3 tclsh8.3.5 tclsh8.3.4 tclsh8.3.3 tclsh8.3.2 tclsh8.3.1 tclsh8.3.0 tclsh]) - if test x"${TCLSH}" = x ; then - ac_cv_path_tclsh=''; - else - ac_cv_path_tclsh="${TCLSH}"; - fi -else - AC_MSG_RESULT(${ac_cv_path_tclsh}) - TCLSH="${ac_cv_path_tclsh}" - AC_SUBST(TCLSH) -fi -]) - diff --git a/bindings/ocaml/llvm/llvm.ml b/bindings/ocaml/llvm/llvm.ml index d36f360bf65f..39875a5ed488 100644 --- a/bindings/ocaml/llvm/llvm.ml +++ b/bindings/ocaml/llvm/llvm.ml @@ -16,6 +16,7 @@ type lluse type llbasicblock type llbuilder type llmemorybuffer +type llmdkind module TypeKind = struct type t = @@ -299,7 +300,7 @@ type ('a, 'b) llrev_pos = external create_context : unit -> llcontext = "llvm_create_context" external dispose_context : llcontext -> unit = "llvm_dispose_context" external global_context : unit -> llcontext = "llvm_global_context" -external mdkind_id : llcontext -> string -> int = "llvm_mdkind_id" +external mdkind_id : llcontext -> string -> llmdkind = "llvm_mdkind_id" (*===-- Modules -----------------------------------------------------------===*) external create_module : llcontext -> string -> llmodule = 
"llvm_create_module" @@ -442,9 +443,9 @@ external constexpr_opcode : llvalue -> Opcode.t = "llvm_constexpr_get_opcode" (*--... Operations on instructions .........................................--*) external has_metadata : llvalue -> bool = "llvm_has_metadata" -external metadata : llvalue -> int -> llvalue option = "llvm_metadata" -external set_metadata : llvalue -> int -> llvalue -> unit = "llvm_set_metadata" -external clear_metadata : llvalue -> int -> unit = "llvm_clear_metadata" +external metadata : llvalue -> llmdkind -> llvalue option = "llvm_metadata" +external set_metadata : llvalue -> llmdkind -> llvalue -> unit = "llvm_set_metadata" +external clear_metadata : llvalue -> llmdkind -> unit = "llvm_clear_metadata" (*--... Operations on metadata .......,.....................................--*) external mdstring : llcontext -> string -> llvalue = "llvm_mdstring" diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli index e99612179345..59a89db5f489 100644 --- a/bindings/ocaml/llvm/llvm.mli +++ b/bindings/ocaml/llvm/llvm.mli @@ -48,6 +48,9 @@ type llbuilder See the [llvm::MemoryBuffer] class. *) type llmemorybuffer +(** The kind id of metadata attached to an instruction. *) +type llmdkind + (** The kind of an [lltype], the result of [classify_type ty]. See the [llvm::Type::TypeID] enumeration. *) module TypeKind : sig @@ -392,7 +395,7 @@ val global_context : unit -> llcontext (** [mdkind_id context name] returns the MDKind ID that corresponds to the name [name] in the context [context]. See the function [llvm::LLVMContext::getMDKindID]. *) -val mdkind_id : llcontext -> string -> int +val mdkind_id : llcontext -> string -> llmdkind (** {6 Modules} *) @@ -770,15 +773,15 @@ val has_metadata : llvalue -> bool (** [metadata i kind] optionally returns the metadata associated with the kind [kind] in the instruction [i] See the function [llvm::Instruction::getMetadata]. *) -val metadata : llvalue -> int -> llvalue option +val metadata : llvalue -> llmdkind -> llvalue option (** [set_metadata i kind md] sets the metadata [md] of kind [kind] in the instruction [i]. See the function [llvm::Instruction::setMetadata]. *) -val set_metadata : llvalue -> int -> llvalue -> unit +val set_metadata : llvalue -> llmdkind -> llvalue -> unit (** [clear_metadata i kind] clears the metadata of kind [kind] in the instruction [i]. See the function [llvm::Instruction::setMetadata]. 
*) -val clear_metadata : llvalue -> int -> unit +val clear_metadata : llvalue -> llmdkind -> unit (** {7 Operations on metadata} *) diff --git a/bindings/ocaml/target/target_ocaml.c b/bindings/ocaml/target/target_ocaml.c index 9e8778aa61a6..74e818582714 100644 --- a/bindings/ocaml/target/target_ocaml.c +++ b/bindings/ocaml/target/target_ocaml.c @@ -352,8 +352,8 @@ CAMLprim value llvm_targetmachine_data_layout(value Machine) { CAMLreturn(DataLayout); } -/* TargetMachine.t -> bool -> unit */ -CAMLprim value llvm_targetmachine_set_verbose_asm(value Machine, value Verb) { +/* bool -> TargetMachine.t -> unit */ +CAMLprim value llvm_targetmachine_set_verbose_asm(value Verb, value Machine) { LLVMSetTargetMachineAsmVerbosity(TargetMachine_val(Machine), Bool_val(Verb)); return Val_unit; } diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index ca4af73d92c8..b862cebfe09f 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -252,15 +252,9 @@ function(llvm_find_program name) endif(LLVM_PATH_${NAME}) endfunction() -llvm_find_program(gv) -llvm_find_program(circo) -llvm_find_program(twopi) -llvm_find_program(neato) -llvm_find_program(fdp) -llvm_find_program(dot) -llvm_find_program(dotty) -llvm_find_program(xdot xdot.py) -llvm_find_program(Graphviz) +if (LLVM_ENABLE_DOXYGEN) + llvm_find_program(dot) +endif () if( LLVM_ENABLE_FFI ) find_path(FFI_INCLUDE_PATH ffi.h PATHS ${FFI_INCLUDE_DIR}) @@ -304,25 +298,6 @@ else() set(ENABLE_PIC 0) endif() -find_package(LibXml2) -if (LIBXML2_FOUND) - set(CLANG_HAVE_LIBXML 1) - # When cross-compiling, liblzma is not detected as a dependency for libxml2, - # which makes linking c-index-test fail. But for native builds, all libraries - # are installed and checked by CMake before Makefiles are generated and everything - # works according to the plan. However, if a -llzma is added to native builds, - # an additional requirement on the static liblzma.a is required, but will not - # be checked by CMake, breaking native compilation. - # Since this is only pertinent to cross-compilations, and there's no way CMake - # can check for every foreign library on every OS, we add the dep and warn the dev. 
- if ( CMAKE_CROSSCOMPILING ) - if (NOT PC_LIBXML_VERSION VERSION_LESS "2.8.0") - message(STATUS "Adding LZMA as a dep to XML2 for cross-compilation, make sure liblzma.a is available.") - set(LIBXML2_LIBRARIES ${LIBXML2_LIBRARIES} "-llzma") - endif () - endif () -endif () - check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG) set(USE_NO_MAYBE_UNINITIALIZED 0) @@ -372,7 +347,7 @@ elseif (LLVM_NATIVE_ARCH MATCHES "powerpc") elseif (LLVM_NATIVE_ARCH MATCHES "aarch64") set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm64") - set(LLVM_NATIVE_ARCH ARM64) + set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm") set(LLVM_NATIVE_ARCH ARM) elseif (LLVM_NATIVE_ARCH MATCHES "mips") diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index df4aef623256..69ffa5b6606a 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -632,11 +632,12 @@ function(add_lit_target target comment) if (NOT CMAKE_CFG_INTDIR STREQUAL ".") list(APPEND LIT_ARGS --param build_mode=${CMAKE_CFG_INTDIR}) endif () - set(LIT_COMMAND - ${PYTHON_EXECUTABLE} - ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py - ${LIT_ARGS} - ) + if (LLVM_MAIN_SRC_DIR) + set (LIT_COMMAND ${PYTHON_EXECUTABLE} ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py) + else() + find_program(LIT_COMMAND llvm-lit) + endif () + list(APPEND LIT_COMMAND ${LIT_ARGS}) foreach(param ${ARG_PARAMS}) list(APPEND LIT_COMMAND --param ${param}) endforeach() diff --git a/configure b/configure index 778aa189d575..e9aba06ad48f 100755 --- a/configure +++ b/configure @@ -14,7 +14,7 @@ ## M4sh Initialization. ## ## --------------------- ## -# Be Bourne compatible +# Be Bourne compatible. if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: @@ -4151,7 +4151,7 @@ else amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;; sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; - arm64*-*) llvm_cv_target_arch="ARM64" ;; + arm64*-*) llvm_cv_target_arch="AArch64" ;; arm*-*) llvm_cv_target_arch="ARM" ;; aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; @@ -4188,7 +4188,7 @@ case $host in amd64-* | x86_64-*) host_arch="x86_64" ;; sparc*-*) host_arch="Sparc" ;; powerpc*-*) host_arch="PowerPC" ;; - arm64*-*) host_arch="ARM64" ;; + arm64*-*) host_arch="AArch64" ;; arm*-*) host_arch="ARM" ;; aarch64*-*) host_arch="AArch64" ;; mips-* | mips64-*) host_arch="Mips" ;; @@ -5102,8 +5102,6 @@ else x86_64) TARGET_HAS_JIT=1 ;; ARM) TARGET_HAS_JIT=1 - ;; - AArch64) TARGET_HAS_JIT=0 ;; Mips) TARGET_HAS_JIT=1 ;; @@ -5122,7 +5120,7 @@ else esac fi -TARGETS_WITH_JIT="AArch64 ARM ARM64 Mips PowerPC SystemZ X86" +TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86" TARGETS_WITH_JIT=$TARGETS_WITH_JIT @@ -5359,7 +5357,7 @@ _ACEOF fi -ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" ALL_TARGETS=$ALL_TARGETS @@ -5383,7 +5381,7 @@ case "$enableval" in sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" 
;; @@ -7614,7 +7612,7 @@ if test "${llvm_cv_link_version+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - version_string="$(ld -v 2>&1 | head -1)" + version_string="$(${LD:-ld} -v 2>&1 | head -1)" # Check for ld64. if (echo "$version_string" | grep -q "ld64"); then diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst index 3bfb0953aff2..1cbaee703f30 100644 --- a/docs/AliasAnalysis.rst +++ b/docs/AliasAnalysis.rst @@ -246,6 +246,20 @@ analysis run method (``run`` for a ``Pass``, ``runOnFunction`` for a return false; } +Required methods to override +---------------------------- + +You must override the ``getAdjustedAnalysisPointer`` method on all subclasses +of ``AliasAnalysis``. An example implementation of this method would look like: + +.. code-block:: c++ + + void *getAdjustedAnalysisPointer(const void* ID) override { + if (ID == &AliasAnalysis::ID) + return (AliasAnalysis*)this; + return this; + } + Interfaces which may be specified --------------------------------- diff --git a/docs/Atomics.rst b/docs/Atomics.rst index 1243f345483f..5f17c6154f3c 100644 --- a/docs/Atomics.rst +++ b/docs/Atomics.rst @@ -110,8 +110,7 @@ where threads and signals are involved. ``cmpxchg`` and ``atomicrmw`` are essentially like an atomic load followed by an atomic store (where the store is conditional for ``cmpxchg``), but no other -memory operation can happen on any thread between the load and store. Note that -LLVM's cmpxchg does not provide quite as many options as the C++0x version. +memory operation can happen on any thread between the load and store. A ``fence`` provides Acquire and/or Release ordering which is not part of another operation; it is normally used along with Monotonic memory operations. @@ -430,10 +429,9 @@ other ``atomicrmw`` operations generate a loop with ``LOCK CMPXCHG``. Depending on the users of the result, some ``atomicrmw`` operations can be translated into operations like ``LOCK AND``, but that does not work in general. -On ARM, MIPS, and many other RISC architectures, Acquire, Release, and -SequentiallyConsistent semantics require barrier instructions for every such +On ARM (before v8), MIPS, and many other RISC architectures, Acquire, Release, +and SequentiallyConsistent semantics require barrier instructions for every such operation. Loads and stores generate normal instructions. ``cmpxchg`` and ``atomicrmw`` can be represented using a loop with LL/SC-style instructions which take some sort of exclusive lock on a cache line (``LDREX`` and ``STREX`` -on ARM, etc.). At the moment, the IR does not provide any way to represent a -weak ``cmpxchg`` which would not require a loop. +on ARM, etc.). diff --git a/docs/CMake.rst b/docs/CMake.rst index fed283d3995a..988e12b73502 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -132,7 +132,7 @@ write the variable and the type on the CMake command line: Frequently-used CMake variables ------------------------------- -Here are listed some of the CMake variables that are used often, along with a +Here are some of the CMake variables that are used often, along with a brief explanation and LLVM-specific notes. For full documentation, check the CMake docs or execute ``cmake --help-variable VARIABLE_NAME``. @@ -157,8 +157,8 @@ CMake docs or execute ``cmake --help-variable VARIABLE_NAME``. Extra flags to use when compiling C++ source files. **BUILD_SHARED_LIBS**:BOOL - Flag indicating is shared libraries will be built. Its default value is - OFF. 
Shared libraries are not supported on Windows and not recommended in the + Flag indicating if shared libraries will be built. Its default value is + OFF. Shared libraries are not supported on Windows and not recommended on the other OSes. .. _LLVM-specific variables: diff --git a/docs/CommandGuide/bugpoint.rst b/docs/CommandGuide/bugpoint.rst index e4663e5d4477..f11585d359c6 100644 --- a/docs/CommandGuide/bugpoint.rst +++ b/docs/CommandGuide/bugpoint.rst @@ -124,10 +124,6 @@ OPTIONS do not use this option, **bugpoint** will attempt to generate a reference output by compiling the program with the "safe" backend and running it. -**--profile-info-file** *filename* - - Profile file loaded by **--profile-loader**. - **--run-{int,jit,llc,custom}** Whenever the test program is compiled, **bugpoint** should generate code for it diff --git a/docs/CommandGuide/opt.rst b/docs/CommandGuide/opt.rst index 3fed6845550a..ad5b62cf6e53 100644 --- a/docs/CommandGuide/opt.rst +++ b/docs/CommandGuide/opt.rst @@ -99,10 +99,6 @@ OPTIONS :option:`-std-compile-opts` and :option:`-verify-each` can quickly track down this kind of problem. -.. option:: -profile-info-file - - Specify the name of the file loaded by the ``-profile-loader`` option. - .. option:: -stats Print statistics. diff --git a/docs/Extensions.rst b/docs/Extensions.rst index a49485cc5e4e..271c08598b3f 100644 --- a/docs/Extensions.rst +++ b/docs/Extensions.rst @@ -76,7 +76,7 @@ the target. It corresponds to the COFF relocation types Syntax: - ``.linkonce [ comdat type [ section identifier ] ]`` + ``.linkonce [ comdat type ]`` Supported COMDAT types: @@ -95,16 +95,6 @@ Supported COMDAT types: Duplicates are discarded, but the linker issues an error if any duplicates do not have exactly the same content. -``associative`` - Links the section if a certain other COMDAT section is linked. This other - section is indicated by its section identifier following the comdat type. - The following restrictions apply to the associated section: - - 1. It must be the name of a section already defined. - 2. It must differ from the current section. - 3. It must be a COMDAT section. - 4. It cannot be another associative COMDAT section. - ``largest`` Links the largest section from among the duplicates. @@ -118,10 +108,6 @@ Supported COMDAT types: .linkonce ... - .section .xdata$foo - .linkonce associative .text$foo - ... - ``.section`` Directive ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -160,6 +146,25 @@ different COMDATs: Symbol2: .long 1 +In addition to the types allowed with ``.linkonce``, ``.section`` also accepts +``associative``. The meaning is that the section is linked if a certain other +COMDAT section is linked. This other section is indicated by the comdat symbol +in this directive. It can be any symbol defined in the associated section, but +is usually the associated section's comdat. + + The following restrictions apply to the associated section: + + 1. It must be a COMDAT section. + 2. It cannot be another associative COMDAT section. + +In the following example the symobl ``sym`` is the comdat symbol of ``.foo`` +and ``.bar`` is associated to ``.foo``. + +.. code-block:: gas + + .section .foo,"bw",discard, "sym" + .section .bar,"rd",associative, "sym" + Target Specific Behaviour ========================= @@ -190,3 +195,17 @@ range via a slight deviation. 
It will generate an indirect jump as follows: blx r12 sub.w sp, sp, r4 +Variable Length Arrays +^^^^^^^^^^^^^^^^^^^^^^ + +The reference implementation (Microsoft Visual Studio 2012) does not permit the +emission of Variable Length Arrays (VLAs). + +The Windows ARM Itanium ABI extends the base ABI by adding support for emitting +a dynamic stack allocation. When emitting a variable stack allocation, a call +to ``__chkstk`` is emitted unconditionally to ensure that guard pages are setup +properly. The emission of this stack probe emission is handled similar to the +standard stack probe emission. + +The MSVC environment does not emit code for VLAs currently. + diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 3d2ec1e99dc5..6de9b9004e0f 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -87,9 +87,10 @@ Here's the short story for getting up and running quickly with LLVM: * ``make check-all`` --- This run the regression tests to ensure everything is in working order. - * It is also possible to use CMake instead of the makefiles. With CMake it is - possible to generate project files for several IDEs: Xcode, Eclipse CDT4, - CodeBlocks, Qt-Creator (use the CodeBlocks generator), KDevelop3. + * It is also possible to use `CMake `_ instead of the makefiles. + With CMake it is possible to generate project files for several IDEs: + Xcode, Eclipse CDT4, CodeBlocks, Qt-Creator (use the CodeBlocks + generator), KDevelop3. * If you get an "internal compiler error (ICE)" or test failures, see `below`. @@ -680,7 +681,7 @@ The following options can be used to set or enable LLVM specific options: Enables optimized compilation (debugging symbols are removed and GCC optimization flags are enabled). Note that this is the default setting if you - are using the LLVM distribution. The default behavior of an Subversion + are using the LLVM distribution. The default behavior of a Subversion checkout is to use an unoptimized build (also known as a debug build). ``--enable-debug-runtime`` @@ -698,14 +699,12 @@ The following options can be used to set or enable LLVM specific options: Controls which targets will be built and linked into llc. The default value for ``target_options`` is "all" which builds and links all available targets. - The value "host-only" can be specified to build only a native compiler (no - cross-compiler targets available). The "native" target is selected as the - target of the build host. You can also specify a comma separated list of - target names that you want available in llc. The target names use all lower - case. The current set of targets is: + The "host" target is selected as the target of the build host. You can also + specify a comma separated list of target names that you want available in llc. + The target names use all lower case. The current set of targets is: - ``arm, cpp, hexagon, mips, mipsel, msp430, powerpc, ptx, sparc, spu, - systemz, x86, x86_64, xcore``. + ``aarch64, arm, arm64, cpp, hexagon, mips, mipsel, mips64, mips64el, msp430, + powerpc, nvptx, r600, sparc, systemz, x86, x86_64, xcore``. ``--enable-doxygen`` @@ -743,7 +742,7 @@ builds: Debug Builds - These builds are the default when one is using an Subversion checkout and + These builds are the default when one is using a Subversion checkout and types ``gmake`` (unless the ``--enable-optimized`` option was used during configuration). The build system will compile the tools and libraries with debugging information. 
To get a Debug Build using the LLVM distribution the diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst index aa980d20d246..d914cc176dfd 100644 --- a/docs/GettingStartedVS.rst +++ b/docs/GettingStartedVS.rst @@ -99,6 +99,9 @@ Here's the short story for getting up and running quickly with LLVM: build. * See the :doc:`LLVM CMake guide ` for detailed information about how to configure the LLVM build. + * CMake generates project files for all build types. To select a specific + build type, use the Configuration manager from the VS IDE or the + ``/property:Configuration`` command line option when using MSBuild. 6. Start Visual Studio diff --git a/docs/LangRef.rst b/docs/LangRef.rst index ceec1bd5476d..cb94d3967b13 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -117,8 +117,8 @@ And the hard way: .. code-block:: llvm - %0 = add i32 %X, %X ; yields {i32}:%0 - %1 = add i32 %0, %0 ; yields {i32}:%1 + %0 = add i32 %X, %X ; yields i32:%0 + %1 = add i32 %0, %0 ; yields i32:%1 %result = add i32 %1, %1 This last way of multiplying ``%X`` by 8 illustrates several important @@ -443,7 +443,7 @@ styles: A symbol with ``internal`` or ``private`` linkage must have ``default`` visibility. -.. _namedtypes: +.. _dllstorageclass: DLL Storage Classes ------------------- @@ -464,6 +464,36 @@ DLL storage class: exists for defining a dll interface, the compiler, assembler and linker know it is externally referenced and must refrain from deleting the symbol. +.. _tls_model: + +Thread Local Storage Models +--------------------------- + +A variable may be defined as ``thread_local``, which means that it will +not be shared by threads (each thread will have a separated copy of the +variable). Not all targets support thread-local variables. Optionally, a +TLS model may be specified: + +``localdynamic`` + For variables that are only used within the current shared library. +``initialexec`` + For variables in modules that will not be loaded dynamically. +``localexec`` + For variables defined in the executable and only used within it. + +If no explicit model is given, the "general dynamic" model is used. + +The models correspond to the ELF TLS models; see `ELF Handling For +Thread-Local Storage `_ for +more information on under which circumstances the different models may +be used. The target may choose a different TLS model if the specified +model is not supported, or if a better choice of model can be made. + +A model can also be specified in a alias, but then it only governs how +the alias is accessed. It will not have any effect in the aliasee. + +.. _namedtypes: + Structure Types --------------- @@ -489,29 +519,13 @@ Global Variables Global variables define regions of memory allocated at compilation time instead of run-time. -Global variables definitions must be initialized, may have an explicit section -to be placed in, and may have an optional explicit alignment specified. +Global variables definitions must be initialized. Global variables in other translation units can also be declared, in which case they don't have an initializer. -A variable may be defined as ``thread_local``, which means that it will -not be shared by threads (each thread will have a separated copy of the -variable). Not all targets support thread-local variables. Optionally, a -TLS model may be specified: - -``localdynamic`` - For variables that are only used within the current shared library. -``initialexec`` - For variables in modules that will not be loaded dynamically. 
-``localexec`` - For variables defined in the executable and only used within it. - -The models correspond to the ELF TLS models; see `ELF Handling For -Thread-Local Storage `_ for -more information on under which circumstances the different models may -be used. The target may choose a different TLS model if the specified -model is not supported, or if a better choice of model can be made. +Either global variable definitions or declarations may have an explicit section +to be placed in and may have an optional explicit alignment specified. A variable may be defined as a global ``constant``, which indicates that the contents of the variable will **never** be modified (enabling better @@ -570,11 +584,14 @@ iteration. Globals can also have a :ref:`DLL storage class `. +Variables and aliasaes can have a +:ref:`Thread Local Storage Model `. + Syntax:: [@ =] [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] - [AddrSpace] [unnamed_addr] [ExternallyInitialized] - + [unnamed_addr] [AddrSpace] [ExternallyInitialized] + [] [, section "name"] [, align ] For example, the following defines a global in a numbered address space @@ -664,29 +681,40 @@ Syntax:: Aliases ------- -Aliases act as "second name" for the aliasee value (which can be either -function, global variable, another alias or bitcast of global value). +Aliases, unlike function or variables, don't create any new data. They +are just a new symbol and metadata for an existing position. + +Aliases have a name and an aliasee that is either a global value or a +constant expression. + Aliases may have an optional :ref:`linkage type `, an optional -:ref:`visibility style `, and an optional :ref:`DLL storage class -`. +:ref:`visibility style `, an optional :ref:`DLL storage class +` and an optional :ref:`tls model `. Syntax:: - @ = [Visibility] [DLLStorageClass] alias [Linkage] @ + @ = [Visibility] [DLLStorageClass] [ThreadLocal] [unnamed_addr] alias [Linkage] @ The linkage must be one of ``private``, ``internal``, ``linkonce``, ``weak``, ``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers -might not correctly handle dropping a weak symbol that is aliased by a non-weak -alias. +might not correctly handle dropping a weak symbol that is aliased. Alias that are not ``unnamed_addr`` are guaranteed to have the same address as -the aliasee. +the aliasee expression. ``unnamed_addr`` ones are only guaranteed to point +to the same content. + +Since aliases are only a second name, some restrictions apply, of which +some can only be checked when producing an object file: + +* The expression defining the aliasee must be computable at assembly + time. Since it is just a name, no relocations can be used. -The aliasee must be a definition. +* No alias in the expression can be weak as the possibility of the + intermediate alias being overridden cannot be represented in an + object file. -Aliases are not allowed to point to aliases with linkages that can be -overridden. Since they are only a second name, the possibility of the -intermediate alias being overridden cannot be represented in an object file. +* No global value in the expression can be a declaration, since that + would require a relocation, which is not possible. .. _namedmetadatastructure: @@ -995,6 +1023,14 @@ example: inlining this function is desirable (such as the "inline" keyword in C/C++). It is just a hint; it imposes no requirements on the inliner. 
+``jumptable`` + This attribute indicates that the function should be added to a + jump-instruction table at code-generation time, and that all address-taken + references to this function should be replaced with a reference to the + appropriate jump-instruction-table function pointer. Note that this creates + a new pointer for the original function, which means that code that depends + on function-pointer identity can break. So, any function annotated with + ``jumptable`` must also be ``unnamed_addr``. ``minsize`` This attribute suggests that optimization passes and code generator passes make choices that keep the code size of this function as small @@ -2785,15 +2821,29 @@ for optimizations are prefixed with ``llvm.mem``. '``llvm.mem.parallel_loop_access``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For a loop to be parallel, in addition to using -the ``llvm.loop`` metadata to mark the loop latch branch instruction, -also all of the memory accessing instructions in the loop body need to be -marked with the ``llvm.mem.parallel_loop_access`` metadata. If there -is at least one memory accessing instruction not marked with the metadata, -the loop must be considered a sequential loop. This causes parallel loops to be -converted to sequential loops due to optimization passes that are unaware of -the parallel semantics and that insert new memory instructions to the loop -body. +The ``llvm.mem.parallel_loop_access`` metadata refers to a loop identifier, +or metadata containing a list of loop identifiers for nested loops. +The metadata is attached to memory accessing instructions and denotes that +no loop carried memory dependence exist between it and other instructions denoted +with the same loop identifier. + +Precisely, given two instructions ``m1`` and ``m2`` that both have the +``llvm.mem.parallel_loop_access`` metadata, with ``L1`` and ``L2`` being the +set of loops associated with that metadata, respectively, then there is no loop +carried dependence between ``m1`` and ``m2`` for loops in both ``L1`` and +``L2``. + +As a special case, if all memory accessing instructions in a loop have +``llvm.mem.parallel_loop_access`` metadata that refers to that loop, then the +loop has no loop carried memory dependences and is considered to be a parallel +loop. + +Note that if not all memory access instructions have such metadata referring to +the loop, then the loop is considered not being trivially parallel. Additional +memory dependence analysis is required to make that determination. As a fail +safe mechanism, this causes loops that were originally parallel to be considered +sequential (if optimization passes that are unaware of the parallel semantics +insert new memory instructions into the loop body). Example of a loop that is considered parallel due to its correct use of both ``llvm.loop`` and ``llvm.mem.parallel_loop_access`` @@ -3185,7 +3235,7 @@ The '``llvm.global_dtors``' Global Variable The ``@llvm.global_dtors`` array contains a list of destructor functions, priorities, and an optional associated global or function. The functions referenced by this array will be called in descending -order of priority (i.e. highest first) when the module is loaded. The +order of priority (i.e. highest first) when the module is unloaded. The order of functions with the same priority is not defined. If the third field is present, non-null, and points to a global variable @@ -3527,9 +3577,9 @@ Example: .. 
code-block:: llvm %retval = invoke i32 @Test(i32 15) to label %Continue - unwind label %TestCleanup ; {i32}:retval set + unwind label %TestCleanup ; i32:retval set %retval = invoke coldcc i32 %Testfnptr(i32 15) to label %Continue - unwind label %TestCleanup ; {i32}:retval set + unwind label %TestCleanup ; i32:retval set .. _i_resume: @@ -3618,10 +3668,10 @@ Syntax: :: - = add , ; yields {ty}:result - = add nuw , ; yields {ty}:result - = add nsw , ; yields {ty}:result - = add nuw nsw , ; yields {ty}:result + = add , ; yields ty:result + = add nuw , ; yields ty:result + = add nsw , ; yields ty:result + = add nuw nsw , ; yields ty:result Overview: """"""""" @@ -3657,7 +3707,7 @@ Example: .. code-block:: llvm - = add i32 4, %var ; yields {i32}:result = 4 + %var + = add i32 4, %var ; yields i32:result = 4 + %var .. _i_fadd: @@ -3669,7 +3719,7 @@ Syntax: :: - = fadd [fast-math flags]* , ; yields {ty}:result + = fadd [fast-math flags]* , ; yields ty:result Overview: """"""""" @@ -3696,7 +3746,7 @@ Example: .. code-block:: llvm - = fadd float 4.0, %var ; yields {float}:result = 4.0 + %var + = fadd float 4.0, %var ; yields float:result = 4.0 + %var '``sub``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -3706,10 +3756,10 @@ Syntax: :: - = sub , ; yields {ty}:result - = sub nuw , ; yields {ty}:result - = sub nsw , ; yields {ty}:result - = sub nuw nsw , ; yields {ty}:result + = sub , ; yields ty:result + = sub nuw , ; yields ty:result + = sub nsw , ; yields ty:result + = sub nuw nsw , ; yields ty:result Overview: """"""""" @@ -3748,8 +3798,8 @@ Example: .. code-block:: llvm - = sub i32 4, %var ; yields {i32}:result = 4 - %var - = sub i32 0, %val ; yields {i32}:result = -%var + = sub i32 4, %var ; yields i32:result = 4 - %var + = sub i32 0, %val ; yields i32:result = -%var .. _i_fsub: @@ -3761,7 +3811,7 @@ Syntax: :: - = fsub [fast-math flags]* , ; yields {ty}:result + = fsub [fast-math flags]* , ; yields ty:result Overview: """"""""" @@ -3791,8 +3841,8 @@ Example: .. code-block:: llvm - = fsub float 4.0, %var ; yields {float}:result = 4.0 - %var - = fsub float -0.0, %val ; yields {float}:result = -%var + = fsub float 4.0, %var ; yields float:result = 4.0 - %var + = fsub float -0.0, %val ; yields float:result = -%var '``mul``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -3802,10 +3852,10 @@ Syntax: :: - = mul , ; yields {ty}:result - = mul nuw , ; yields {ty}:result - = mul nsw , ; yields {ty}:result - = mul nuw nsw , ; yields {ty}:result + = mul , ; yields ty:result + = mul nuw , ; yields ty:result + = mul nsw , ; yields ty:result + = mul nuw nsw , ; yields ty:result Overview: """"""""" @@ -3845,7 +3895,7 @@ Example: .. code-block:: llvm - = mul i32 4, %var ; yields {i32}:result = 4 * %var + = mul i32 4, %var ; yields i32:result = 4 * %var .. _i_fmul: @@ -3857,7 +3907,7 @@ Syntax: :: - = fmul [fast-math flags]* , ; yields {ty}:result + = fmul [fast-math flags]* , ; yields ty:result Overview: """"""""" @@ -3884,7 +3934,7 @@ Example: .. code-block:: llvm - = fmul float 4.0, %var ; yields {float}:result = 4.0 * %var + = fmul float 4.0, %var ; yields float:result = 4.0 * %var '``udiv``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -3894,8 +3944,8 @@ Syntax: :: - = udiv , ; yields {ty}:result - = udiv exact , ; yields {ty}:result + = udiv , ; yields ty:result + = udiv exact , ; yields ty:result Overview: """"""""" @@ -3928,7 +3978,7 @@ Example: .. 
code-block:: llvm - = udiv i32 4, %var ; yields {i32}:result = 4 / %var + = udiv i32 4, %var ; yields i32:result = 4 / %var '``sdiv``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -3938,8 +3988,8 @@ Syntax: :: - = sdiv , ; yields {ty}:result - = sdiv exact , ; yields {ty}:result + = sdiv , ; yields ty:result + = sdiv exact , ; yields ty:result Overview: """"""""" @@ -3974,7 +4024,7 @@ Example: .. code-block:: llvm - = sdiv i32 4, %var ; yields {i32}:result = 4 / %var + = sdiv i32 4, %var ; yields i32:result = 4 / %var .. _i_fdiv: @@ -3986,7 +4036,7 @@ Syntax: :: - = fdiv [fast-math flags]* , ; yields {ty}:result + = fdiv [fast-math flags]* , ; yields ty:result Overview: """"""""" @@ -4013,7 +4063,7 @@ Example: .. code-block:: llvm - = fdiv float 4.0, %var ; yields {float}:result = 4.0 / %var + = fdiv float 4.0, %var ; yields float:result = 4.0 / %var '``urem``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -4023,7 +4073,7 @@ Syntax: :: - = urem , ; yields {ty}:result + = urem , ; yields ty:result Overview: """"""""" @@ -4055,7 +4105,7 @@ Example: .. code-block:: llvm - = urem i32 4, %var ; yields {i32}:result = 4 % %var + = urem i32 4, %var ; yields i32:result = 4 % %var '``srem``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -4065,7 +4115,7 @@ Syntax: :: - = srem , ; yields {ty}:result + = srem , ; yields ty:result Overview: """"""""" @@ -4110,7 +4160,7 @@ Example: .. code-block:: llvm - = srem i32 4, %var ; yields {i32}:result = 4 % %var + = srem i32 4, %var ; yields i32:result = 4 % %var .. _i_frem: @@ -4122,7 +4172,7 @@ Syntax: :: - = frem [fast-math flags]* , ; yields {ty}:result + = frem [fast-math flags]* , ; yields ty:result Overview: """"""""" @@ -4150,7 +4200,7 @@ Example: .. code-block:: llvm - = frem float 4.0, %var ; yields {float}:result = 4.0 % %var + = frem float 4.0, %var ; yields float:result = 4.0 % %var .. _bitwiseops: @@ -4171,10 +4221,10 @@ Syntax: :: - = shl , ; yields {ty}:result - = shl nuw , ; yields {ty}:result - = shl nsw , ; yields {ty}:result - = shl nuw nsw , ; yields {ty}:result + = shl , ; yields ty:result + = shl nuw , ; yields ty:result + = shl nsw , ; yields ty:result + = shl nuw nsw , ; yields ty:result Overview: """"""""" @@ -4212,9 +4262,9 @@ Example: .. code-block:: llvm - = shl i32 4, %var ; yields {i32}: 4 << %var - = shl i32 4, 2 ; yields {i32}: 16 - = shl i32 1, 10 ; yields {i32}: 1024 + = shl i32 4, %var ; yields i32: 4 << %var + = shl i32 4, 2 ; yields i32: 16 + = shl i32 1, 10 ; yields i32: 1024 = shl i32 1, 32 ; undefined = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 2, i32 4> @@ -4226,8 +4276,8 @@ Syntax: :: - = lshr , ; yields {ty}:result - = lshr exact , ; yields {ty}:result + = lshr , ; yields ty:result + = lshr exact , ; yields ty:result Overview: """"""""" @@ -4261,10 +4311,10 @@ Example: .. code-block:: llvm - = lshr i32 4, 1 ; yields {i32}:result = 2 - = lshr i32 4, 2 ; yields {i32}:result = 1 - = lshr i8 4, 3 ; yields {i8}:result = 0 - = lshr i8 -2, 1 ; yields {i8}:result = 0x7F + = lshr i32 4, 1 ; yields i32:result = 2 + = lshr i32 4, 2 ; yields i32:result = 1 + = lshr i8 4, 3 ; yields i8:result = 0 + = lshr i8 -2, 1 ; yields i8:result = 0x7F = lshr i32 1, 32 ; undefined = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1> @@ -4276,8 +4326,8 @@ Syntax: :: - = ashr , ; yields {ty}:result - = ashr exact , ; yields {ty}:result + = ashr , ; yields ty:result + = ashr exact , ; yields ty:result Overview: """"""""" @@ -4312,10 +4362,10 @@ Example: .. 
code-block:: llvm - = ashr i32 4, 1 ; yields {i32}:result = 2 - = ashr i32 4, 2 ; yields {i32}:result = 1 - = ashr i8 4, 3 ; yields {i8}:result = 0 - = ashr i8 -2, 1 ; yields {i8}:result = -1 + = ashr i32 4, 1 ; yields i32:result = 2 + = ashr i32 4, 2 ; yields i32:result = 1 + = ashr i8 4, 3 ; yields i8:result = 0 + = ashr i8 -2, 1 ; yields i8:result = -1 = ashr i32 1, 32 ; undefined = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3> ; yields: result=<2 x i32> < i32 -1, i32 0> @@ -4327,7 +4377,7 @@ Syntax: :: - = and , ; yields {ty}:result + = and , ; yields ty:result Overview: """"""""" @@ -4364,9 +4414,9 @@ Example: .. code-block:: llvm - = and i32 4, %var ; yields {i32}:result = 4 & %var - = and i32 15, 40 ; yields {i32}:result = 8 - = and i32 4, 8 ; yields {i32}:result = 0 + = and i32 4, %var ; yields i32:result = 4 & %var + = and i32 15, 40 ; yields i32:result = 8 + = and i32 4, 8 ; yields i32:result = 0 '``or``' Instruction ^^^^^^^^^^^^^^^^^^^^ @@ -4376,7 +4426,7 @@ Syntax: :: - = or , ; yields {ty}:result + = or , ; yields ty:result Overview: """"""""" @@ -4413,9 +4463,9 @@ Example: :: - = or i32 4, %var ; yields {i32}:result = 4 | %var - = or i32 15, 40 ; yields {i32}:result = 47 - = or i32 4, 8 ; yields {i32}:result = 12 + = or i32 4, %var ; yields i32:result = 4 | %var + = or i32 15, 40 ; yields i32:result = 47 + = or i32 4, 8 ; yields i32:result = 12 '``xor``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -4425,7 +4475,7 @@ Syntax: :: - = xor , ; yields {ty}:result + = xor , ; yields ty:result Overview: """"""""" @@ -4463,10 +4513,10 @@ Example: .. code-block:: llvm - = xor i32 4, %var ; yields {i32}:result = 4 ^ %var - = xor i32 15, 40 ; yields {i32}:result = 39 - = xor i32 4, 8 ; yields {i32}:result = 12 - = xor i32 %V, -1 ; yields {i32}:result = ~%V + = xor i32 4, %var ; yields i32:result = 4 ^ %var + = xor i32 15, 40 ; yields i32:result = 39 + = xor i32 4, 8 ; yields i32:result = 12 + = xor i32 %V, -1 ; yields i32:result = ~%V Vector Operations ----------------- @@ -4732,7 +4782,7 @@ Syntax: :: - = alloca [inalloca] [, ] [, align ] ; yields {type*}:result + = alloca [inalloca] [, ] [, align ] ; yields type*:result Overview: """"""""" @@ -4774,10 +4824,10 @@ Example: .. code-block:: llvm - %ptr = alloca i32 ; yields {i32*}:ptr - %ptr = alloca i32, i32 4 ; yields {i32*}:ptr - %ptr = alloca i32, i32 4, align 1024 ; yields {i32*}:ptr - %ptr = alloca i32, align 1024 ; yields {i32*}:ptr + %ptr = alloca i32 ; yields i32*:ptr + %ptr = alloca i32, i32 4 ; yields i32*:ptr + %ptr = alloca i32, i32 4, align 1024 ; yields i32*:ptr + %ptr = alloca i32, align 1024 ; yields i32*:ptr .. _i_load: @@ -4860,9 +4910,9 @@ Examples: .. code-block:: llvm - %ptr = alloca i32 ; yields {i32*}:ptr - store i32 3, i32* %ptr ; yields {void} - %val = load i32* %ptr ; yields {i32}:val = i32 3 + %ptr = alloca i32 ; yields i32*:ptr + store i32 3, i32* %ptr ; yields void + %val = load i32* %ptr ; yields i32:val = i32 3 .. _i_store: @@ -4874,8 +4924,8 @@ Syntax: :: - store [volatile] , * [, align ][, !nontemporal !] ; yields {void} - store atomic [volatile] , * [singlethread] , align ; yields {void} + store [volatile] , * [, align ][, !nontemporal !] ; yields void + store atomic [volatile] , * [singlethread] , align ; yields void Overview: """"""""" @@ -4939,9 +4989,9 @@ Example: .. 
code-block:: llvm - %ptr = alloca i32 ; yields {i32*}:ptr - store i32 3, i32* %ptr ; yields {void} - %val = load i32* %ptr ; yields {i32}:val = i32 3 + %ptr = alloca i32 ; yields i32*:ptr + store i32 3, i32* %ptr ; yields void + %val = load i32* %ptr ; yields i32:val = i32 3 .. _i_fence: @@ -4953,7 +5003,7 @@ Syntax: :: - fence [singlethread] ; yields {void} + fence [singlethread] ; yields void Overview: """"""""" @@ -4996,8 +5046,8 @@ Example: .. code-block:: llvm - fence acquire ; yields {void} - fence singlethread seq_cst ; yields {void} + fence acquire ; yields void + fence singlethread seq_cst ; yields void .. _i_cmpxchg: @@ -5009,14 +5059,14 @@ Syntax: :: - cmpxchg [volatile] * , , [singlethread] ; yields {ty} + cmpxchg [weak] [volatile] * , , [singlethread] ; yields { ty, i1 } Overview: """"""""" The '``cmpxchg``' instruction is used to atomically modify memory. It loads a value in memory and compares it to a given value. If they are -equal, it stores a new value into the memory. +equal, it tries to store a new value into the memory. Arguments: """""""""" @@ -5033,10 +5083,10 @@ to modify the number or order of execution of this ``cmpxchg`` with other :ref:`volatile operations `. The success and failure :ref:`ordering ` arguments specify how this -``cmpxchg`` synchronizes with other atomic operations. The both ordering -parameters must be at least ``monotonic``, the ordering constraint on failure -must be no stronger than that on success, and the failure ordering cannot be -either ``release`` or ``acq_rel``. +``cmpxchg`` synchronizes with other atomic operations. Both ordering parameters +must be at least ``monotonic``, the ordering constraint on failure must be no +stronger than that on success, and the failure ordering cannot be either +``release`` or ``acq_rel``. The optional "``singlethread``" argument declares that the ``cmpxchg`` is only atomic with respect to code (usually signal handlers) running in @@ -5049,10 +5099,17 @@ equal to the size in memory of the operand. Semantics: """""""""" -The contents of memory at the location specified by the '````' -operand is read and compared to '````'; if the read value is the -equal, '````' is written. The original value at the location is -returned. +The contents of memory at the location specified by the '````' operand +is read and compared to '````'; if the read value is the equal, the +'````' is written. The original value at the location is returned, together +with a flag indicating success (true) or failure (false). + +If the cmpxchg operation is marked as ``weak`` then a spurious failure is +permitted: the operation may not write ```` even if the comparison +matched. + +If the cmpxchg operation is strong (the default), the i1 value is 1 if and only +if the value loaded equals ``cmp``. A successful ``cmpxchg`` is a read-modify-write instruction for the purpose of identifying release sequences. A failed ``cmpxchg`` is equivalent to an atomic @@ -5064,14 +5121,15 @@ Example: .. 
code-block:: llvm entry: - %orig = atomic load i32* %ptr unordered ; yields {i32} + %orig = atomic load i32* %ptr unordered ; yields i32 br label %loop loop: %cmp = phi i32 [ %orig, %entry ], [%old, %loop] %squared = mul i32 %cmp, %cmp - %old = cmpxchg i32* %ptr, i32 %cmp, i32 %squared acq_rel monotonic ; yields {i32} - %success = icmp eq i32 %cmp, %old + %val_success = cmpxchg i32* %ptr, i32 %cmp, i32 %squared acq_rel monotonic ; yields { i32, i1 } + %value_loaded = extractvalue { i32, i1 } %val_success, 0 + %success = extractvalue { i32, i1 } %val_success, 1 br i1 %success, label %done, label %loop done: @@ -5087,7 +5145,7 @@ Syntax: :: - atomicrmw [volatile] * , [singlethread] ; yields {ty} + atomicrmw [volatile] * , [singlethread] ; yields ty Overview: """"""""" @@ -5148,7 +5206,7 @@ Example: .. code-block:: llvm - %old = atomicrmw add i32* %ptr, i32 1 acquire ; yields {i32} + %old = atomicrmw add i32* %ptr, i32 1 acquire ; yields i32 .. _i_getelementptr: @@ -5882,7 +5940,7 @@ Syntax: :: - = icmp , ; yields {i1} or {}:result + = icmp , ; yields i1 or :result Overview: """"""""" @@ -5973,7 +6031,7 @@ Syntax: :: - = fcmp , ; yields {i1} or {}:result + = fcmp , ; yields i1 or :result Overview: """"""""" @@ -6278,7 +6336,7 @@ Example: call void %foo(i8 97 signext) %struct.A = type { i32, i8 } - %r = call %struct.A @foo() ; yields { 32, i8 } + %r = call %struct.A @foo() ; yields { i32, i8 } %gr = extractvalue %struct.A %r, 0 ; yields i32 %gr1 = extractvalue %struct.A %r, 1 ; yields i8 %Z = call void @foo() noreturn ; indicates that %foo never returns normally @@ -6863,7 +6921,7 @@ register in surrounding code, including inline assembly. Because of that, allocatable registers are not supported. Warning: So far it only works with the stack pointer on selected -architectures (ARM, ARM64, AArch64, PowerPC and x86_64). Significant amount of +architectures (ARM, AArch64, PowerPC and x86_64). Significant amount of work is needed to support other registers and even more so, allocatable registers. @@ -8440,7 +8498,7 @@ Examples: .. code-block:: llvm - %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields {float}:r2 = (a * b) + c + %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c Half Precision Floating Point Intrinsics ---------------------------------------- diff --git a/docs/Passes.rst b/docs/Passes.rst index b51829d3a3bd..9f4009297aca 100644 --- a/docs/Passes.rst +++ b/docs/Passes.rst @@ -261,12 +261,6 @@ returns "I don't know" for alias queries. NoAA is unlike other alias analysis implementations, in that it does not chain to a previous analysis. As such it doesn't follow many of the rules that other alias analyses must. -``-no-profile``: No Profile Information ---------------------------------------- - -The default "no profile" implementation of the abstract ``ProfileInfo`` -interface. - ``-postdomfrontier``: Post-Dominance Frontier Construction ---------------------------------------------------------- @@ -336,23 +330,6 @@ This pass is used to seek out all of the types in use by the program. Note that this analysis explicitly does not include types only used by the symbol table. -``-profile-estimator``: Estimate profiling information ------------------------------------------------------- - -Profiling information that estimates the profiling information in a very crude -and unimaginative way. 
- -``-profile-loader``: Load profile information from ``llvmprof.out`` -------------------------------------------------------------------- - -A concrete implementation of profiling information that loads the information -from a profile dump file. - -``-profile-verifier``: Verify profiling information ---------------------------------------------------- - -Pass that checks profiling information for plausibility. - ``-regions``: Detect single entry single exit regions ----------------------------------------------------- @@ -626,24 +603,6 @@ where it is profitable, the loop could be transformed to count down to zero Bottom-up inlining of functions into callees. -``-insert-edge-profiling``: Insert instrumentation for edge profiling ---------------------------------------------------------------------- - -This pass instruments the specified program with counters for edge profiling. -Edge profiling can give a reasonable approximation of the hot paths through a -program, and is used for a wide variety of program transformations. - -Note that this implementation is very naïve. It inserts a counter for *every* -edge in the program, instead of using control flow information to prune the -number of counters inserted. - -``-insert-optimal-edge-profiling``: Insert optimal instrumentation for edge profiling -------------------------------------------------------------------------------------- - -This pass instruments the specified program with counters for edge profiling. -Edge profiling can give a reasonable approximation of the hot paths through a -program, and is used for a wide variety of program transformations. - .. _passes-instcombine: ``-instcombine``: Combine redundant instructions diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst index 7e46ac4e8e64..4973e5c66719 100644 --- a/docs/ProgrammersManual.rst +++ b/docs/ProgrammersManual.rst @@ -387,7 +387,8 @@ Fine grained debug info with ``DEBUG_TYPE`` and the ``-debug-only`` option Sometimes you may find yourself in a situation where enabling ``-debug`` just turns on **too much** information (such as when working on the code generator). If you want to enable debug information with more fine-grained control, you -define the ``DEBUG_TYPE`` macro and the ``-debug`` only option as follows: +can define the ``DEBUG_TYPE`` macro and use the ``-debug-only`` option as +follows: .. code-block:: c++ @@ -545,14 +546,15 @@ methods. Within GDB, for example, you can usually use something like ``call DAG.viewGraph()`` to pop up a window. Alternatively, you can sprinkle calls to these functions in your code in places you want to debug. -Getting this to work requires a small amount of configuration. On Unix systems +Getting this to work requires a small amount of setup. On Unix systems with X11, install the `graphviz `_ toolkit, and make sure 'dot' and 'gv' are in your path. If you are running on Mac OS X, download and install the Mac OS X `Graphviz program `_ and add ``/Applications/Graphviz.app/Contents/MacOS/`` (or wherever you install it) to -your path. Once in your system and path are set up, rerun the LLVM configure -script and rebuild LLVM to enable this functionality. +your path. The programs need not be present when configuring, building or +running LLVM and can simply be installed when needed during an active debug +session. ``SelectionDAG`` has been extended to make it easier to locate *interesting* nodes in large complex graphs. 
From gdb, if you ``call DAG.setGraphColor(node, @@ -1916,7 +1918,7 @@ which is a pointer to an integer on the run time stack. *Inserting instructions* -There are essentially two ways to insert an ``Instruction`` into an existing +There are essentially three ways to insert an ``Instruction`` into an existing sequence of instructions that form a ``BasicBlock``: * Insertion into an explicit instruction list @@ -1986,6 +1988,41 @@ sequence of instructions that form a ``BasicBlock``: which is much cleaner, especially if you're creating a lot of instructions and adding them to ``BasicBlock``\ s. +* Insertion using an instance of ``IRBuilder`` + + Inserting several ``Instruction``\ s can be quite laborious using the previous + methods. The ``IRBuilder`` is a convenience class that can be used to add + several instructions to the end of a ``BasicBlock`` or before a particular + ``Instruction``. It also supports constant folding and renaming named + registers (see ``IRBuilder``'s template arguments). + + The example below demonstrates a very simple use of the ``IRBuilder`` where + three instructions are inserted before the instruction ``pi``. The first two + instructions are Call instructions and third instruction multiplies the return + value of the two calls. + + .. code-block:: c++ + + Instruction *pi = ...; + IRBuilder<> Builder(pi); + CallInst* callOne = Builder.CreateCall(...); + CallInst* callTwo = Builder.CreateCall(...); + Value* result = Builder.CreateMul(callOne, callTwo); + + The example below is similar to the above example except that the created + ``IRBuilder`` inserts instructions at the end of the ``BasicBlock`` ``pb``. + + .. code-block:: c++ + + BasicBlock *pb = ...; + IRBuilder<> Builder(pb); + CallInst* callOne = Builder.CreateCall(...); + CallInst* callTwo = Builder.CreateCall(...); + Value* result = Builder.CreateMul(callOne, callTwo); + + See :doc:`tutorial/LangImpl3` for a practical use of the ``IRBuilder``. + + .. _schanges_deleting: Deleting Instructions diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst index f957a7dfd4a7..a4fbd72167f0 100644 --- a/docs/SourceLevelDebugging.rst +++ b/docs/SourceLevelDebugging.rst @@ -235,8 +235,8 @@ File descriptors .. code-block:: llvm !0 = metadata !{ - i32, ;; Tag = 41 (DW_TAG_file_type) - metadata, ;; Source directory (including trailing slash) & file pair + i32, ;; Tag = 41 (DW_TAG_file_type) + metadata, ;; Source directory (including trailing slash) & file pair } These descriptors contain information for a file. Global variables and top @@ -269,7 +269,7 @@ Global variable descriptors metadata, ;; The static member declaration, if any } -These descriptors provide debug information about globals variables. They +These descriptors provide debug information about global variables. They provide details such as name, type and where the variable is defined. All global variables are collected inside the named metadata ``!llvm.dbg.cu``. @@ -297,7 +297,7 @@ Subprogram descriptors ;; derived class i32, ;; Flags - Artificial, Private, Protected, Explicit, Prototyped. i1, ;; isOptimized - Function * , ;; Pointer to LLVM function + {}*, ;; Reference to the LLVM function metadata, ;; Lists function template parameters metadata, ;; Function declaration descriptor metadata, ;; List of function variables @@ -314,13 +314,13 @@ Block descriptors .. 
code-block:: llvm !3 = metadata !{ - i32, ;; Tag = 11 (DW_TAG_lexical_block) - metadata,;; Source directory (including trailing slash) & file pair - metadata,;; Reference to context descriptor - i32, ;; Line number - i32, ;; Column number - i32, ;; DWARF path discriminator value - i32 ;; Unique ID to identify blocks from a template function + i32, ;; Tag = 11 (DW_TAG_lexical_block) + metadata, ;; Source directory (including trailing slash) & file pair + metadata, ;; Reference to context descriptor + i32, ;; Line number + i32, ;; Column number + i32, ;; DWARF path discriminator value + i32 ;; Unique ID to identify blocks from a template function } This descriptor provides debug information about nested blocks within a @@ -330,9 +330,9 @@ lexical blocks at same depth. .. code-block:: llvm !3 = metadata !{ - i32, ;; Tag = 11 (DW_TAG_lexical_block) - metadata,;; Source directory (including trailing slash) & file pair - metadata ;; Reference to the scope we're annotating with a file change + i32, ;; Tag = 11 (DW_TAG_lexical_block) + metadata, ;; Source directory (including trailing slash) & file pair + metadata ;; Reference to the scope we're annotating with a file change } This descriptor provides a wrapper around a lexical scope to handle file @@ -528,9 +528,9 @@ Subrange descriptors .. code-block:: llvm !42 = metadata !{ - i32, ;; Tag = 33 (DW_TAG_subrange_type) - i64, ;; Low value - i64 ;; High value + i32, ;; Tag = 33 (DW_TAG_subrange_type) + i64, ;; Low value + i64 ;; High value } These descriptors are used to define ranges of array subscripts for an array diff --git a/docs/TestingGuide.rst b/docs/TestingGuide.rst index f9222372c2af..481be55b576b 100644 --- a/docs/TestingGuide.rst +++ b/docs/TestingGuide.rst @@ -304,8 +304,7 @@ For instance, on ``test/CodeGen/ARM``, the ``lit.local.cfg`` is: .. code-block:: python config.suffixes = ['.ll', '.c', '.cpp', '.test'] - targets = set(config.root.targets_to_build.split()) - if not 'ARM' in targets: + if not 'ARM' in config.root.targets: config.unsupported = True Other platform-specific tests are those that depend on a specific feature diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst index f9cb4fe80fd3..cfbda042cc53 100644 --- a/docs/WritingAnLLVMPass.rst +++ b/docs/WritingAnLLVMPass.rst @@ -259,7 +259,6 @@ To see what happened to the other string you registered, try running -hello - Hello World Pass -indvars - Induction Variable Simplification -inline - Function Integration/Inlining - -insert-edge-profiling - Insert instrumentation for edge profiling ... The pass name gets added as the information string for your pass, giving some diff --git a/docs/YamlIO.rst b/docs/YamlIO.rst index dfb348da2f47..76dd021f82f5 100644 --- a/docs/YamlIO.rst +++ b/docs/YamlIO.rst @@ -399,6 +399,42 @@ the above schema, a same valid YAML document is: name: Tom flags: [ pointy, flat ] +Sometimes a "flags" field might contains an enumeration part +defined by a bit-mask. + +.. code-block:: c++ + + enum { + flagsFeatureA = 1, + flagsFeatureB = 2, + flagsFeatureC = 4, + + flagsCPUMask = 24, + + flagsCPU1 = 8, + flagsCPU2 = 16 + }; + +To support reading and writing such fields, you need to use the maskedBitSet() +method and provide the bit values, their names and the enumeration mask. + +.. 
code-block:: c++
+
+  template <>
+  struct ScalarBitSetTraits {
+    static void bitset(IO &io, MyFlags &value) {
+      io.bitSetCase(value, "featureA", flagsFeatureA);
+      io.bitSetCase(value, "featureB", flagsFeatureB);
+      io.bitSetCase(value, "featureC", flagsFeatureC);
+      io.maskedBitSetCase(value, "CPU1", flagsCPU1, flagsCPUMask);
+      io.maskedBitSetCase(value, "CPU2", flagsCPU2, flagsCPUMask);
+    }
+  };
+
+YAML I/O (when writing) will apply the enumeration mask to the flags field,
+and compare the result with the values from the bitset. As with a regular
+bitset, each value that matches will cause the corresponding string to be
+added to the flow sequence.

Custom Scalar
-------------
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index f37e3f89ca3e..0e78ed71fa9a 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -166,7 +166,8 @@ typedef enum {
     LLVMCold = 1ULL << 34,
     LLVMOptimizeNone = 1ULL << 35,
     LLVMInAllocaAttribute = 1ULL << 36,
-    LLVMNonNullAttribute = 1ULL << 37
+    LLVMNonNullAttribute = 1ULL << 37,
+    LLVMJumpTableAttribute = 1ULL << 38,
     */
 } LLVMAttribute;
diff --git a/include/llvm-c/module.modulemap b/include/llvm-c/module.modulemap
new file mode 100644
index 000000000000..a456119595c9
--- /dev/null
+++ b/include/llvm-c/module.modulemap
@@ -0,0 +1,4 @@
+module LLVM_C {
+  umbrella "."
+  module * { export * }
+}
diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h
index 053defff5234..ee34e9b53088 100644
--- a/include/llvm/ADT/APSInt.h
+++ b/include/llvm/ADT/APSInt.h
@@ -56,7 +56,7 @@ class APSInt : public APInt {
     APInt::toString(Str, Radix, isSigned());
   }
   /// toString - Converts an APInt to a std::string. This is an inefficient
-  /// method, your should prefer passing in a SmallString instead.
+  /// method; you should prefer passing in a SmallString instead.
   std::string toString(unsigned Radix) const {
     return APInt::toString(Radix, isSigned());
   }
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index 9b7ee8520d7f..14c5933d388a 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -794,6 +794,14 @@ template struct FoldingSetTrait {
     ID.AddPointer(X);
   }
 };
+template
+struct FoldingSetTrait> {
+  static inline void Profile(const std::pair &P,
+                             llvm::FoldingSetNodeID &ID) {
+    ID.Add(P.first);
+    ID.Add(P.second);
+  }
+};
} // End of namespace llvm.
#endif
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index b11e3c1c505b..abf02b8cc9d5 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -152,7 +152,7 @@ inline uint64_t fetch64(const char *p) {
   uint64_t result;
   memcpy(&result, p, sizeof(result));
   if (sys::IsBigEndianHost)
-    return sys::SwapByteOrder(result);
+    sys::swapByteOrder(result);
   return result;
 }
@@ -160,7 +160,7 @@ inline uint32_t fetch32(const char *p) {
   uint32_t result;
   memcpy(&result, p, sizeof(result));
   if (sys::IsBigEndianHost)
-    return sys::SwapByteOrder(result);
+    sys::swapByteOrder(result);
   return result;
 }
diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h
index cd1946c54d4f..128ada0ec835 100644
--- a/include/llvm/ADT/IntrusiveRefCntPtr.h
+++ b/include/llvm/ADT/IntrusiveRefCntPtr.h
@@ -177,10 +177,7 @@ class ThreadSafeRefCountedBase {
   T* getPtr() const { return Obj; }
-  typedef T* (IntrusiveRefCntPtr::*unspecified_bool_type) () const;
-  operator unspecified_bool_type() const {
-    return Obj ?
&IntrusiveRefCntPtr::getPtr : nullptr; - } + LLVM_EXPLICIT operator bool() const { return Obj; } void swap(IntrusiveRefCntPtr& other) { T* tmp = other.Obj; @@ -244,6 +241,26 @@ class ThreadSafeRefCountedBase { return A != B.getPtr(); } + template + bool operator==(std::nullptr_t A, const IntrusiveRefCntPtr &B) { + return !B; + } + + template + bool operator==(const IntrusiveRefCntPtr &A, std::nullptr_t B) { + return B == A; + } + + template + bool operator!=(std::nullptr_t A, const IntrusiveRefCntPtr &B) { + return !(A == B); + } + + template + bool operator!=(const IntrusiveRefCntPtr &A, std::nullptr_t B) { + return !(A == B); + } + //===----------------------------------------------------------------------===// // LLVM-style downcasting support for IntrusiveRefCntPtr objects //===----------------------------------------------------------------------===// diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 5b7b88b90085..1cef3933b5d6 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -67,11 +67,11 @@ template class function_ref; template class function_ref { - Ret (*callback)(void *callable, Params ...params); - void *callable; + Ret (*callback)(intptr_t callable, Params ...params); + intptr_t callable; template - static Ret callback_fn(void *callable, Params ...params) { + static Ret callback_fn(intptr_t callable, Params ...params) { return (*reinterpret_cast(callable))( std::forward(params)...); } @@ -80,7 +80,7 @@ class function_ref { template function_ref(Callable &&callable) : callback(callback_fn::type>), - callable(reinterpret_cast(&callable)) {} + callable(reinterpret_cast(&callable)) {} Ret operator()(Params ...params) const { return callback(callable, std::forward(params)...); } @@ -90,11 +90,11 @@ class function_ref { template class function_ref { - Ret (*callback)(void *callable); - void *callable; + Ret (*callback)(intptr_t callable); + intptr_t callable; template - static Ret callback_fn(void *callable) { + static Ret callback_fn(intptr_t callable) { return (*reinterpret_cast(callable))(); } @@ -102,17 +102,17 @@ class function_ref { template function_ref(Callable &&callable) : callback(callback_fn::type>), - callable(reinterpret_cast(&callable)) {} + callable(reinterpret_cast(&callable)) {} Ret operator()() const { return callback(callable); } }; template class function_ref { - Ret (*callback)(void *callable, Param1 param1); - void *callable; + Ret (*callback)(intptr_t callable, Param1 param1); + intptr_t callable; template - static Ret callback_fn(void *callable, Param1 param1) { + static Ret callback_fn(intptr_t callable, Param1 param1) { return (*reinterpret_cast(callable))( std::forward(param1)); } @@ -121,7 +121,7 @@ class function_ref { template function_ref(Callable &&callable) : callback(callback_fn::type>), - callable(reinterpret_cast(&callable)) {} + callable(reinterpret_cast(&callable)) {} Ret operator()(Param1 param1) { return callback(callable, std::forward(param1)); } @@ -129,11 +129,11 @@ class function_ref { template class function_ref { - Ret (*callback)(void *callable, Param1 param1, Param2 param2); - void *callable; + Ret (*callback)(intptr_t callable, Param1 param1, Param2 param2); + intptr_t callable; template - static Ret callback_fn(void *callable, Param1 param1, Param2 param2) { + static Ret callback_fn(intptr_t callable, Param1 param1, Param2 param2) { return (*reinterpret_cast(callable))( std::forward(param1), std::forward(param2)); @@ -143,7 +143,7 @@ class function_ref { template 
function_ref(Callable &&callable) : callback(callback_fn::type>), - callable(reinterpret_cast(&callable)) {} + callable(reinterpret_cast(&callable)) {} Ret operator()(Param1 param1, Param2 param2) { return callback(callable, std::forward(param1), @@ -153,11 +153,11 @@ class function_ref { template class function_ref { - Ret (*callback)(void *callable, Param1 param1, Param2 param2, Param3 param3); - void *callable; + Ret (*callback)(intptr_t callable, Param1 param1, Param2 param2, Param3 param3); + intptr_t callable; template - static Ret callback_fn(void *callable, Param1 param1, Param2 param2, + static Ret callback_fn(intptr_t callable, Param1 param1, Param2 param2, Param3 param3) { return (*reinterpret_cast(callable))( std::forward(param1), @@ -169,7 +169,7 @@ class function_ref { template function_ref(Callable &&callable) : callback(callback_fn::type>), - callable(reinterpret_cast(&callable)) {} + callable(reinterpret_cast(&callable)) {} Ret operator()(Param1 param1, Param2 param2, Param3 param3) { return callback(callable, std::forward(param1), diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h index dcf0354860fa..82538e9bd108 100644 --- a/include/llvm/ADT/SmallVector.h +++ b/include/llvm/ADT/SmallVector.h @@ -380,7 +380,8 @@ class SmallVectorImpl : public SmallVectorTemplateBase::value> { } else if (N > this->size()) { if (this->capacity() < N) this->grow(N); - std::uninitialized_fill(this->end(), this->begin()+N, T()); + for (auto I = this->end(), E = this->begin() + N; I != E; ++I) + new (&*I) T(); this->setEnd(this->begin()+N); } } @@ -488,9 +489,9 @@ class SmallVectorImpl : public SmallVectorTemplateBase::value> { } ::new ((void*) this->end()) T(::std::move(this->back())); - this->setEnd(this->end()+1); // Push everything else over. this->move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); // If we just moved the element we're inserting, be sure to update // the reference. @@ -516,10 +517,10 @@ class SmallVectorImpl : public SmallVectorTemplateBase::value> { this->grow(); I = this->begin()+EltNo; } - ::new ((void*) this->end()) T(this->back()); - this->setEnd(this->end()+1); + ::new ((void*) this->end()) T(std::move(this->back())); // Push everything else over. this->move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); // If we just moved the element we're inserting, be sure to update // the reference. @@ -555,7 +556,8 @@ class SmallVectorImpl : public SmallVectorTemplateBase::value> { // reallocate the vector. if (size_t(this->end()-I) >= NumToInsert) { T *OldEnd = this->end(); - append(this->end()-NumToInsert, this->end()); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); // Copy the existing elements that get replaced. this->move_backward(I, OldEnd-NumToInsert, OldEnd); @@ -608,7 +610,8 @@ class SmallVectorImpl : public SmallVectorTemplateBase::value> { // reallocate the vector. if (size_t(this->end()-I) >= NumToInsert) { T *OldEnd = this->end(); - append(this->end()-NumToInsert, this->end()); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); // Copy the existing elements that get replaced. 
this->move_backward(I, OldEnd-NumToInsert, OldEnd); diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index ecac5dd5f66a..5b1868187986 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -139,10 +139,10 @@ class StringMapEntry : public StringMapEntryBase { /// Create - Create a StringMapEntry for the specified key and default /// construct the value. template - static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd, + static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator, InitType InitVal) { - unsigned KeyLength = static_cast(KeyEnd-KeyStart); + unsigned KeyLength = Key.size(); // Allocate a new item with space for the string at the end and a null // terminator. @@ -158,27 +158,25 @@ class StringMapEntry : public StringMapEntryBase { // Copy the string information. char *StrBuffer = const_cast(NewItem->getKeyData()); - memcpy(StrBuffer, KeyStart, KeyLength); + memcpy(StrBuffer, Key.data(), KeyLength); StrBuffer[KeyLength] = 0; // Null terminate for convenience of clients. return NewItem; } template - static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd, - AllocatorTy &Allocator) { - return Create(KeyStart, KeyEnd, Allocator, 0); + static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator) { + return Create(Key, Allocator, ValueTy()); } /// Create - Create a StringMapEntry with normal malloc/free. template - static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd, - InitType InitVal) { + static StringMapEntry *Create(StringRef Key, InitType InitVal) { MallocAllocator A; - return Create(KeyStart, KeyEnd, A, std::move(InitVal)); + return Create(Key, A, std::move(InitVal)); } - static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd) { - return Create(KeyStart, KeyEnd, ValueTy()); + static StringMapEntry *Create(StringRef Key) { + return Create(Key, ValueTy()); } /// GetStringMapEntryFromValue - Given a value that is known to be embedded @@ -353,8 +351,7 @@ class StringMap : public StringMapImpl { if (Bucket && Bucket != getTombstoneVal()) return *static_cast(Bucket); - MapEntryTy *NewItem = - MapEntryTy::Create(Key.begin(), Key.end(), Allocator, std::move(Val)); + MapEntryTy *NewItem = MapEntryTy::Create(Key, Allocator, std::move(Val)); if (Bucket == getTombstoneVal()) --NumTombstones; diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index 95f3380b364e..a10bc734da2c 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -350,6 +350,10 @@ class Triple { return getOS() == Triple::Win32 && getEnvironment() == Triple::MSVC; } + bool isWindowsItaniumEnvironment() const { + return getOS() == Triple::Win32 && getEnvironment() == Triple::Itanium; + } + bool isWindowsCygwinEnvironment() const { return getOS() == Triple::Cygwin || (getOS() == Triple::Win32 && getEnvironment() == Triple::Cygnus); diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h index a54fd743ad68..4be3ee6f82db 100644 --- a/include/llvm/ADT/Twine.h +++ b/include/llvm/ADT/Twine.h @@ -182,6 +182,10 @@ namespace llvm { assert(isValid() && "Invalid twine!"); } + /// Since the intended use of twines is as temporary objects, assignments + /// when concatenating might cause undefined behavior or stack corruptions + Twine &operator=(const Twine &Other) LLVM_DELETED_FUNCTION; + /// isNull - Check for the null twine. 
bool isNull() const { return getLHSKind() == NullKind; diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h index 78a03ff2366d..279755e47622 100644 --- a/include/llvm/Analysis/DependenceAnalysis.h +++ b/include/llvm/Analysis/DependenceAnalysis.h @@ -910,7 +910,8 @@ namespace llvm { const Constraint &CurConstraint) const; bool tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV, - SmallVectorImpl &Pair) const; + SmallVectorImpl &Pair, + const SCEV *ElementSize) const; public: static char ID; // Class identification, replacement for typeinfo diff --git a/include/llvm/Analysis/JumpInstrTableInfo.h b/include/llvm/Analysis/JumpInstrTableInfo.h new file mode 100644 index 000000000000..54760aa02466 --- /dev/null +++ b/include/llvm/Analysis/JumpInstrTableInfo.h @@ -0,0 +1,60 @@ +//===-- JumpInstrTableInfo.h: Info for Jump-Instruction Tables --*- C++ -*-===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Information about jump-instruction tables that have been created by +/// JumpInstrTables pass. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H +#define LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Pass.h" + +#include + +namespace llvm { +class Function; +class FunctionType; + +/// This class stores information about jump-instruction tables created by the +/// JumpInstrTables pass (in lib/CodeGen/JumpInstrTables.cpp). Each table is a +/// map from a function type to a vector of pairs. The first element of each +/// pair is the function that has the jumptable annotation. The second element +/// is a function that was declared by JumpInstrTables and used to replace all +/// address-taking sites for the original function. +/// +/// The information in this pass is used in AsmPrinter +/// (lib/CodeGen/AsmPrinter/AsmPrinter.cpp) to generate the required assembly +/// for the jump-instruction tables. +class JumpInstrTableInfo : public ImmutablePass { +public: + static char ID; + + JumpInstrTableInfo(); + virtual ~JumpInstrTableInfo(); + const char *getPassName() const override { + return "Jump-Instruction Table Info"; + } + + typedef std::pair JumpPair; + typedef DenseMap > JumpTables; + + /// Inserts an entry in a table, adding the table if it doesn't exist. + void insertEntry(FunctionType *TableFunTy, Function *Target, Function *Jump); + + /// Gets the tables. + const JumpTables &getTables() const { return Tables; } + +private: + JumpTables Tables; +}; +} + +#endif /* LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H */ diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h index 9494b7d9ebe4..fd65ae5ca5b2 100644 --- a/include/llvm/Analysis/Passes.h +++ b/include/llvm/Analysis/Passes.h @@ -142,6 +142,10 @@ namespace llvm { // information and prints it with -analyze. 
// FunctionPass *createMemDepPrinter(); + + // createJumpInstrTableInfoPass - This creates a pass that stores information + // about the jump tables created by JumpInstrTables + ImmutablePass *createJumpInstrTableInfoPass(); } #endif diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 4db8686bae78..057082676824 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -894,10 +894,14 @@ namespace llvm { /// indirect operand. bool hasOperand(const SCEV *S, const SCEV *Op) const; + /// Return the size of an element read or written by Inst. + const SCEV *getElementSize(Instruction *Inst); + /// Compute the array dimensions Sizes from the set of Terms extracted from /// the memory access function of this SCEVAddRecExpr. void findArrayDimensions(SmallVectorImpl &Terms, - SmallVectorImpl &Sizes) const; + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const; bool runOnFunction(Function &F) override; void releaseMemory() override; diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h index ba32e928d95e..01b034f8a011 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -362,14 +362,12 @@ namespace llvm { SmallVectorImpl &Terms) const; /// Return in Subscripts the access functions for each dimension in Sizes. - const SCEV * - computeAccessFunctions(ScalarEvolution &SE, - SmallVectorImpl &Subscripts, - SmallVectorImpl &Sizes) const; + void computeAccessFunctions(ScalarEvolution &SE, + SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes) const; /// Split this SCEVAddRecExpr into two vectors of SCEVs representing the - /// subscripts and sizes of an array access. Returns the remainder of the - /// delinearization that is the offset start of the array. + /// subscripts and sizes of an array access. /// /// The delinearization is a 3 step process: the first two steps compute the /// sizes of each subscript and the third step computes the access functions @@ -432,9 +430,10 @@ namespace llvm { /// The subscript of the outermost dimension is the Quotient: [j+k]. /// /// Overall, we have: A[][n][m], and the access function: A[j+k][2i][5i]. - const SCEV *delinearize(ScalarEvolution &SE, - SmallVectorImpl &Subscripts, - SmallVectorImpl &Sizes) const; + void delinearize(ScalarEvolution &SE, + SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const; }; //===--------------------------------------------------------------------===// diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 04c08ab3f46c..10b0f65cbc0c 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -372,7 +372,8 @@ namespace bitc { ATTR_KIND_COLD = 36, ATTR_KIND_OPTIMIZE_NONE = 37, ATTR_KIND_IN_ALLOCA = 38, - ATTR_KIND_NON_NULL = 39 + ATTR_KIND_NON_NULL = 39, + ATTR_KIND_JUMP_TABLE = 40 }; } // End bitc namespace diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h index 4c194a638d4a..0d0d6a71ab14 100644 --- a/include/llvm/Bitcode/ReaderWriter.h +++ b/include/llvm/Bitcode/ReaderWriter.h @@ -30,7 +30,8 @@ namespace llvm { /// deserialization of function bodies. If successful, this takes ownership /// of 'buffer. On error, this *does not* take ownership of Buffer. 
ErrorOr getLazyBitcodeModule(MemoryBuffer *Buffer, - LLVMContext &Context); + LLVMContext &Context, + bool BufferOwned = true); /// getStreamedBitcodeModule - Read the header of the specified stream /// and prepare for lazy deserialization and streaming of function bodies. diff --git a/include/llvm/CMakeLists.txt b/include/llvm/CMakeLists.txt index 0f5c63ded67a..ca4fd1338ed7 100644 --- a/include/llvm/CMakeLists.txt +++ b/include/llvm/CMakeLists.txt @@ -12,3 +12,9 @@ if( MSVC_IDE OR XCODE ) set_target_properties(llvm_headers_do_not_build PROPERTIES FOLDER "Misc" EXCLUDE_FROM_DEFAULT_BUILD ON) endif() + +# If we're doing an out-of-tree build, copy a module map for generated +# header files into the build area. +if (NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + configure_file(module.modulemap.build module.modulemap COPYONLY) +endif (NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h index c3aefd4bcb44..c5060fb5441c 100644 --- a/include/llvm/CodeGen/Analysis.h +++ b/include/llvm/CodeGen/Analysis.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file declares several CodeGen-specific LLVM IR analysis utilties. +// This file declares several CodeGen-specific LLVM IR analysis utilities. // //===----------------------------------------------------------------------===// @@ -86,7 +86,7 @@ ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred); /// between it and the return. /// /// This function only tests target-independent requirements. -bool isInTailCallPosition(ImmutableCallSite CS, const TargetLowering &TLI); +bool isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG); /// Test if given that the input instruction is in the tail call position if the /// return type or any attributes of the function will inhibit tail call diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index b53fb423d82a..e1c9a14c9009 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -52,7 +52,6 @@ class MCSubtargetInfo; class MCSymbol; class MDNode; class DwarfDebug; -class DwarfException; class Mangler; class TargetLoweringObjectFile; class DataLayout; diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h index ac789e4af8a6..449d93418a4c 100644 --- a/include/llvm/CodeGen/CommandFlags.h +++ b/include/llvm/CodeGen/CommandFlags.h @@ -193,6 +193,30 @@ cl::opt StartAfter("start-after", cl::value_desc("pass-name"), cl::init("")); +cl::opt DataSections("data-sections", + cl::desc("Emit data into separate sections"), + cl::init(false)); + +cl::opt +FunctionSections("function-sections", + cl::desc("Emit functions into separate sections"), + cl::init(false)); + +cl::opt +JTableType("jump-table-type", + cl::desc("Choose the type of Jump-Instruction Table for jumptable."), + cl::init(JumpTable::Single), + cl::values( + clEnumValN(JumpTable::Single, "single", + "Create a single table for all jumptable functions"), + clEnumValN(JumpTable::Arity, "arity", + "Create one table per number of parameters."), + clEnumValN(JumpTable::Simplified, "simplified", + "Create one table per simplified function type."), + clEnumValN(JumpTable::Full, "full", + "Create one table per unique function type."), + clEnumValEnd)); + // Common utility function tightly tied to the options listed here. Initializes // a TargetOptions object with CodeGen flags and returns it. 
static inline TargetOptions InitTargetOptionsFromCodeGenFlags() { @@ -215,8 +239,11 @@ static inline TargetOptions InitTargetOptionsFromCodeGenFlags() { Options.TrapFuncName = TrapFuncName; Options.PositionIndependentExecutable = EnablePIE; Options.UseInitArray = UseInitArray; + Options.DataSections = DataSections; + Options.FunctionSections = FunctionSections; Options.MCOptions = InitMCTargetOptionsFromFlags(); + Options.JTType = JTableType; return Options; } diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h index bfeede2a34bf..c7ec6a024b84 100644 --- a/include/llvm/CodeGen/FastISel.h +++ b/include/llvm/CodeGen/FastISel.h @@ -23,6 +23,7 @@ namespace llvm { class AllocaInst; class Constant; class ConstantFP; +class CallInst; class DataLayout; class FunctionLoweringInfo; class Instruction; @@ -373,6 +374,12 @@ class FastISel { /// - \c Add has a constant operand. bool canFoldAddIntoGEP(const User *GEP, const Value *Add); + /// Test whether the given value has exactly one use. + bool hasTrivialKill(const Value *V) const; + + /// \brief Create a machine mem operand from the given instruction. + MachineMemOperand *createMachineMemOperandFor(const Instruction *I) const; + private: bool SelectBinaryOp(const User *I, unsigned ISDOpcode); @@ -409,8 +416,8 @@ class FastISel { /// heavy instructions like calls. void flushLocalValueMap(); - /// Test whether the given value has exactly one use. - bool hasTrivialKill(const Value *V) const; + bool addStackMapLiveVars(SmallVectorImpl &Ops, + const CallInst *CI, unsigned StartIdx); }; } diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 49891b2934c8..80fb8b2d3a5d 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -619,6 +619,12 @@ namespace ISD { /// This corresponds to the cmpxchg instruction. ATOMIC_CMP_SWAP, + /// Val, Success, OUTCHAIN + /// = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) + /// N.b. this is still a strong cmpxchg operation, so + /// Success == "Val == cmp". + ATOMIC_CMP_SWAP_WITH_SUCCESS, + /// Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) /// Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt) /// For double-word atomic operations: diff --git a/include/llvm/CodeGen/JumpInstrTables.h b/include/llvm/CodeGen/JumpInstrTables.h new file mode 100644 index 000000000000..6ca3d7d1765f --- /dev/null +++ b/include/llvm/CodeGen/JumpInstrTables.h @@ -0,0 +1,104 @@ +//===-- JumpInstrTables.h: Jump-Instruction Tables --------------*- C++ -*-===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief An implementation of tables consisting of jump instructions +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_JUMPINSTRTABLES_H +#define LLVM_CODEGEN_JUMPINSTRTABLES_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetOptions.h" + +namespace llvm { +class Constant; +class Function; +class FunctionType; +class JumpInstrTableInfo; +class Module; + +/// A class to manage a set of jump tables indexed on function type. It looks at +/// each function in the module to find all the functions that have the +/// jumptable attribute set. 
For each such function, it creates a new +/// jump-instruction-table function and stores the mapping in the ImmutablePass +/// JumpInstrTableInfo. +/// +/// These special functions get lowered in AsmPrinter to assembly of the form: +/// \verbatim +/// .globl f +/// .type f,@function +/// .align 8,0x90 +/// f: +/// jmp f_orig@PLT +/// \endverbatim +/// +/// Support for an architecture depends on two functions in TargetInstrInfo: +/// getUnconditionalBranch, and getTrap. AsmPrinter uses these to generate the +/// appropriate instructions for the jump statement (an unconditional branch) +/// and for padding to make the table have a size that is a power of two. This +/// padding uses a trap instruction to ensure that calls to this area halt the +/// program. The default implementations of these functions call +/// llvm_unreachable. +class JumpInstrTables : public ModulePass { +public: + static char ID; + + JumpInstrTables(); + JumpInstrTables(JumpTable::JumpTableType JTT); + virtual ~JumpInstrTables(); + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "Jump-Instruction Tables"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Creates a jump-instruction table function for the Target and adds it to + /// the tables. + Function *insertEntry(Module &M, Function *Target); + + /// Checks to see if there is already a table for the given FunctionType. + bool hasTable(FunctionType *FunTy); + +private: + /// The metadata used while a jump table is being built + struct TableMeta { + /// The number of this table + unsigned TableNum; + + /// The current number of jump entries in the table. + unsigned Count; + }; + + typedef DenseMap JumpMap; + + /// Maps the function into a subset of function types, depending on the + /// jump-instruction table style selected from JumpTableTypes in + /// JumpInstrTables.cpp. The choice of mapping determines the number of + /// jump-instruction tables generated by this pass. E.g., the simplest mapping + /// converts every function type into void f(); so, all functions end up in a + /// single table. + FunctionType *transformType(FunctionType *FunTy); + + /// The current state of functions and jump entries in the table(s). + JumpMap Metadata; + + /// The ImmutablePass that stores information about the generated tables. + JumpInstrTableInfo *JITI; + + /// The total number of tables. + unsigned TableCount; + + /// The type of tables to build. + JumpTable::JumpTableType JTType; +}; + +/// Creates a JumpInstrTables pass for the given type of jump table. +ModulePass *createJumpInstrTablesPass(JumpTable::JumpTableType JTT); +} + +#endif /* LLVM_CODEGEN_JUMPINSTRTABLES_H */ diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h index 31d687267416..036aea30a510 100644 --- a/include/llvm/CodeGen/LexicalScopes.h +++ b/include/llvm/CodeGen/LexicalScopes.h @@ -197,6 +197,9 @@ class LexicalScopes { /// dump - Print data structures to dbgs(). void dump(); + /// getOrCreateAbstractScope - Find or create an abstract lexical scope. + LexicalScope *getOrCreateAbstractScope(const MDNode *N); + private: /// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If /// not available then create new lexical scope. @@ -208,9 +211,6 @@ class LexicalScopes { /// getOrCreateInlinedScope - Find or create an inlined lexical scope. LexicalScope *getOrCreateInlinedScope(MDNode *Scope, MDNode *InlinedAt); - /// getOrCreateAbstractScope - Find or create an abstract lexical scope. 
- LexicalScope *getOrCreateAbstractScope(const MDNode *N); - /// extractLexicalScopes - Extract instruction ranges for each lexical scopes /// for the given machine function. void extractLexicalScopes(SmallVectorImpl &MIRanges, diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index ddd623c2a33d..176665bc2566 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -155,6 +155,17 @@ namespace llvm { bool shrinkToUses(LiveInterval *li, SmallVectorImpl *dead = nullptr); + /// \brief Walk the values in the given interval and compute which ones + /// are dead. Dead values are not deleted, however: + /// - Dead PHIDef values are marked as unused. + /// - New dead machine instructions are added to the dead vector. + /// - CanSeparate is set to true if the interval may have been separated + /// into multiple connected components. + void computeDeadValues(LiveInterval *li, + LiveRange &LR, + bool *CanSeparate, + SmallVectorImpl *dead); + /// extendToIndices - Extend the live range of LI to reach all points in /// Indices. The points in the Indices array must be jointly dominated by /// existing defs in LI. PHI-defs are added as needed to maintain SSA form. diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 8709d86fe443..90bdeee46d26 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -219,10 +219,15 @@ class MachineBasicBlock : public ilist_node { unsigned size() const { return (unsigned)Insts.size(); } bool empty() const { return Insts.empty(); } - MachineInstr& front() { return Insts.front(); } - MachineInstr& back() { return Insts.back(); } - const MachineInstr& front() const { return Insts.front(); } - const MachineInstr& back() const { return Insts.back(); } + MachineInstr &instr_front() { return Insts.front(); } + MachineInstr &instr_back() { return Insts.back(); } + const MachineInstr &instr_front() const { return Insts.front(); } + const MachineInstr &instr_back() const { return Insts.back(); } + + MachineInstr &front() { return Insts.front(); } + MachineInstr &back() { return *--end(); } + const MachineInstr &front() const { return Insts.front(); } + const MachineInstr &back() const { return *--end(); } instr_iterator instr_begin() { return Insts.begin(); } const_instr_iterator instr_begin() const { return Insts.begin(); } diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index acd37e11fdb2..b8a28e877f7d 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -624,9 +624,9 @@ class SchedBoundary { SmallVector ReservedCycles; #ifndef NDEBUG - // Remember the greatest operand latency as an upper bound on the number of + // Remember the greatest possible stall as an upper bound on the number of // times we should retry the pending queue because of a hazard. - unsigned MaxObservedLatency; + unsigned MaxObservedStall; #endif public: @@ -739,6 +739,217 @@ class SchedBoundary { #endif }; +/// Base class for GenericScheduler. This class maintains information about +/// scheduling candidates based on TargetSchedModel making it easy to implement +/// heuristics for either preRA or postRA scheduling. +class GenericSchedulerBase : public MachineSchedStrategy { +public: + /// Represent the type of SchedCandidate found within a single queue. 
+ /// pickNodeBidirectional depends on these listed by decreasing priority. + enum CandReason { + NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, + ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, + TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; + +#ifndef NDEBUG + static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); +#endif + + /// Policy for scheduling the next instruction in the candidate's zone. + struct CandPolicy { + bool ReduceLatency; + unsigned ReduceResIdx; + unsigned DemandResIdx; + + CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {} + }; + + /// Status of an instruction's critical resource consumption. + struct SchedResourceDelta { + // Count critical resources in the scheduled region required by SU. + unsigned CritResources; + + // Count critical resources from another region consumed by SU. + unsigned DemandedResources; + + SchedResourceDelta(): CritResources(0), DemandedResources(0) {} + + bool operator==(const SchedResourceDelta &RHS) const { + return CritResources == RHS.CritResources + && DemandedResources == RHS.DemandedResources; + } + bool operator!=(const SchedResourceDelta &RHS) const { + return !operator==(RHS); + } + }; + + /// Store the state used by GenericScheduler heuristics, required for the + /// lifetime of one invocation of pickNode(). + struct SchedCandidate { + CandPolicy Policy; + + // The best SUnit candidate. + SUnit *SU; + + // The reason for this candidate. + CandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + // Register pressure values for the best candidate. + RegPressureDelta RPDelta; + + // Critical resource consumption of the best candidate. + SchedResourceDelta ResDelta; + + SchedCandidate(const CandPolicy &policy) + : Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + RPDelta = Best.RPDelta; + ResDelta = Best.ResDelta; + } + + bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); } + + void initResourceDelta(const ScheduleDAGMI *DAG, + const TargetSchedModel *SchedModel); + }; + +protected: + const MachineSchedContext *Context; + const TargetSchedModel *SchedModel; + const TargetRegisterInfo *TRI; + + SchedRemainder Rem; +protected: + GenericSchedulerBase(const MachineSchedContext *C): + Context(C), SchedModel(nullptr), TRI(nullptr) {} + + void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, + SchedBoundary *OtherZone); + +#ifndef NDEBUG + void traceCandidate(const SchedCandidate &Cand); +#endif +}; + +/// GenericScheduler shrinks the unscheduled zone using heuristics to balance +/// the schedule. +class GenericScheduler : public GenericSchedulerBase { + ScheduleDAGMILive *DAG; + + // State of the top and bottom scheduled instruction boundaries. 
+ SchedBoundary Top; + SchedBoundary Bot; + + MachineSchedPolicy RegionPolicy; +public: + GenericScheduler(const MachineSchedContext *C): + GenericSchedulerBase(C), DAG(nullptr), Top(SchedBoundary::TopQID, "TopQ"), + Bot(SchedBoundary::BotQID, "BotQ") {} + + void initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) override; + + bool shouldTrackPressure() const override { + return RegionPolicy.ShouldTrackPressure; + } + + void initialize(ScheduleDAGMI *dag) override; + + SUnit *pickNode(bool &IsTopNode) override; + + void schedNode(SUnit *SU, bool IsTopNode) override; + + void releaseTopNode(SUnit *SU) override { + Top.releaseTopNode(SU); + } + + void releaseBottomNode(SUnit *SU) override { + Bot.releaseBottomNode(SU); + } + + void registerRoots() override; + +protected: + void checkAcyclicLatency(); + + void tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + RegPressureTracker &TempTracker); + + SUnit *pickNodeBidirectional(bool &IsTopNode); + + void pickNodeFromQueue(SchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Candidate); + + void reschedulePhysRegCopies(SUnit *SU, bool isTop); +}; + +/// PostGenericScheduler - Interface to the scheduling algorithm used by +/// ScheduleDAGMI. +/// +/// Callbacks from ScheduleDAGMI: +/// initPolicy -> initialize(DAG) -> registerRoots -> pickNode ... +class PostGenericScheduler : public GenericSchedulerBase { + ScheduleDAGMI *DAG; + SchedBoundary Top; + SmallVector BotRoots; +public: + PostGenericScheduler(const MachineSchedContext *C): + GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {} + + virtual ~PostGenericScheduler() {} + + void initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) override { + /* no configurable policy */ + }; + + /// PostRA scheduling does not track pressure. + bool shouldTrackPressure() const override { return false; } + + void initialize(ScheduleDAGMI *Dag) override; + + void registerRoots() override; + + SUnit *pickNode(bool &IsTopNode) override; + + void scheduleTree(unsigned SubtreeID) override { + llvm_unreachable("PostRA scheduler does not support subtree analysis."); + } + + void schedNode(SUnit *SU, bool IsTopNode) override; + + void releaseTopNode(SUnit *SU) override { + Top.releaseTopNode(SU); + } + + // Only called for roots. + void releaseBottomNode(SUnit *SU) override { + BotRoots.push_back(SU); + } + +protected: + void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand); + + void pickNodeFromQueue(SchedCandidate &Cand); +}; + } // namespace llvm #endif diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 2deb1aca7617..17477fe6b05d 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -133,10 +133,6 @@ class TargetPassConfig : public ImmutablePass { return *static_cast(TM); } - const TargetLowering *getTargetLowering() const { - return TM->getTargetLowering(); - } - // void setInitialized() { Initialized = true; } @@ -592,6 +588,25 @@ namespace llvm { /// the intrinsic for later emission to the StackMap. extern char &StackMapLivenessID; + /// createJumpInstrTables - This pass creates jump-instruction tables. + ModulePass *createJumpInstrTablesPass(); } // End llvm namespace +/// This initializer registers TargetMachine constructor, so the pass being +/// initialized can use target dependent interfaces. 
Please do not move this +/// macro to be together with INITIALIZE_PASS, which is a complete target +/// independent initializer, and we don't want to make libScalarOpts depend +/// on libCodeGen. +#define INITIALIZE_TM_PASS(passName, arg, name, cfg, analysis) \ + static void* initialize##passName##PassOnce(PassRegistry &Registry) { \ + PassInfo *PI = new PassInfo(name, arg, & passName ::ID, \ + PassInfo::NormalCtor_t(callDefaultCtor< passName >), cfg, analysis, \ + PassInfo::TargetMachineCtor_t(callTargetMachineCtor< passName >)); \ + Registry.registerPass(*PI, true); \ + return PI; \ + } \ + void llvm::initialize##passName##Pass(PassRegistry &Registry) { \ + CALL_ONCE_INITIALIZATION(initialize##passName##PassOnce) \ + } + #endif diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index d9c38c072ddf..db2e841e1720 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -151,8 +151,7 @@ class SDDbgInfo { }; class SelectionDAG; -void checkForCycles(const SDNode *N); -void checkForCycles(const SelectionDAG *DAG); +void checkForCycles(const SelectionDAG *DAG, bool force = false); /// SelectionDAG class - This is used to represent a portion of an LLVM function /// in a low-level Data Dependence DAG representation suitable for instruction @@ -335,7 +334,7 @@ class SelectionDAG { assert((!N.getNode() || N.getValueType() == MVT::Other) && "DAG root value is not a chain!"); if (N.getNode()) - checkForCycles(N.getNode()); + checkForCycles(N.getNode(), this); Root = N; if (N.getNode()) checkForCycles(this); @@ -540,6 +539,12 @@ class SelectionDAG { /// undefined. SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, const int *MaskElts); + SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2, + ArrayRef MaskElts) { + assert(VT.getVectorNumElements() == MaskElts.size() && + "Must have the same number of vector elements as mask elements!"); + return getVectorShuffle(VT, dl, N1, N2, MaskElts.data()); + } /// getAnyExtOrTrunc - Convert Op, which must be of integer type, to the /// integer type VT, by either any-extending or truncating it. 
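As a usage sketch (illustrative only, not part of this patch), the ArrayRef-based ``getVectorShuffle`` convenience overload added above can be handed a fixed mask directly; the names ``DAG``, ``dl``, ``VT``, ``V1`` and ``V2`` below are assumed to be in scope in a target's shuffle-lowering code, and the mask values are arbitrary:

.. code-block:: c++

   // Sketch only: build a shuffle of V1/V2 with an inline 4-element mask.
   // The new overload asserts that the mask length equals VT's element count
   // and then forwards to the existing pointer-based getVectorShuffle.
   static const int Mask[] = {0, 4, 1, 5};
   SDValue Shuffle = DAG.getVectorShuffle(VT, dl, V1, V2, Mask);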
@@ -607,14 +612,14 @@ class SelectionDAG {
   ///
   SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT);
   SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N);
-  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2);
-  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
-                  SDValue N1, SDValue N2, SDValue N3);
-  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
-                  SDValue N1, SDValue N2, SDValue N3, SDValue N4);
-  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
-                  SDValue N1, SDValue N2, SDValue N3, SDValue N4,
-                  SDValue N5);
+  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+                  bool nuw = false, bool nsw = false, bool exact = false);
+  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+                  SDValue N3);
+  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+                  SDValue N3, SDValue N4);
+  SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+                  SDValue N3, SDValue N4, SDValue N5);
   SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops);
   SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef Ops);
@@ -695,20 +700,22 @@ class SelectionDAG {
   SDValue getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV,
                    unsigned Align);
-  /// getAtomic - Gets a node for an atomic op, produces result and chain and
-  /// takes 3 operands
-  SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain,
-                    SDValue Ptr, SDValue Cmp, SDValue Swp,
-                    MachinePointerInfo PtrInfo, unsigned Alignment,
-                    AtomicOrdering SuccessOrdering,
-                    AtomicOrdering FailureOrdering,
-                    SynchronizationScope SynchScope);
-  SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain,
-                    SDValue Ptr, SDValue Cmp, SDValue Swp,
-                    MachineMemOperand *MMO,
-                    AtomicOrdering SuccessOrdering,
-                    AtomicOrdering FailureOrdering,
-                    SynchronizationScope SynchScope);
+  /// getAtomicCmpSwap - Gets a node for an atomic cmpxchg op. There are two
+  /// valid Opcodes. ISD::ATOMIC_CMP_SWAP produces the value loaded and a
+  /// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
+  /// a success flag (initially i1), and a chain.
+  SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs,
+                           SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp,
+                           MachinePointerInfo PtrInfo, unsigned Alignment,
+                           AtomicOrdering SuccessOrdering,
+                           AtomicOrdering FailureOrdering,
+                           SynchronizationScope SynchScope);
+  SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs,
+                           SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp,
+                           MachineMemOperand *MMO,
+                           AtomicOrdering SuccessOrdering,
+                           AtomicOrdering FailureOrdering,
+                           SynchronizationScope SynchScope);
   /// getAtomic - Gets a node for an atomic op, produces result (if relevant)
   /// and chain and takes 2 operands.
@@ -922,7 +929,9 @@ class SelectionDAG {
   /// getNodeIfExists - Get the specified node if it's already available, or
   /// else return NULL.
-  SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef Ops);
+  SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef Ops,
+                          bool nuw = false, bool nsw = false,
+                          bool exact = false);
   /// getDbgValue - Creates a SDDbgValue node.
   ///
@@ -1179,6 +1188,10 @@ class SelectionDAG {
   void allnodes_clear();
+  BinarySDNode *GetBinarySDNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
+                                SDValue N1, SDValue N2, bool nuw, bool nsw,
+                                bool exact);
+
   /// VTList - List of non-single value types.
FoldingSet VTListMap; diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 4f0ddb7c9163..a39d35be6175 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -49,7 +49,26 @@ template struct DenseMapInfo; template struct simplify_type; template struct ilist_traits; -void checkForCycles(const SDNode *N); +/// isBinOpWithFlags - Returns true if the opcode is a binary operation +/// with flags. +static bool isBinOpWithFlags(unsigned Opcode) { + switch (Opcode) { + case ISD::SDIV: + case ISD::UDIV: + case ISD::SRA: + case ISD::SRL: + case ISD::MUL: + case ISD::ADD: + case ISD::SUB: + case ISD::SHL: + return true; + default: + return false; + } +} + +void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr, + bool force = false); /// SDVTList - This represents a list of ValueType's that has been intern'd by /// a SelectionDAG. Instances of this simple value class are returned by @@ -938,6 +957,36 @@ class BinarySDNode : public SDNode { } }; +/// BinaryWithFlagsSDNode - This class is an extension of BinarySDNode +/// used from those opcodes that have associated extra flags. +class BinaryWithFlagsSDNode : public BinarySDNode { + enum { NUW = (1 << 0), NSW = (1 << 1), EXACT = (1 << 2) }; + +public: + BinaryWithFlagsSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, + SDValue X, SDValue Y) + : BinarySDNode(Opc, Order, dl, VTs, X, Y) {} + /// getRawSubclassData - Return the SubclassData value, which contains an + /// encoding of the flags. + /// This function should be used to add subclass data to the NodeID value. + unsigned getRawSubclassData() const { return SubclassData; } + void setHasNoUnsignedWrap(bool b) { + SubclassData = (SubclassData & ~NUW) | (b ? NUW : 0); + } + void setHasNoSignedWrap(bool b) { + SubclassData = (SubclassData & ~NSW) | (b ? NSW : 0); + } + void setIsExact(bool b) { + SubclassData = (SubclassData & ~EXACT) | (b ? EXACT : 0); + } + bool hasNoUnsignedWrap() const { return SubclassData & NUW; } + bool hasNoSignedWrap() const { return SubclassData & NSW; } + bool isExact() const { return SubclassData & EXACT; } + static bool classof(const SDNode *N) { + return isBinOpWithFlags(N->getOpcode()); + } +}; + /// TernarySDNode - This class is used for three-operand SDNodes. This is solely /// to allow co-allocation of node operands with the node itself. 
class TernarySDNode : public SDNode { @@ -1077,6 +1126,7 @@ class MemSDNode : public SDNode { N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::PREFETCH || N->getOpcode() == ISD::ATOMIC_CMP_SWAP || + N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS || N->getOpcode() == ISD::ATOMIC_SWAP || N->getOpcode() == ISD::ATOMIC_LOAD_ADD || N->getOpcode() == ISD::ATOMIC_LOAD_SUB || @@ -1185,12 +1235,13 @@ class AtomicSDNode : public MemSDNode { bool isCompareAndSwap() const { unsigned Op = getOpcode(); - return Op == ISD::ATOMIC_CMP_SWAP; + return Op == ISD::ATOMIC_CMP_SWAP || Op == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS; } // Methods to support isa and dyn_cast static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ATOMIC_CMP_SWAP || + N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS || N->getOpcode() == ISD::ATOMIC_SWAP || N->getOpcode() == ISD::ATOMIC_LOAD_ADD || N->getOpcode() == ISD::ATOMIC_LOAD_SUB || diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 9f1cbaa7a5a5..230d1ed51a94 100644 --- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -68,11 +68,9 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { void InitializeELF(bool UseInitArray_); const MCSection *getStaticCtorSection(unsigned Priority, - const MCSymbol *KeySym, - const MCSection *KeySec) const override; + const MCSymbol *KeySym) const override; const MCSection *getStaticDtorSection(unsigned Priority, - const MCSymbol *KeySym, - const MCSection *KeySec) const override; + const MCSymbol *KeySym) const override; }; @@ -144,11 +142,9 @@ class TargetLoweringObjectFileCOFF : public TargetLoweringObjectFile { Mangler &Mang, const TargetMachine &TM) const override; const MCSection *getStaticCtorSection(unsigned Priority, - const MCSymbol *KeySym, - const MCSection *KeySec) const override; + const MCSymbol *KeySym) const override; const MCSection *getStaticDtorSection(unsigned Priority, - const MCSymbol *KeySym, - const MCSection *KeySec) const override; + const MCSymbol *KeySym) const override; }; } // end namespace llvm diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake index ea1e75a76a08..e9f6702f8180 100644 --- a/include/llvm/Config/config.h.cmake +++ b/include/llvm/Config/config.h.cmake @@ -3,20 +3,14 @@ #ifndef CONFIG_H #define CONFIG_H -/* Bug report URL. */ -#define BUG_REPORT_URL "${BUG_REPORT_URL}" - -/* Define if we have libxml2 */ -#cmakedefine CLANG_HAVE_LIBXML ${CLANG_HAVE_LIBXML} - -/* Relative directory for resource files */ -#define CLANG_RESOURCE_DIR "${CLANG_RESOURCE_DIR}" +/* Exported configuration */ +#include "llvm/Config/llvm-config.h" -/* Directories clang will search for headers */ -#define C_INCLUDE_DIRS "${C_INCLUDE_DIRS}" +/* Patch version of the LLVM API */ +#cmakedefine LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH} -/* Default to all compiler invocations for --sysroot=. */ -#undef DEFAULT_SYSROOT +/* Bug report URL. */ +#define BUG_REPORT_URL "${BUG_REPORT_URL}" /* Define if you want backtraces on crash */ #cmakedefine ENABLE_BACKTRACES @@ -30,9 +24,6 @@ /* Define if timestamp information (e.g., __DATE__) is allowed */ #cmakedefine ENABLE_TIMESTAMPS ${ENABLE_TIMESTAMPS} -/* Directory where gcc is installed. */ -#undef GCC_INSTALL_PREFIX - /* Define to 1 if you have the `arc4random' function. */ #cmakedefine HAVE_ARC4RANDOM @@ -45,9 +36,6 @@ /* Define to 1 if you have the `ceilf' function. 
*/ #cmakedefine HAVE_CEILF ${HAVE_CEILF} -/* Define if the neat program is available */ -#cmakedefine HAVE_CIRCO ${HAVE_CIRCO} - /* Define to 1 if you have the `closedir' function. */ #cmakedefine HAVE_CLOSEDIR ${HAVE_CLOSEDIR} @@ -80,12 +68,6 @@ /* Define if dlopen() is available on this platform. */ #cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN} -/* Define if the dot program is available */ -#cmakedefine HAVE_DOT ${HAVE_DOT} - -/* Define if the dotty program is available */ -#cmakedefine HAVE_DOTTY ${HAVE_DOTTY} - /* Define if you have the _dyld_func_lookup function. */ #undef HAVE_DYLD @@ -98,9 +80,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_FCNTL_H ${HAVE_FCNTL_H} -/* Define if the neat program is available */ -#cmakedefine HAVE_FDP ${HAVE_FDP} - /* Define to 1 if you have the header file. */ #cmakedefine HAVE_FENV_H ${HAVE_FENV_H} @@ -161,12 +140,6 @@ /* Define to 1 if you have the `gettimeofday' function. */ #cmakedefine HAVE_GETTIMEOFDAY ${HAVE_GETTIMEOFDAY} -/* Define if the Graphviz program is available */ -#cmakedefine HAVE_GRAPHVIZ ${HAVE_GRAPHVIZ} - -/* Define if the gv program is available */ -#cmakedefine HAVE_GV ${HAVE_GV} - /* Define to 1 if the system has the type `int64_t'. */ #cmakedefine HAVE_INT64_T ${HAVE_INT64_T} @@ -271,9 +244,6 @@ /* Define to 1 if you have the `nearbyintf' function. */ #cmakedefine HAVE_NEARBYINTF ${HAVE_NEARBYINTF} -/* Define if the neat program is available */ -#cmakedefine HAVE_NEATO ${HAVE_NEATO} - /* Define to 1 if you have the `opendir' function. */ #cmakedefine HAVE_OPENDIR ${HAVE_OPENDIR} @@ -417,9 +387,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H} -/* Define if the neat program is available */ -#cmakedefine HAVE_TWOPI ${HAVE_TWOPI} - /* Define to 1 if the system has the type `uint64_t'. */ #cmakedefine HAVE_UINT64_T ${HAVE_UINT64_T} @@ -438,9 +405,6 @@ /* Define to 1 if you have the `writev' function. */ #cmakedefine HAVE_WRITEV ${HAVE_WRITEV} -/* Define if the xdot.py program is available */ -#cmakedefine HAVE_XDOT ${HAVE_XDOT} - /* Define to 1 if you have the header file. 
*/ #cmakedefine HAVE_ZLIB_H ${HAVE_ZLIB_H} @@ -501,114 +465,9 @@ /* Define if we link Polly to the tools */ #cmakedefine LINK_POLLY_INTO_TOOLS -/* Installation directory for binary executables */ -#cmakedefine LLVM_BINDIR "${LLVM_BINDIR}" - -/* Time at which LLVM was configured */ -#cmakedefine LLVM_CONFIGTIME "${LLVM_CONFIGTIME}" - -/* Installation directory for data files */ -#cmakedefine LLVM_DATADIR "${LLVM_DATADIR}" - -/* Target triple LLVM will generate code for by default */ -#cmakedefine LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}" - -/* Installation directory for documentation */ -#cmakedefine LLVM_DOCSDIR "${LLVM_DOCSDIR}" - -/* Define if threads enabled */ -#cmakedefine01 LLVM_ENABLE_THREADS - /* Define if zlib compression is available */ #cmakedefine01 LLVM_ENABLE_ZLIB -/* Installation directory for config files */ -#cmakedefine LLVM_ETCDIR "${LLVM_ETCDIR}" - -/* Has gcc/MSVC atomic intrinsics */ -#cmakedefine01 LLVM_HAS_ATOMICS - -/* Host triple LLVM will be executed on */ -#cmakedefine LLVM_HOST_TRIPLE "${LLVM_HOST_TRIPLE}" - -/* Installation directory for include files */ -#cmakedefine LLVM_INCLUDEDIR "${LLVM_INCLUDEDIR}" - -/* Installation directory for .info files */ -#cmakedefine LLVM_INFODIR "${LLVM_INFODIR}" - -/* Installation directory for man pages */ -#cmakedefine LLVM_MANDIR "${LLVM_MANDIR}" - -/* LLVM architecture name for the native architecture, if available */ -#cmakedefine LLVM_NATIVE_ARCH ${LLVM_NATIVE_ARCH} - -/* LLVM name for the native AsmParser init function, if available */ -#cmakedefine LLVM_NATIVE_ASMPARSER LLVMInitialize${LLVM_NATIVE_ARCH}AsmParser - -/* LLVM name for the native AsmPrinter init function, if available */ -#cmakedefine LLVM_NATIVE_ASMPRINTER LLVMInitialize${LLVM_NATIVE_ARCH}AsmPrinter - -/* LLVM name for the native Disassembler init function, if available */ -#cmakedefine LLVM_NATIVE_DISASSEMBLER LLVMInitialize${LLVM_NATIVE_ARCH}Disassembler - -/* LLVM name for the native Target init function, if available */ -#cmakedefine LLVM_NATIVE_TARGET LLVMInitialize${LLVM_NATIVE_ARCH}Target - -/* LLVM name for the native TargetInfo init function, if available */ -#cmakedefine LLVM_NATIVE_TARGETINFO LLVMInitialize${LLVM_NATIVE_ARCH}TargetInfo - -/* LLVM name for the native target MC init function, if available */ -#cmakedefine LLVM_NATIVE_TARGETMC LLVMInitialize${LLVM_NATIVE_ARCH}TargetMC - -/* Define if this is Unixish platform */ -#cmakedefine LLVM_ON_UNIX ${LLVM_ON_UNIX} - -/* Define if this is Win32ish platform */ -#cmakedefine LLVM_ON_WIN32 ${LLVM_ON_WIN32} - -/* Define to path to circo program if found or 'echo circo' otherwise */ -#cmakedefine LLVM_PATH_CIRCO "${LLVM_PATH_CIRCO}" - -/* Define to path to dot program if found or 'echo dot' otherwise */ -#cmakedefine LLVM_PATH_DOT "${LLVM_PATH_DOT}" - -/* Define to path to dotty program if found or 'echo dotty' otherwise */ -#cmakedefine LLVM_PATH_DOTTY "${LLVM_PATH_DOTTY}" - -/* Define to path to fdp program if found or 'echo fdp' otherwise */ -#cmakedefine LLVM_PATH_FDP "${LLVM_PATH_FDP}" - -/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */ -#cmakedefine LLVM_PATH_GRAPHVIZ "${LLVM_PATH_GRAPHVIZ}" - -/* Define to path to gv program if found or 'echo gv' otherwise */ -#cmakedefine LLVM_PATH_GV "${LLVM_PATH_GV}" - -/* Define to path to neato program if found or 'echo neato' otherwise */ -#cmakedefine LLVM_PATH_NEATO "${LLVM_PATH_NEATO}" - -/* Define to path to twopi program if found or 'echo twopi' otherwise */ -#cmakedefine LLVM_PATH_TWOPI 
"${LLVM_PATH_TWOPI}" - -/* Define to path to xdot.py program if found or 'echo xdot' otherwise */ -#cmakedefine LLVM_PATH_XDOT "${LLVM_PATH_XDOT}" - -/* Installation prefix directory */ -#cmakedefine LLVM_PREFIX "${LLVM_PREFIX}" - -/* Define if we have the Intel JIT API runtime support library */ -#cmakedefine LLVM_USE_INTEL_JITEVENTS 1 - -/* Define if we have the oprofile JIT-support library */ -#cmakedefine LLVM_USE_OPROFILE 1 - -/* Major version of the LLVM API */ -#cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR} - -/* Minor version of the LLVM API */ -#cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR} - /* Define if the OS needs help to load dependent libraries for dlopen(). */ #cmakedefine LTDL_DLOPEN_DEPLIBS ${LTDL_DLOPEN_DEPLIBS} @@ -689,7 +548,7 @@ /* Define to 1 if you have the `_chsize_s' function. */ #cmakedefine HAVE__CHSIZE_S ${HAVE__CHSIZE_S} -/* Added by Kevin -- Maximum path length */ +/* Maximum path length */ #cmakedefine MAXPATHLEN ${MAXPATHLEN} #endif diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in index 6b8dbb7c772f..b5f72977c22d 100644 --- a/include/llvm/Config/config.h.in +++ b/include/llvm/Config/config.h.in @@ -3,20 +3,14 @@ #ifndef CONFIG_H #define CONFIG_H -/* Bug report URL. */ -#undef BUG_REPORT_URL - -/* Define if we have libxml2 */ -#undef CLANG_HAVE_LIBXML - -/* Relative directory for resource files */ -#undef CLANG_RESOURCE_DIR +/* Exported configuration */ +#include "llvm/Config/llvm-config.h" -/* Directories clang will search for headers */ -#undef C_INCLUDE_DIRS +/* Patch version of the LLVM API */ +#undef LLVM_VERSION_PATCH -/* Default to all compiler invocations for --sysroot=. */ -#undef DEFAULT_SYSROOT +/* Bug report URL. */ +#undef BUG_REPORT_URL /* Define if you want backtraces on crash */ #undef ENABLE_BACKTRACES @@ -30,18 +24,12 @@ /* Define if timestamp information (e.g., __DATE__) is allowed */ #undef ENABLE_TIMESTAMPS -/* Directory where gcc is installed. */ -#undef GCC_INSTALL_PREFIX - /* Define to 1 if you have the `backtrace' function. */ #undef HAVE_BACKTRACE /* Define to 1 if you have the `ceilf' function. */ #undef HAVE_CEILF -/* Define if the neat program is available */ -#undef HAVE_CIRCO - /* Define to 1 if you have the header file. */ #undef HAVE_CRASHREPORTERCLIENT_H @@ -77,12 +65,6 @@ /* Define if dlopen() is available on this platform. */ #undef HAVE_DLOPEN -/* Define if the dot program is available */ -#undef HAVE_DOT - -/* Define if the dotty program is available */ -#undef HAVE_DOTTY - /* Define to 1 if you have the header file. */ #undef HAVE_ERRNO_H @@ -98,9 +80,6 @@ /* Define to 1 if you have the header file. */ #undef HAVE_FCNTL_H -/* Define if the neat program is available */ -#undef HAVE_FDP - /* Define to 1 if you have the header file. */ #undef HAVE_FENV_H @@ -143,12 +122,6 @@ /* Define to 1 if you have the `gettimeofday' function. */ #undef HAVE_GETTIMEOFDAY -/* Define if the Graphviz program is available */ -#undef HAVE_GRAPHVIZ - -/* Define if the gv program is available */ -#undef HAVE_GV - /* Define to 1 if the system has the type `int64_t'. */ #undef HAVE_INT64_T @@ -259,9 +232,6 @@ /* Define to 1 if you have the `nearbyintf' function. */ #undef HAVE_NEARBYINTF -/* Define if the neat program is available */ -#undef HAVE_NEATO - /* Define to 1 if you have the `posix_spawn' function. */ #undef HAVE_POSIX_SPAWN @@ -402,9 +372,6 @@ /* Define to 1 if you have the header file. 
*/ #undef HAVE_TERMIOS_H -/* Define if the neat program is available */ -#undef HAVE_TWOPI - /* Define to 1 if the system has the type `uint64_t'. */ #undef HAVE_UINT64_T @@ -423,9 +390,6 @@ /* Define to 1 if you have the `writev' function. */ #undef HAVE_WRITEV -/* Define if the xdot program is available */ -#undef HAVE_XDOT - /* Define to 1 if you have the header file. */ #undef HAVE_ZLIB_H @@ -483,117 +447,9 @@ /* Linker version detected at compile time. */ #undef HOST_LINK_VERSION -/* Installation directory for binary executables */ -#undef LLVM_BINDIR - -/* Time at which LLVM was configured */ -#undef LLVM_CONFIGTIME - -/* Installation directory for data files */ -#undef LLVM_DATADIR - -/* Target triple LLVM will generate code for by default */ -#undef LLVM_DEFAULT_TARGET_TRIPLE - -/* Installation directory for documentation */ -#undef LLVM_DOCSDIR - -/* Define if threads enabled */ -#undef LLVM_ENABLE_THREADS - /* Define if zlib is enabled */ #undef LLVM_ENABLE_ZLIB -/* Installation directory for config files */ -#undef LLVM_ETCDIR - -/* Has gcc/MSVC atomic intrinsics */ -#undef LLVM_HAS_ATOMICS - -/* Host triple LLVM will be executed on */ -#undef LLVM_HOST_TRIPLE - -/* Installation directory for include files */ -#undef LLVM_INCLUDEDIR - -/* Installation directory for .info files */ -#undef LLVM_INFODIR - -/* Installation directory for man pages */ -#undef LLVM_MANDIR - -/* LLVM architecture name for the native architecture, if available */ -#undef LLVM_NATIVE_ARCH - -/* LLVM name for the native AsmParser init function, if available */ -#undef LLVM_NATIVE_ASMPARSER - -/* LLVM name for the native AsmPrinter init function, if available */ -#undef LLVM_NATIVE_ASMPRINTER - -/* LLVM name for the native Disassembler init function, if available */ -#undef LLVM_NATIVE_DISASSEMBLER - -/* LLVM name for the native Target init function, if available */ -#undef LLVM_NATIVE_TARGET - -/* LLVM name for the native TargetInfo init function, if available */ -#undef LLVM_NATIVE_TARGETINFO - -/* LLVM name for the native target MC init function, if available */ -#undef LLVM_NATIVE_TARGETMC - -/* Define if this is Unixish platform */ -#undef LLVM_ON_UNIX - -/* Define if this is Win32ish platform */ -#undef LLVM_ON_WIN32 - -/* Define to path to circo program if found or 'echo circo' otherwise */ -#undef LLVM_PATH_CIRCO - -/* Define to path to dot program if found or 'echo dot' otherwise */ -#undef LLVM_PATH_DOT - -/* Define to path to dotty program if found or 'echo dotty' otherwise */ -#undef LLVM_PATH_DOTTY - -/* Define to path to fdp program if found or 'echo fdp' otherwise */ -#undef LLVM_PATH_FDP - -/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */ -#undef LLVM_PATH_GRAPHVIZ - -/* Define to path to gv program if found or 'echo gv' otherwise */ -#undef LLVM_PATH_GV - -/* Define to path to neato program if found or 'echo neato' otherwise */ -#undef LLVM_PATH_NEATO - -/* Define to path to twopi program if found or 'echo twopi' otherwise */ -#undef LLVM_PATH_TWOPI - -/* Define to path to xdot program if found or 'echo xdot' otherwise */ -#undef LLVM_PATH_XDOT - -/* Installation prefix directory */ -#undef LLVM_PREFIX - -/* Define if we have the Intel JIT API runtime support library */ -#undef LLVM_USE_INTEL_JITEVENTS - -/* Define if we have the oprofile JIT-support library */ -#undef LLVM_USE_OPROFILE - -/* Major version of the LLVM API */ -#undef LLVM_VERSION_MAJOR - -/* Minor version of the LLVM API */ -#undef LLVM_VERSION_MINOR - -/* Patch version of the LLVM API */ 
-#undef LLVM_VERSION_PATCH - /* The shared library extension */ #undef LTDL_SHLIB_EXT diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake index 65116cb469cc..58111644ffa9 100644 --- a/include/llvm/Config/llvm-config.h.cmake +++ b/include/llvm/Config/llvm-config.h.cmake @@ -1,4 +1,4 @@ -/*===-- llvm/config/llvm-config.h - llvm configure variable -------*- C -*-===*/ +/*===------- llvm/Config/llvm-config.h - llvm configuration -------*- C -*-===*/ /* */ /* The LLVM Compiler Infrastructure */ /* */ @@ -7,14 +7,12 @@ /* */ /*===----------------------------------------------------------------------===*/ -/* This file enumerates all of the llvm variables from configure so that - they can be in exported headers and won't override package specific - directives. This is a C file so we can include it in the llvm-c headers. */ +/* This file enumerates variables from the LLVM configuration so that they + can be in exported headers and won't override package specific directives. + This is a C header that can be included in the llvm-c headers. */ -/* To avoid multiple inclusions of these variables when we include the exported - headers and config.h, conditionally include these. */ -/* TODO: This is a bit of a hack. */ -#ifndef CONFIG_H +#ifndef LLVM_CONFIG_H +#define LLVM_CONFIG_H /* Installation directory for binary executables */ #cmakedefine LLVM_BINDIR "${LLVM_BINDIR}" @@ -79,33 +77,6 @@ /* Define if this is Win32ish platform */ #cmakedefine LLVM_ON_WIN32 ${LLVM_ON_WIN32} -/* Define to path to circo program if found or 'echo circo' otherwise */ -#cmakedefine LLVM_PATH_CIRCO "${LLVM_PATH_CIRCO}" - -/* Define to path to dot program if found or 'echo dot' otherwise */ -#cmakedefine LLVM_PATH_DOT "${LLVM_PATH_DOT}" - -/* Define to path to dotty program if found or 'echo dotty' otherwise */ -#cmakedefine LLVM_PATH_DOTTY "${LLVM_PATH_DOTTY}" - -/* Define to path to fdp program if found or 'echo fdp' otherwise */ -#cmakedefine LLVM_PATH_FDP "${LLVM_PATH_FDP}" - -/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */ -#cmakedefine LLVM_PATH_GRAPHVIZ "${LLVM_PATH_GRAPHVIZ}" - -/* Define to path to gv program if found or 'echo gv' otherwise */ -#cmakedefine LLVM_PATH_GV "${LLVM_PATH_GV}" - -/* Define to path to neato program if found or 'echo neato' otherwise */ -#cmakedefine LLVM_PATH_NEATO "${LLVM_PATH_NEATO}" - -/* Define to path to twopi program if found or 'echo twopi' otherwise */ -#cmakedefine LLVM_PATH_TWOPI "${LLVM_PATH_TWOPI}" - -/* Define to path to xdot.py program if found or 'echo xdot.py' otherwise */ -#cmakedefine LLVM_PATH_XDOT_PY "${LLVM_PATH_XDOT_PY}" - /* Installation prefix directory */ #cmakedefine LLVM_PREFIX "${LLVM_PREFIX}" diff --git a/include/llvm/Config/llvm-config.h.in b/include/llvm/Config/llvm-config.h.in index a4fae5537abe..5656240eb127 100644 --- a/include/llvm/Config/llvm-config.h.in +++ b/include/llvm/Config/llvm-config.h.in @@ -1,4 +1,4 @@ -/*===-- llvm/config/llvm-config.h - llvm configure variable -------*- C -*-===*/ +/*===------- llvm/Config/llvm-config.h - llvm configuration -------*- C -*-===*/ /* */ /* The LLVM Compiler Infrastructure */ /* */ @@ -7,14 +7,12 @@ /* */ /*===----------------------------------------------------------------------===*/ -/* This file enumerates all of the llvm variables from configure so that - they can be in exported headers and won't override package specific - directives. This is a C file so we can include it in the llvm-c headers. 
*/ +/* This file enumerates variables from the LLVM configuration so that they + can be in exported headers and won't override package specific directives. + This is a C header that can be included in the llvm-c headers. */ -/* To avoid multiple inclusions of these variables when we include the exported - headers and config.h, conditionally include these. */ -/* TODO: This is a bit of a hack. */ -#ifndef CONFIG_H +#ifndef LLVM_CONFIG_H +#define LLVM_CONFIG_H /* Installation directory for binary executables */ #undef LLVM_BINDIR @@ -79,33 +77,6 @@ /* Define if this is Win32ish platform */ #undef LLVM_ON_WIN32 -/* Define to path to circo program if found or 'echo circo' otherwise */ -#undef LLVM_PATH_CIRCO - -/* Define to path to dot program if found or 'echo dot' otherwise */ -#undef LLVM_PATH_DOT - -/* Define to path to dotty program if found or 'echo dotty' otherwise */ -#undef LLVM_PATH_DOTTY - -/* Define to path to fdp program if found or 'echo fdp' otherwise */ -#undef LLVM_PATH_FDP - -/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */ -#undef LLVM_PATH_GRAPHVIZ - -/* Define to path to gv program if found or 'echo gv' otherwise */ -#undef LLVM_PATH_GV - -/* Define to path to neato program if found or 'echo neato' otherwise */ -#undef LLVM_PATH_NEATO - -/* Define to path to twopi program if found or 'echo twopi' otherwise */ -#undef LLVM_PATH_TWOPI - -/* Define to path to xdot.py program if found or 'echo xdot.py' otherwise */ -#undef LLVM_PATH_XDOT_PY - /* Installation prefix directory */ #undef LLVM_PREFIX diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h index 7518c1e1d87b..e5dab6191ab6 100644 --- a/include/llvm/ExecutionEngine/ExecutionEngine.h +++ b/include/llvm/ExecutionEngine/ExecutionEngine.h @@ -54,7 +54,7 @@ namespace object { } /// \brief Helper class for helping synchronize access to the global address map -/// table. +/// table. Access to this class should be serialized under a mutex. class ExecutionEngineState { public: struct AddressMapConfig : public ValueMapConfig { @@ -84,19 +84,19 @@ class ExecutionEngineState { public: ExecutionEngineState(ExecutionEngine &EE); - GlobalAddressMapTy &getGlobalAddressMap(const MutexGuard &) { + GlobalAddressMapTy &getGlobalAddressMap() { return GlobalAddressMap; } std::map > & - getGlobalAddressReverseMap(const MutexGuard &) { + getGlobalAddressReverseMap() { return GlobalAddressReverseMap; } /// \brief Erase an entry from the mapping table. /// /// \returns The address that \p ToUnmap was happed to. - void *RemoveMapping(const MutexGuard &, const GlobalValue *ToUnmap); + void *RemoveMapping(const GlobalValue *ToUnmap); }; /// \brief Abstract interface for implementation execution of LLVM modules, @@ -586,26 +586,7 @@ class EngineBuilder { bool VerifyModules; /// InitEngine - Does the common initialization of default options. - void InitEngine() { - WhichEngine = EngineKind::Either; - ErrorStr = nullptr; - OptLevel = CodeGenOpt::Default; - MCJMM = nullptr; - JMM = nullptr; - Options = TargetOptions(); - AllocateGVsWithCode = false; - RelocModel = Reloc::Default; - CMModel = CodeModel::JITDefault; - UseMCJIT = false; - - // IR module verification is enabled by default in debug builds, and disabled - // by default in release builds. -#ifndef NDEBUG - VerifyModules = true; -#else - VerifyModules = false; -#endif - } + void InitEngine(); public: /// EngineBuilder - Constructor for EngineBuilder. 
If create() is called and diff --git a/include/llvm/ExecutionEngine/SectionMemoryManager.h b/include/llvm/ExecutionEngine/SectionMemoryManager.h index f24bb4d5a52a..136856390b21 100644 --- a/include/llvm/ExecutionEngine/SectionMemoryManager.h +++ b/include/llvm/ExecutionEngine/SectionMemoryManager.h @@ -21,7 +21,6 @@ #include "llvm/Support/Memory.h" namespace llvm { - /// This is a simple memory manager which implements the methods called by /// the RuntimeDyld class to allocate memory for section-based loading of /// objects, usually those generated by the MCJIT execution engine. @@ -93,8 +92,8 @@ class SectionMemoryManager : public RTDyldMemoryManager { uint8_t *allocateSection(MemoryGroup &MemGroup, uintptr_t Size, unsigned Alignment); - error_code applyMemoryGroupPermissions(MemoryGroup &MemGroup, - unsigned Permissions); + std::error_code applyMemoryGroupPermissions(MemoryGroup &MemGroup, + unsigned Permissions); MemoryGroup CodeMem; MemoryGroup RWDataMem; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 86f9cc88941c..e34dc83a5ba3 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -75,6 +75,7 @@ class Attribute { Cold, ///< Marks function as being in a cold path. InlineHint, ///< Source said inlining was desirable InReg, ///< Force argument to be passed in register + JumpTable, ///< Build jump-instruction tables and replace refs. MinSize, ///< Function must be optimized for size first Naked, ///< Naked function Nest, ///< Nested function static chain diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h index f03e3dd45537..39c7c37dafd5 100644 --- a/include/llvm/IR/Constant.h +++ b/include/llvm/IR/Constant.h @@ -163,6 +163,14 @@ class Constant : public User { /// that want to check to see if a global is unused, but don't want to deal /// with potentially dead constants hanging off of the globals. void removeDeadConstantUsers() const; + + Constant *stripPointerCasts() { + return cast(Value::stripPointerCasts()); + } + + const Constant *stripPointerCasts() const { + return const_cast(this)->stripPointerCasts(); + } }; } // End llvm namespace diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h index 6d769d4055c8..3d969a8b7532 100644 --- a/include/llvm/IR/DebugLoc.h +++ b/include/llvm/IR/DebugLoc.h @@ -95,7 +95,7 @@ namespace llvm { // getFnDebugLoc - Walk up the scope chain of given debug loc and find line // number info for the function. - DebugLoc getFnDebugLoc(const LLVMContext &Ctx); + DebugLoc getFnDebugLoc(const LLVMContext &Ctx) const; /// getAsMDNode - This method converts the compressed DebugLoc node into a /// DILocation compatible MDNode. diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index 604de1fed93d..de38d07f903c 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -26,6 +26,7 @@ namespace llvm { class DiagnosticPrinter; class Function; class Instruction; +class LLVMContextImpl; class Twine; class Value; class DebugLoc; @@ -48,6 +49,8 @@ enum DiagnosticKind { DK_DebugMetadataVersion, DK_SampleProfile, DK_OptimizationRemark, + DK_OptimizationRemarkMissed, + DK_OptimizationRemarkAnalysis, DK_FirstPluginKind }; @@ -135,7 +138,6 @@ class DiagnosticInfoInlineAsm : public DiagnosticInfo { /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; - /// Hand rolled RTTI. 
static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_InlineAsm; } @@ -163,7 +165,6 @@ class DiagnosticInfoStackSize : public DiagnosticInfo { /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; - /// Hand rolled RTTI. static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_StackSize; } @@ -192,7 +193,6 @@ class DiagnosticInfoDebugMetadataVersion : public DiagnosticInfo { /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; - /// Hand rolled RTTI. static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_DebugMetadataVersion; } @@ -218,7 +218,6 @@ class DiagnosticInfoSampleProfile : public DiagnosticInfo { /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; - /// Hand rolled RTTI. static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_SampleProfile; } @@ -239,30 +238,36 @@ class DiagnosticInfoSampleProfile : public DiagnosticInfo { const Twine &Msg; }; -/// Diagnostic information for optimization remarks. -class DiagnosticInfoOptimizationRemark : public DiagnosticInfo { +/// Common features for diagnostics dealing with optimization remarks. +class DiagnosticInfoOptimizationRemarkBase : public DiagnosticInfo { public: - /// \p PassName is the name of the pass emitting this diagnostic. If - /// this name matches the regular expression given in -Rpass=, then the - /// diagnostic will be emitted. \p Fn is the function where the diagnostic - /// is being emitted. \p DLoc is the location information to use in the - /// diagnostic. If line table information is available, the diagnostic - /// will include the source code location. \p Msg is the message to show. - /// Note that this class does not copy this message, so this reference - /// must be valid for the whole life time of the diagnostic. - DiagnosticInfoOptimizationRemark(const char *PassName, const Function &Fn, - const DebugLoc &DLoc, const Twine &Msg) - : DiagnosticInfo(DK_OptimizationRemark, DS_Remark), PassName(PassName), - Fn(Fn), DLoc(DLoc), Msg(Msg) {} + /// \p PassName is the name of the pass emitting this diagnostic. + /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is + /// the location information to use in the diagnostic. If line table + /// information is available, the diagnostic will include the source code + /// location. \p Msg is the message to show. Note that this class does not + /// copy this message, so this reference must be valid for the whole life time + /// of the diagnostic. + DiagnosticInfoOptimizationRemarkBase(enum DiagnosticKind Kind, + const char *PassName, const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) + : DiagnosticInfo(Kind, DS_Remark), PassName(PassName), Fn(Fn), DLoc(DLoc), + Msg(Msg) {} /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; - /// Hand rolled RTTI. static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_OptimizationRemark; } + /// Return true if this optimization remark is enabled by one + /// of the LLVM command line flags (-pass-remarks, -pass-remarks-missed, + /// or -pass-remarks-analysis). Note that this only handles the LLVM + /// flags. We cannot access Clang flags from here (they are handled + /// in BackendConsumer::OptimizationRemarkHandler). + virtual bool isEnabled() const = 0; + /// Return true if location information is available for this diagnostic.
bool isLocationAvailable() const; @@ -296,9 +301,116 @@ class DiagnosticInfoOptimizationRemark : public DiagnosticInfo { const Twine &Msg; }; +/// Diagnostic information for applied optimization remarks. +class DiagnosticInfoOptimizationRemark + : public DiagnosticInfoOptimizationRemarkBase { +public: + /// \p PassName is the name of the pass emitting this diagnostic. If + /// this name matches the regular expression given in -Rpass=, then the + /// diagnostic will be emitted. \p Fn is the function where the diagnostic + /// is being emitted. \p DLoc is the location information to use in the + /// diagnostic. If line table information is available, the diagnostic + /// will include the source code location. \p Msg is the message to show. + /// Note that this class does not copy this message, so this reference + /// must be valid for the whole life time of the diagnostic. + DiagnosticInfoOptimizationRemark(const char *PassName, const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) + : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemark, PassName, + Fn, DLoc, Msg) {} + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemark; + } + + /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled. + virtual bool isEnabled() const override; +}; + +/// Diagnostic information for missed-optimization remarks. +class DiagnosticInfoOptimizationRemarkMissed + : public DiagnosticInfoOptimizationRemarkBase { +public: + /// \p PassName is the name of the pass emitting this diagnostic. If + /// this name matches the regular expression given in -Rpass-missed=, then the + /// diagnostic will be emitted. \p Fn is the function where the diagnostic + /// is being emitted. \p DLoc is the location information to use in the + /// diagnostic. If line table information is available, the diagnostic + /// will include the source code location. \p Msg is the message to show. + /// Note that this class does not copy this message, so this reference + /// must be valid for the whole life time of the diagnostic. + DiagnosticInfoOptimizationRemarkMissed(const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) + : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkMissed, + PassName, Fn, DLoc, Msg) {} + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemarkMissed; + } + + /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled. + virtual bool isEnabled() const override; +}; + +/// Diagnostic information for optimization analysis remarks. +class DiagnosticInfoOptimizationRemarkAnalysis + : public DiagnosticInfoOptimizationRemarkBase { +public: + /// \p PassName is the name of the pass emitting this diagnostic. If + /// this name matches the regular expression given in -Rpass-analysis=, then + /// the diagnostic will be emitted. \p Fn is the function where the diagnostic + /// is being emitted. \p DLoc is the location information to use in the + /// diagnostic. If line table information is available, the diagnostic will + /// include the source code location. \p Msg is the message to show. Note that + /// this class does not copy this message, so this reference must be valid for + /// the whole life time of the diagnostic. 
+ DiagnosticInfoOptimizationRemarkAnalysis(const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) + : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkAnalysis, + PassName, Fn, DLoc, Msg) {} + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemarkAnalysis; + } + + /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled. + virtual bool isEnabled() const override; +}; + // Create wrappers for C Binding types (see CBindingWrapping.h). DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DiagnosticInfo, LLVMDiagnosticInfoRef) +/// Emit an optimization-applied message. \p PassName is the name of the pass +/// emitting the message. If -Rpass= is given and \p PassName matches the +/// regular expression in -Rpass, then the remark will be emitted. \p Fn is +/// the function triggering the remark, \p DLoc is the debug location where +/// the diagnostic is generated. \p Msg is the message string to use. +void emitOptimizationRemark(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg); + +/// Emit an optimization-missed message. \p PassName is the name of the +/// pass emitting the message. If -Rpass-missed= is given and \p PassName +/// matches the regular expression in -Rpass, then the remark will be +/// emitted. \p Fn is the function triggering the remark, \p DLoc is the +/// debug location where the diagnostic is generated. \p Msg is the +/// message string to use. +void emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg); + +/// Emit an optimization analysis remark message. \p PassName is the name of +/// the pass emitting the message. If -Rpass-analysis= is given and \p +/// PassName matches the regular expression in -Rpass, then the remark will be +/// emitted. \p Fn is the function triggering the remark, \p DLoc is the debug +/// location where the diagnostic is generated. \p Msg is the message string +/// to use. +void emitOptimizationRemarkAnalysis(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg); + } // End namespace llvm #endif diff --git a/include/llvm/IR/GVMaterializer.h b/include/llvm/IR/GVMaterializer.h index dbe52bc2a329..4afdbb05f854 100644 --- a/include/llvm/IR/GVMaterializer.h +++ b/include/llvm/IR/GVMaterializer.h @@ -18,10 +18,9 @@ #ifndef LLVM_IR_GVMATERIALIZER_H #define LLVM_IR_GVMATERIALIZER_H -#include "llvm/Support/system_error.h" +#include namespace llvm { - class Function; class GlobalValue; class Module; @@ -43,7 +42,7 @@ class GVMaterializer { /// Make sure the given GlobalValue is fully read. /// - virtual error_code Materialize(GlobalValue *GV) = 0; + virtual std::error_code Materialize(GlobalValue *GV) = 0; /// If the given GlobalValue is read in, and if the GVMaterializer supports /// it, release the memory for the GV, and set it up to be materialized @@ -54,7 +53,7 @@ class GVMaterializer { /// Make sure the entire Module has been completely read. 
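A minimal usage sketch for the emission helpers declared above (illustrative only; it assumes a pass that has a Function &F and a DebugLoc DLoc taken from the instruction being transformed, and "my-pass" is a made-up pass name):

    // Shown when -pass-remarks (or clang's -Rpass=) selects "my-pass".
    emitOptimizationRemark(F.getContext(), "my-pass", F, DLoc,
                           "transformation applied");
    // Shown when -pass-remarks-missed (or -Rpass-missed=) selects "my-pass".
    emitOptimizationRemarkMissed(F.getContext(), "my-pass", F, DLoc,
                                 "transformation not applied");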
/// - virtual error_code MaterializeModule(Module *M) = 0; + virtual std::error_code MaterializeModule(Module *M) = 0; }; } // End llvm namespace diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h index d9f0b4a89b83..a77d1630f428 100644 --- a/include/llvm/IR/GlobalAlias.h +++ b/include/llvm/IR/GlobalAlias.h @@ -34,7 +34,7 @@ class GlobalAlias : public GlobalValue, public ilist_node { void setParent(Module *parent); GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, - const Twine &Name, GlobalObject *Aliasee, Module *Parent); + const Twine &Name, Constant *Aliasee, Module *Parent); public: // allocate space for exactly one operand @@ -46,7 +46,7 @@ class GlobalAlias : public GlobalValue, public ilist_node { /// the end of the specified module's alias list. static GlobalAlias *create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, - GlobalObject *Aliasee, Module *Parent); + Constant *Aliasee, Module *Parent); // Without the Aliasee. static GlobalAlias *create(Type *Ty, unsigned AddressSpace, @@ -56,14 +56,14 @@ class GlobalAlias : public GlobalValue, public ilist_node { // The module is taken from the Aliasee. static GlobalAlias *create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, - GlobalObject *Aliasee); + GlobalValue *Aliasee); // Type, Parent and AddressSpace taken from the Aliasee. static GlobalAlias *create(LinkageTypes Linkage, const Twine &Name, - GlobalObject *Aliasee); + GlobalValue *Aliasee); // Linkage, Type, Parent and AddressSpace taken from the Aliasee. - static GlobalAlias *create(const Twine &Name, GlobalObject *Aliasee); + static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee); /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); @@ -78,14 +78,13 @@ class GlobalAlias : public GlobalValue, public ilist_node { /// void eraseFromParent() override; - /// set/getAliasee - These methods retrive and set alias target. - void setAliasee(GlobalObject *GO); - const GlobalObject *getAliasee() const { + /// These methods retrieve and set the alias target.
+ void setAliasee(Constant *Aliasee); + const Constant *getAliasee() const { return const_cast(this)->getAliasee(); } - - GlobalObject *getAliasee() { - return cast_or_null(getOperand(0)); + Constant *getAliasee() { + return getOperand(0); } static bool isValidLinkage(LinkageTypes L) { diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h index 3bc8b8531dd6..74cc18eeb1e7 100644 --- a/include/llvm/IR/GlobalObject.h +++ b/include/llvm/IR/GlobalObject.h @@ -40,8 +40,8 @@ class GlobalObject : public GlobalValue { } void setAlignment(unsigned Align); - bool hasSection() const { return !getSection().empty(); } - const std::string &getSection() const { return Section; } + bool hasSection() const { return !StringRef(getSection()).empty(); } + const char *getSection() const { return Section.c_str(); } void setSection(StringRef S); void copyAttributesFrom(const GlobalValue *Src) override; diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 0ed302cdb4b8..5e99886a5fb2 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -63,7 +63,8 @@ class GlobalValue : public Constant { LinkageTypes Linkage, const Twine &Name) : Constant(Ty, VTy, Ops, NumOps), Linkage(Linkage), Visibility(DefaultVisibility), UnnamedAddr(0), - DllStorageClass(DefaultStorageClass), Parent(nullptr) { + DllStorageClass(DefaultStorageClass), + ThreadLocal(NotThreadLocal), Parent(nullptr) { setName(Name); } @@ -74,21 +75,32 @@ class GlobalValue : public Constant { unsigned UnnamedAddr : 1; // This value's address is not significant unsigned DllStorageClass : 2; // DLL storage class + unsigned ThreadLocal : 3; // Is this symbol "Thread Local", if so, what is + // the desired model? + private: // Give subclasses access to what otherwise would be wasted padding. - // (22 + 2 + 1 + 2 + 5) == 32. - unsigned SubClassData : 22; + // (19 + 3 + 2 + 1 + 2 + 5) == 32. + unsigned SubClassData : 19; protected: unsigned getGlobalValueSubClassData() const { return SubClassData; } void setGlobalValueSubClassData(unsigned V) { - assert(V < (1 << 22) && "It will not fit"); + assert(V < (1 << 19) && "It will not fit"); SubClassData = V; } Module *Parent; // The containing module. public: + enum ThreadLocalMode { + NotThreadLocal = 0, + GeneralDynamicTLSModel, + LocalDynamicTLSModel, + InitialExecTLSModel, + LocalExecTLSModel + }; + ~GlobalValue() { removeDeadConstantUsers(); // remove any dead constants using this. } @@ -110,6 +122,19 @@ class GlobalValue : public Constant { Visibility = V; } + /// If the value is "Thread Local", its value isn't shared by the threads. + bool isThreadLocal() const { return getThreadLocalMode() != NotThreadLocal; } + void setThreadLocal(bool Val) { + setThreadLocalMode(Val ? 
GeneralDynamicTLSModel : NotThreadLocal); + } + void setThreadLocalMode(ThreadLocalMode Val) { + assert(Val == NotThreadLocal || getValueID() != Value::FunctionVal); + ThreadLocal = Val; + } + ThreadLocalMode getThreadLocalMode() const { + return static_cast(ThreadLocal); + } + DLLStorageClassTypes getDLLStorageClass() const { return DLLStorageClassTypes(DllStorageClass); } @@ -121,8 +146,14 @@ class GlobalValue : public Constant { } void setDLLStorageClass(DLLStorageClassTypes C) { DllStorageClass = C; } - bool hasSection() const { return !getSection().empty(); } - const std::string &getSection() const; + bool hasSection() const { return !StringRef(getSection()).empty(); } + // It is unfortunate that we have to use "char *" in here since this is + // always non NULL, but: + // * The C API expects a null terminated string, so we cannot use StringRef. + // * The C API expects us to own it, so we cannot use a std::string. + // * For GlobalAliases we can fail to find the section and we have to + // return "", so we cannot use a "const std::string &". + const char *getSection() const; /// Global values are always pointers. inline PointerType *getType() const { @@ -222,8 +253,8 @@ class GlobalValue : public Constant { bool hasCommonLinkage() const { return isCommonLinkage(Linkage); } void setLinkage(LinkageTypes LT) { - assert((!isLocalLinkage(LT) || hasDefaultVisibility()) && - "local linkage requires default visibility"); + if (isLocalLinkage(LT)) + Visibility = DefaultVisibility; Linkage = LT; } LinkageTypes getLinkage() const { return Linkage; } diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h index 8cd4332b1ad8..4189ccb90a54 100644 --- a/include/llvm/IR/GlobalVariable.h +++ b/include/llvm/IR/GlobalVariable.h @@ -41,9 +41,6 @@ class GlobalVariable : public GlobalObject, public ilist_node { void setParent(Module *parent); bool isConstantGlobal : 1; // Is this a global constant? - unsigned threadLocalMode : 3; // Is this symbol "Thread Local", - // if so, what is the desired - // model? bool isExternallyInitializedConstant : 1; // Is this a global whose value // can change from its initial // value before global @@ -55,14 +52,6 @@ class GlobalVariable : public GlobalObject, public ilist_node { return User::operator new(s, 1); } - enum ThreadLocalMode { - NotThreadLocal = 0, - GeneralDynamicTLSModel, - LocalDynamicTLSModel, - InitialExecTLSModel, - LocalExecTLSModel - }; - /// GlobalVariable ctor - If a parent module is specified, the global is /// automatically inserted into the end of the specified modules global list. GlobalVariable(Type *Ty, bool isConstant, LinkageTypes Linkage, @@ -155,16 +144,6 @@ class GlobalVariable : public GlobalObject, public ilist_node { bool isConstant() const { return isConstantGlobal; } void setConstant(bool Val) { isConstantGlobal = Val; } - /// If the value is "Thread Local", its value isn't shared by the threads. - bool isThreadLocal() const { return threadLocalMode != NotThreadLocal; } - void setThreadLocal(bool Val) { - threadLocalMode = Val ?
GeneralDynamicTLSModel : NotThreadLocal; - } - void setThreadLocalMode(ThreadLocalMode Val) { threadLocalMode = Val; } - ThreadLocalMode getThreadLocalMode() const { - return static_cast(threadLocalMode); - } - bool isExternallyInitialized() const { return isExternallyInitializedConstant; } diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index 580d33368337..cedb87cdb7cc 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -1464,6 +1464,30 @@ class IRBuilder : public IRBuilderBase, public Inserter { Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32Ty, NumElts)); return CreateShuffleVector(V, Undef, Zeros, Name + ".splat"); } + + /// \brief Return a value that has been extracted from a larger integer type. + Value *CreateExtractInteger(const DataLayout &DL, Value *From, + IntegerType *ExtractedTy, uint64_t Offset, + const Twine &Name) { + IntegerType *IntTy = cast(From->getType()); + assert(DL.getTypeStoreSize(ExtractedTy) + Offset <= + DL.getTypeStoreSize(IntTy) && + "Element extends past full value"); + uint64_t ShAmt = 8 * Offset; + Value *V = From; + if (DL.isBigEndian()) + ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - + DL.getTypeStoreSize(ExtractedTy) - Offset); + if (ShAmt) { + V = CreateLShr(V, ShAmt, Name + ".shift"); + } + assert(ExtractedTy->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (ExtractedTy != IntTy) { + V = CreateTrunc(V, ExtractedTy, Name + ".trunc"); + } + return V; + } }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 7d338a68f546..e0c829ac985a 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -500,6 +500,16 @@ class AtomicCmpXchgInst : public Instruction { (unsigned)V); } + /// Return true if this cmpxchg may spuriously fail. + bool isWeak() const { + return getSubclassDataFromInstruction() & 0x100; + } + + void setWeak(bool IsWeak) { + setInstructionSubclassData((getSubclassDataFromInstruction() & ~0x100) | + (IsWeak << 8)); + } + /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -2311,12 +2321,14 @@ class LandingPadInst : public Instruction { (V ? 1 : 0)); } - /// addClause - Add a catch or filter clause to the landing pad. - void addClause(Value *ClauseVal); + /// Add a catch or filter clause to the landing pad. + void addClause(Constant *ClauseVal); - /// getClause - Get the value of the clause at index Idx. Use isCatch/isFilter - /// to determine what type of clause this is. - Value *getClause(unsigned Idx) const { return OperandList[Idx + 1]; } + /// Get the value of the clause at index Idx. Use isCatch/isFilter to + /// determine what type of clause this is. + Constant *getClause(unsigned Idx) const { + return cast(OperandList[Idx + 1]); + } /// isCatch - Return 'true' if the clause and index Idx is a catch clause. 
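To make the semantics of CreateExtractInteger above concrete, a short hypothetical sketch (assuming an IRBuilder<> B, a DataLayout DL and a Value *Wide of type i64; Offset is in bytes from the least-significant end, with the big-endian shift handled inside the helper):

    // On a little-endian target this extracts bits 16..31 of the i64 as an i16
    // (an lshr by 16 followed by a trunc).
    Value *Mid = B.CreateExtractInteger(DL, Wide, B.getInt16Ty(), /*Offset=*/2,
                                        "mid");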
bool isCatch(unsigned Idx) const { diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 2ba230a09789..edd1621ef25d 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -533,7 +533,6 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], include "llvm/IR/IntrinsicsPowerPC.td" include "llvm/IR/IntrinsicsX86.td" include "llvm/IR/IntrinsicsARM.td" -include "llvm/IR/IntrinsicsARM64.td" include "llvm/IR/IntrinsicsAArch64.td" include "llvm/IR/IntrinsicsXCore.td" include "llvm/IR/IntrinsicsHexagon.td" diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td index 61c0e5d419f4..e3c0fb359901 100644 --- a/include/llvm/IR/IntrinsicsAArch64.td +++ b/include/llvm/IR/IntrinsicsAArch64.td @@ -1,4 +1,4 @@ -//===- IntrinsicsAArch64.td - Defines AArch64 intrinsics -----------*- tablegen -*-===// +//===- IntrinsicsAARCH64.td - Defines AARCH64 intrinsics ---*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,401 +7,637 @@ // //===----------------------------------------------------------------------===// // -// This file defines all of the AArch64-specific intrinsics. +// This file defines all of the AARCH64-specific intrinsics. // //===----------------------------------------------------------------------===// +let TargetPrefix = "aarch64" in { + +def int_aarch64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +def int_aarch64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +def int_aarch64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; +def int_aarch64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; + +def int_aarch64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; +def int_aarch64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; +def int_aarch64_stxp : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; +def int_aarch64_stlxp : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; + +def int_aarch64_clrex : Intrinsic<[]>; + +def int_aarch64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; +def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; + +//===----------------------------------------------------------------------===// +// RBIT + +def int_aarch64_rbit : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], + [IntrNoMem]>; + +} + //===----------------------------------------------------------------------===// // Advanced SIMD (NEON) let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+ class AdvSIMD_2Scalar_Float_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_FPToIntRounding_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + + class AdvSIMD_1IntArg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1FloatArg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Expand_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; + class AdvSIMD_1IntArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Int_Across_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Float_Across_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; + + class AdvSIMD_2IntArg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2FloatArg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Compare_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; + class AdvSIMD_2Arg_FloatCompare_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>, LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMExtendedType<0>, LLVMExtendedType<0>], + [IntrNoMem]>; + class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMExtendedType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_3VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_3VectorArg_Scalar_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, + LLVMMatchType<1>], [IntrNoMem]>; + class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + 
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_CvtFxToFP_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_CvtFPToFx_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], + [IntrNoMem]>; +} -// Vector Absolute Compare (Floating Point) -def int_aarch64_neon_vacgeq : - Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; -def int_aarch64_neon_vacgtq : - Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; - -// Vector saturating accumulate -def int_aarch64_neon_suqadd : Neon_2Arg_Intrinsic; -def int_aarch64_neon_usqadd : Neon_2Arg_Intrinsic; - -// Vector Bitwise reverse -def int_aarch64_neon_rbit : Neon_1Arg_Intrinsic; - -// Vector extract and narrow -def int_aarch64_neon_xtn : - Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Vector floating-point convert -def int_aarch64_neon_frintn : Neon_1Arg_Intrinsic; -def int_aarch64_neon_fsqrt : Neon_1Arg_Intrinsic; -def int_aarch64_neon_vcvtxn : - Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; -def int_aarch64_neon_vcvtzs : - Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; -def int_aarch64_neon_vcvtzu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Vector maxNum (Floating Point) -def int_aarch64_neon_vmaxnm : Neon_2Arg_Intrinsic; - -// Vector minNum (Floating Point) -def int_aarch64_neon_vminnm : Neon_2Arg_Intrinsic; - -// Vector Pairwise maxNum (Floating Point) -def int_aarch64_neon_vpmaxnm : Neon_2Arg_Intrinsic; - -// Vector Pairwise minNum (Floating Point) -def int_aarch64_neon_vpminnm : Neon_2Arg_Intrinsic; - -// Vector Multiply Extended and Scalar Multiply Extended (Floating Point) -def int_aarch64_neon_vmulx : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; - -class Neon_N2V_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem]>; -class Neon_N3V_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem]>; -class Neon_N2V_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMExtendedType<0>, llvm_i32_ty], - [IntrNoMem]>; - -// Vector rounding shift right by immediate (Signed) -def int_aarch64_neon_vsrshr : Neon_N2V_Intrinsic; -def int_aarch64_neon_vurshr : Neon_N2V_Intrinsic; -def int_aarch64_neon_vsqshlu : Neon_N2V_Intrinsic; - -def int_aarch64_neon_vsri : Neon_N3V_Intrinsic; -def int_aarch64_neon_vsli : Neon_N3V_Intrinsic; - -def int_aarch64_neon_vsqshrun : Neon_N2V_Narrow_Intrinsic; -def int_aarch64_neon_vrshrn : Neon_N2V_Narrow_Intrinsic; -def int_aarch64_neon_vsqrshrun : Neon_N2V_Narrow_Intrinsic; -def int_aarch64_neon_vsqshrn : Neon_N2V_Narrow_Intrinsic; -def int_aarch64_neon_vuqshrn : Neon_N2V_Narrow_Intrinsic; -def int_aarch64_neon_vsqrshrn : Neon_N2V_Narrow_Intrinsic; -def int_aarch64_neon_vuqrshrn : Neon_N2V_Narrow_Intrinsic; - -// Vector across -class Neon_Across_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -def int_aarch64_neon_saddlv : Neon_Across_Intrinsic; -def int_aarch64_neon_uaddlv : Neon_Across_Intrinsic; -def int_aarch64_neon_smaxv : Neon_Across_Intrinsic; -def int_aarch64_neon_umaxv : Neon_Across_Intrinsic; -def int_aarch64_neon_sminv : Neon_Across_Intrinsic; -def int_aarch64_neon_uminv : Neon_Across_Intrinsic; -def int_aarch64_neon_vaddv : Neon_Across_Intrinsic; -def int_aarch64_neon_vmaxv : - 
Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_aarch64_neon_vminv : - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_aarch64_neon_vmaxnmv : - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>; -def int_aarch64_neon_vminnmv : - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>; - -// Vector Table Lookup. -def int_aarch64_neon_vtbl1 : - Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - -def int_aarch64_neon_vtbl2 : - Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - -def int_aarch64_neon_vtbl3 : - Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], [IntrNoMem]>; - -def int_aarch64_neon_vtbl4 : - Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - -// Vector Table Extension. -// Some elements of the destination vector may not be updated, so the original -// value of that vector is passed as the first argument. The next 1-4 -// arguments after that are the table. -def int_aarch64_neon_vtbx1 : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - -def int_aarch64_neon_vtbx2 : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], [IntrNoMem]>; - -def int_aarch64_neon_vtbx3 : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - -def int_aarch64_neon_vtbx4 : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - -// Vector Load/store -def int_aarch64_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], - [llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; -def int_aarch64_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>], - [llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; -def int_aarch64_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; - -def int_aarch64_neon_vst1x2 : Intrinsic<[], - [llvm_ptr_ty, llvm_anyvector_ty, - LLVMMatchType<0>, llvm_i32_ty], - [IntrReadWriteArgMem]>; -def int_aarch64_neon_vst1x3 : Intrinsic<[], - [llvm_ptr_ty, llvm_anyvector_ty, - LLVMMatchType<0>, LLVMMatchType<0>, - llvm_i32_ty], [IntrReadWriteArgMem]>; -def int_aarch64_neon_vst1x4 : Intrinsic<[], - [llvm_ptr_ty, llvm_anyvector_ty, - LLVMMatchType<0>, LLVMMatchType<0>, - LLVMMatchType<0>, llvm_i32_ty], - [IntrReadWriteArgMem]>; - -// Scalar Add -def int_aarch64_neon_vaddds : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; -def int_aarch64_neon_vadddu : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; - - -// Scalar Sub -def int_aarch64_neon_vsubds : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; -def int_aarch64_neon_vsubdu : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; - - -// Scalar Shift -// Scalar Shift Left -def int_aarch64_neon_vshlds : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; -def int_aarch64_neon_vshldu : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; - -// Scalar Saturating Shift Left -def int_aarch64_neon_vqshls : Neon_2Arg_Intrinsic; -def 
int_aarch64_neon_vqshlu : Neon_2Arg_Intrinsic; - -// Scalar Shift Rouding Left -def int_aarch64_neon_vrshlds : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; -def int_aarch64_neon_vrshldu : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; - -// Scalar Saturating Rounding Shift Left -def int_aarch64_neon_vqrshls : Neon_2Arg_Intrinsic; -def int_aarch64_neon_vqrshlu : Neon_2Arg_Intrinsic; - -// Scalar Reduce Pairwise Add. -def int_aarch64_neon_vpadd : - Intrinsic<[llvm_v1i64_ty], [llvm_v2i64_ty],[IntrNoMem]>; -def int_aarch64_neon_vpfadd : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Reduce Pairwise Floating Point Max/Min. -def int_aarch64_neon_vpmax : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; -def int_aarch64_neon_vpmin : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Reduce Pairwise Floating Point Maxnm/Minnm. -def int_aarch64_neon_vpfmaxnm : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; -def int_aarch64_neon_vpfminnm : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Signed Integer Convert To Floating-point -def int_aarch64_neon_vcvtint2fps : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Unsigned Integer Convert To Floating-point -def int_aarch64_neon_vcvtint2fpu : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Floating-point Convert -def int_aarch64_neon_fcvtxn : - Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtns : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtnu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtps : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtpu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtms : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtmu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtas : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtau : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtzs : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtzu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Estimate. -def int_aarch64_neon_vrecpe : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Exponent -def int_aarch64_neon_vrecpx : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Square Root Estimate -def int_aarch64_neon_vrsqrte : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Step -def int_aarch64_neon_vrecps : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Square Root Step -def int_aarch64_neon_vrsqrts : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - -// Compare with vector operands. 
-class Neon_Cmp_Intrinsic : - Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyvector_ty], - [IntrNoMem]>; - -// Floating-point compare with scalar operands. -class Neon_Float_Cmp_Intrinsic : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty, llvm_anyfloat_ty], - [IntrNoMem]>; - -// Scalar Compare Equal -def int_aarch64_neon_vceq : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fceq : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Greater-Than or Equal -def int_aarch64_neon_vcge : Neon_Cmp_Intrinsic; -def int_aarch64_neon_vchs : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcge : Neon_Float_Cmp_Intrinsic; -def int_aarch64_neon_fchs : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Less-Than or Equal -def int_aarch64_neon_vclez : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fclez : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Less-Than -def int_aarch64_neon_vcltz : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcltz : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Greater-Than -def int_aarch64_neon_vcgt : Neon_Cmp_Intrinsic; -def int_aarch64_neon_vchi : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcgt : Neon_Float_Cmp_Intrinsic; -def int_aarch64_neon_fchi : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Bitwise Test Bits -def int_aarch64_neon_vtstd : Neon_Cmp_Intrinsic; - -// Scalar Floating-point Absolute Compare Greater Than Or Equal -def int_aarch64_neon_vcage : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcage : Neon_Float_Cmp_Intrinsic; - -// Scalar Floating-point Absolute Compare Greater Than -def int_aarch64_neon_vcagt : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcagt : Neon_Float_Cmp_Intrinsic; - -// Scalar Signed Saturating Accumulated of Unsigned Value -def int_aarch64_neon_vuqadd : Neon_2Arg_Intrinsic; - -// Scalar Unsigned Saturating Accumulated of Signed Value -def int_aarch64_neon_vsqadd : Neon_2Arg_Intrinsic; - -// Scalar Absolute Value -def int_aarch64_neon_vabs : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty], [IntrNoMem]>; - -// Scalar Absolute Difference -def int_aarch64_neon_vabd : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - -// Scalar Negate Value -def int_aarch64_neon_vneg : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty], [IntrNoMem]>; - -// Signed Saturating Doubling Multiply-Add Long -def int_aarch64_neon_vqdmlal : Neon_3Arg_Long_Intrinsic; - -// Signed Saturating Doubling Multiply-Subtract Long -def int_aarch64_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic; - -def int_aarch64_neon_vmull_p64 : - Intrinsic<[llvm_v16i8_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; +// Arithmetic ops -class Neon_2Arg_ShiftImm_Intrinsic - : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>; +let Properties = [IntrNoMem] in { + // Vector Add Across Lanes + def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Long Add Across Lanes + def int_aarch64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + + // Vector Halving Add + def int_aarch64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Rounding Halving Add + def int_aarch64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Add + def int_aarch64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; + 
def int_aarch64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; + + // Vector Add High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. + def int_aarch64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Add High-Half + def int_aarch64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Saturating Doubling Multiply High + def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Doubling Multiply High + def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Polynominal Multiply + def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Long Multiply + def int_aarch64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; + + // 64-bit polynomial multiply really returns an i128, which is not legal. Fake + // it with a v16i8. + def int_aarch64_neon_pmull64 : + Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; + + // Vector Extending Multiply + def int_aarch64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { + let Properties = [IntrNoMem, Commutative]; + } + + // Vector Saturating Doubling Long Multiply + def int_aarch64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_sqdmulls_scalar + : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Vector Halving Subtract + def int_aarch64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Subtract + def int_aarch64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; + + // Vector Subtract High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. 
+ def int_aarch64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Subtract High-Half + def int_aarch64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Compare Absolute Greater-than-or-equal + def int_aarch64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Compare Absolute Greater-than + def int_aarch64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Absolute Difference + def int_aarch64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; + + // Scalar Absolute Difference + def int_aarch64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; + + // Vector Max + def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Max Across Lanes + def int_aarch64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Min + def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Min/Max Number + def int_aarch64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; + + // Vector Min Across Lanes + def int_aarch64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Pairwise Add + def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; + + // Long Pairwise Add + // FIXME: In theory, we shouldn't need intrinsics for saddlp or + // uaddlp, but tblgen's type inference currently can't handle the + // pattern fragments this ends up generating. 
+ def int_aarch64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; + def int_aarch64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; -class Neon_3Arg_ShiftImm_Intrinsic - : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty, llvm_i32_ty], - [IntrNoMem]>; + // Folding Maximum + def int_aarch64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; + + // Folding Minimum + def int_aarch64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; + + // Reciprocal Estimate/Step + def int_aarch64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; + + // Reciprocal Exponent + def int_aarch64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Saturating Shift Left + def int_aarch64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Rounding Shift Left + def int_aarch64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Shift Left + def int_aarch64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Shift Left by Constant + def int_aarch64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const + def int_aarch64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Narrowing Shift Right by Constant + def int_aarch64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Shift Right by Constant + def int_aarch64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Shift Left + def int_aarch64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Widening Shift Left by Constant + def int_aarch64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; + def int_aarch64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + def int_aarch64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + + // Vector Shift Right by Constant and Insert + def int_aarch64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Shift Left by Constant and Insert + def int_aarch64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Saturating Narrow + def int_aarch64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + def int_aarch64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Saturating Extract and Unsigned Narrow + def int_aarch64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Absolute Value + def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; 
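(Editorial aside, not part of the patch: the defs above are overloaded intrinsics, so a front end or pass instantiates them at a concrete vector type when it emits a call. Below is a minimal C++ sketch using the post-rename enum that TableGen generates from this file, Intrinsic::aarch64_neon_abs; getDeclaration/CreateCall signatures drift slightly between LLVM releases, so treat it as illustrative rather than exact.)

// Sketch only: emit a call to llvm.aarch64.neon.abs.v4i32 via the C++ API.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("neon_demo", Ctx);

  Type *I32 = Type::getInt32Ty(Ctx);
  Type *V4I32 = VectorType::get(I32, 4);   // <4 x i32>

  // define <4 x i32> @demo(<4 x i32> %x)
  FunctionType *FTy = FunctionType::get(V4I32, {V4I32}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "demo", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // Resolves the AdvSIMD_1IntArg_Intrinsic overload at <4 x i32>, i.e.
  // declares <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>).
  Function *Abs =
      Intrinsic::getDeclaration(&M, Intrinsic::aarch64_neon_abs, {V4I32});

  Value *Arg = &*F->arg_begin();
  B.CreateRet(B.CreateCall(Abs, {Arg}));

  verifyModule(M, &errs());
  M.print(outs(), nullptr);
  return 0;
}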
+ + // Vector Saturating Absolute Value + def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; + + // Vector Saturating Negation + def int_aarch64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; + + // Vector Count Leading Sign Bits + def int_aarch64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Reciprocal Estimate + def int_aarch64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Square Root Estimate + def int_aarch64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Bitwise Reverse + def int_aarch64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Conversions Between Half-Precision and Single-Precision. + def int_aarch64_neon_vcvtfp2hf + : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_aarch64_neon_vcvthf2fp + : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; -// Scalar Shift Right (Immediate) -def int_aarch64_neon_vshrds_n : Neon_2Arg_ShiftImm_Intrinsic; -def int_aarch64_neon_vshrdu_n : Neon_2Arg_ShiftImm_Intrinsic; + // Vector Conversions Between Floating-point and Fixed-point. + def int_aarch64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; + def int_aarch64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; -// Scalar Shift Right and Accumulate (Immediate) -def int_aarch64_neon_vsrads_n : Neon_3Arg_ShiftImm_Intrinsic; -def int_aarch64_neon_vsradu_n : Neon_3Arg_ShiftImm_Intrinsic; + // Vector FP->Int Conversions + def int_aarch64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; -// Scalar Rounding Shift Right and Accumulate (Immediate) -def int_aarch64_neon_vrsrads_n : Neon_3Arg_ShiftImm_Intrinsic; -def int_aarch64_neon_vrsradu_n : Neon_3Arg_ShiftImm_Intrinsic; + // Vector FP Rounding: only ties to even is unrepresented by a normal + // intrinsic. + def int_aarch64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; -// Scalar Shift Left (Immediate) -def int_aarch64_neon_vshld_n : Neon_2Arg_ShiftImm_Intrinsic; + // Scalar FP->Int conversions -// Scalar Saturating Shift Left (Immediate) -def int_aarch64_neon_vqshls_n : Neon_N2V_Intrinsic; -def int_aarch64_neon_vqshlu_n : Neon_N2V_Intrinsic; + // Vector FP Inexact Narrowing + def int_aarch64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; + + // Scalar FP Inexact Narrowing + def int_aarch64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], + [IntrNoMem]>; +} -// Scalar Signed Saturating Shift Left Unsigned (Immediate) -def int_aarch64_neon_vqshlus_n : Neon_N2V_Intrinsic; +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+ class AdvSIMD_2Vector2Index_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], + [IntrNoMem]>; +} -// Scalar Signed Fixed-point Convert To Floating-Point (Immediate) -def int_aarch64_neon_vcvtfxs2fp_n : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty, llvm_i32_ty], [IntrNoMem]>; +// Vector element to element moves +def int_aarch64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; -// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) -def int_aarch64_neon_vcvtfxu2fp_n : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty, llvm_i32_ty], [IntrNoMem]>; +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_1Vec_Load_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadArgMem]>; + class AdvSIMD_1Vec_Store_Lane_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty], + [IntrReadWriteArgMem, NoCapture<2>]>; + + class AdvSIMD_2Vec_Load_Intrinsic + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadArgMem]>; + class AdvSIMD_2Vec_Load_Lane_Intrinsic + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i64_ty, llvm_anyptr_ty], + [IntrReadArgMem]>; + class AdvSIMD_2Vec_Store_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, + LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadWriteArgMem, NoCapture<2>]>; + class AdvSIMD_2Vec_Store_Lane_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, + llvm_i64_ty, llvm_anyptr_ty], + [IntrReadWriteArgMem, NoCapture<3>]>; + + class AdvSIMD_3Vec_Load_Intrinsic + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadArgMem]>; + class AdvSIMD_3Vec_Load_Lane_Intrinsic + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i64_ty, llvm_anyptr_ty], + [IntrReadArgMem]>; + class AdvSIMD_3Vec_Store_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadWriteArgMem, NoCapture<3>]>; + class AdvSIMD_3Vec_Store_Lane_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, + LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i64_ty, llvm_anyptr_ty], + [IntrReadWriteArgMem, NoCapture<4>]>; + + class AdvSIMD_4Vec_Load_Intrinsic + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadArgMem]>; + class AdvSIMD_4Vec_Load_Lane_Intrinsic + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i64_ty, llvm_anyptr_ty], + [IntrReadArgMem]>; + class AdvSIMD_4Vec_Store_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMAnyPointerType<LLVMMatchType<0>>], + [IntrReadWriteArgMem, NoCapture<4>]>; + class AdvSIMD_4Vec_Store_Lane_Intrinsic + : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i64_ty, llvm_anyptr_ty], + [IntrReadWriteArgMem, NoCapture<5>]>; +} -// Scalar Floating-point Convert To Signed Fixed-point (Immediate) -def int_aarch64_neon_vcvtfp2fxs_n : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; +// Memory ops -// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) -def int_aarch64_neon_vcvtfp2fxu_n : -
Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; +def int_aarch64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic; +def int_aarch64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic; +def int_aarch64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic; +def int_aarch64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic; +def int_aarch64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic; +def int_aarch64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic; + +def int_aarch64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic; +def int_aarch64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic; +def int_aarch64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic; + +def int_aarch64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic; +def int_aarch64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic; +def int_aarch64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic; + +def int_aarch64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic; +def int_aarch64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic; +def int_aarch64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic; + +def int_aarch64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic; +def int_aarch64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic; +def int_aarch64_neon_st4 : AdvSIMD_4Vec_Store_Intrinsic; + +def int_aarch64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_Tbl1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_Tbl3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_Tbx1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; +} +def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; +def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; +def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; +def int_aarch64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; + +def int_aarch64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; +def int_aarch64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; +def int_aarch64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; +def int_aarch64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; + +let TargetPrefix = "aarch64" in { + class Crypto_AES_DataKey_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + + class Crypto_AES_Data_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). 
+ class Crypto_SHA_5Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). + class Crypto_SHA_1Hash_Intrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the schedule + class Crypto_SHA_8Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 12 words of the schedule + class Crypto_SHA_12Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the hash and 4 of the schedule. + class Crypto_SHA_8Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; +} + +// AES +def int_aarch64_crypto_aese : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesmc : Crypto_AES_Data_Intrinsic; +def int_aarch64_crypto_aesimc : Crypto_AES_Data_Intrinsic; + +// SHA1 +def int_aarch64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; + +def int_aarch64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; +def int_aarch64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; + +// SHA256 +def int_aarch64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; +def int_aarch64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; + +//===----------------------------------------------------------------------===// +// CRC32 + +let TargetPrefix = "aarch64" in { + +def int_aarch64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; +def int_aarch64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; } diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td index b8ba9291a7af..145eeedc22b4 100644 --- a/include/llvm/IR/IntrinsicsARM.td +++ b/include/llvm/IR/IntrinsicsARM.td @@ -125,6 +125,16 @@ def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>; +//===----------------------------------------------------------------------===// +// RBIT + +def int_arm_rbit : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + +//===----------------------------------------------------------------------===// +// UND (reserved undefined sequence) + +def int_arm_undefined : Intrinsic<[], [llvm_i32_ty]>; + //===----------------------------------------------------------------------===// // Advanced SIMD (NEON) diff 
--git a/include/llvm/IR/IntrinsicsARM64.td b/include/llvm/IR/IntrinsicsARM64.td deleted file mode 100644 index 146ea5d970cf..000000000000 --- a/include/llvm/IR/IntrinsicsARM64.td +++ /dev/null @@ -1,636 +0,0 @@ -//===- IntrinsicsARM64.td - Defines ARM64 intrinsics -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the ARM64-specific intrinsics. -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "arm64" in { - -def int_arm64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; -def int_arm64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; - -def int_arm64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_stxp : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; -def int_arm64_stlxp : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; - -def int_arm64_clrex : Intrinsic<[]>; - -def int_arm64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -def int_arm64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". 
- class AdvSIMD_2Scalar_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_FPToIntRounding_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - - class AdvSIMD_1IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Expand_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; - class AdvSIMD_1IntArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Int_Across_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Float_Across_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - - class AdvSIMD_2IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Compare_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2Arg_FloatCompare_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMExtendedType<0>, LLVMExtendedType<0>], - [IntrNoMem]>; - class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], - [LLVMExtendedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], - [IntrNoMem]>; - - class AdvSIMD_3VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, - LLVMMatchType<1>], [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - 
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFxToFP_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFPToFx_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem]>; -} - -// Arithmetic ops - -let Properties = [IntrNoMem] in { - // Vector Add Across Lanes - def int_arm64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Long Add Across Lanes - def int_arm64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - - // Vector Halving Add - def int_arm64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Rounding Halving Add - def int_arm64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Add - def int_arm64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; - - // Vector Add High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. - def int_arm64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Add High-Half - def int_arm64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Saturating Doubling Multiply High - def int_arm64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Doubling Multiply High - def int_arm64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Polynominal Multiply - def int_arm64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Long Multiply - def int_arm64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; - - // 64-bit polynomial multiply really returns an i128, which is not legal. Fake - // it with a v16i8. - def int_arm64_neon_pmull64 : - Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - - // Vector Extending Multiply - def int_arm64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { - let Properties = [IntrNoMem, Commutative]; - } - - // Vector Saturating Doubling Long Multiply - def int_arm64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_sqdmulls_scalar - : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Vector Halving Subtract - def int_arm64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Subtract - def int_arm64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; - - // Vector Subtract High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. 
- def int_arm64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Subtract High-Half - def int_arm64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Compare Absolute Greater-than-or-equal - def int_arm64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Compare Absolute Greater-than - def int_arm64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Absolute Difference - def int_arm64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; - - // Scalar Absolute Difference - def int_arm64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; - - // Vector Max - def int_arm64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Max Across Lanes - def int_arm64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Min - def int_arm64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Min/Max Number - def int_arm64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; - - // Vector Min Across Lanes - def int_arm64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Pairwise Add - def int_arm64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; - - // Long Pairwise Add - // FIXME: In theory, we shouldn't need intrinsics for saddlp or - // uaddlp, but tblgen's type inference currently can't handle the - // pattern fragments this ends up generating. 
- def int_arm64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - def int_arm64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Folding Maximum - def int_arm64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; - - // Folding Minimum - def int_arm64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; - - // Reciprocal Estimate/Step - def int_arm64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; - - // Reciprocal Exponent - def int_arm64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Saturating Shift Left - def int_arm64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Rounding Shift Left - def int_arm64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Shift Left - def int_arm64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Shift Left by Constant - def int_arm64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const - def int_arm64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Narrowing Shift Right by Constant - def int_arm64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Shift Right by Constant - def int_arm64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Shift Left - def int_arm64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Widening Shift Left by Constant - def int_arm64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; - def int_arm64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - def int_arm64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - - // Vector Shift Right by Constant and Insert - def int_arm64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Shift Left by Constant and Insert - def int_arm64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Saturating Narrow - def int_arm64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - def int_arm64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Saturating Extract and Unsigned Narrow - def int_arm64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Absolute Value - def int_arm64_neon_abs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Absolute Value - def int_arm64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Negation - def int_arm64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; - - // Vector 
Count Leading Sign Bits - def int_arm64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Reciprocal Estimate - def int_arm64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Square Root Estimate - def int_arm64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Bitwise Reverse - def int_arm64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Conversions Between Half-Precision and Single-Precision. - def int_arm64_neon_vcvtfp2hf - : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_arm64_neon_vcvthf2fp - : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; - - // Vector Conversions Between Floating-point and Fixed-point. - def int_arm64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; - def int_arm64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; - - // Vector FP->Int Conversions - def int_arm64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; - - // Vector FP Rounding: only ties to even is unrepresented by a normal - // intrinsic. - def int_arm64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; - - // Scalar FP->Int conversions - - // Vector FP Inexact Narrowing - def int_arm64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Scalar FP Inexact Narrowing - def int_arm64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], - [IntrNoMem]>; -} - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Vector2Index_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], - [IntrNoMem]>; -} - -// Vector element to element moves -def int_arm64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". 
- class AdvSIMD_1Vec_Load_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType>], - [IntrReadArgMem]>; - class AdvSIMD_1Vec_Store_Lane_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty], - [IntrReadWriteArgMem, NoCapture<2>]>; - - class AdvSIMD_2Vec_Load_Intrinsic - : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], - [LLVMAnyPointerType>], - [IntrReadArgMem]>; - class AdvSIMD_2Vec_Load_Lane_Intrinsic - : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], - [LLVMMatchType<0>, LLVMMatchType<0>, - llvm_i64_ty, llvm_anyptr_ty], - [IntrReadArgMem]>; - class AdvSIMD_2Vec_Store_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, - LLVMAnyPointerType>], - [IntrReadWriteArgMem, NoCapture<2>]>; - class AdvSIMD_2Vec_Store_Lane_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, - llvm_i64_ty, llvm_anyptr_ty], - [IntrReadWriteArgMem, NoCapture<3>]>; - - class AdvSIMD_3Vec_Load_Intrinsic - : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], - [LLVMAnyPointerType>], - [IntrReadArgMem]>; - class AdvSIMD_3Vec_Load_Lane_Intrinsic - : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, - llvm_i64_ty, llvm_anyptr_ty], - [IntrReadArgMem]>; - class AdvSIMD_3Vec_Store_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMAnyPointerType>], - [IntrReadWriteArgMem, NoCapture<3>]>; - class AdvSIMD_3Vec_Store_Lane_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, - LLVMMatchType<0>, LLVMMatchType<0>, - llvm_i64_ty, llvm_anyptr_ty], - [IntrReadWriteArgMem, NoCapture<4>]>; - - class AdvSIMD_4Vec_Load_Intrinsic - : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], - [LLVMAnyPointerType>], - [IntrReadArgMem]>; - class AdvSIMD_4Vec_Load_Lane_Intrinsic - : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], - [LLVMMatchType<0>, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>, - llvm_i64_ty, llvm_anyptr_ty], - [IntrReadArgMem]>; - class AdvSIMD_4Vec_Store_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>, - LLVMAnyPointerType>], - [IntrReadWriteArgMem, NoCapture<4>]>; - class AdvSIMD_4Vec_Store_Lane_Intrinsic - : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>, - llvm_i64_ty, llvm_anyptr_ty], - [IntrReadWriteArgMem, NoCapture<5>]>; -} - -// Memory ops - -def int_arm64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic; -def int_arm64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic; -def int_arm64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic; - -def int_arm64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic; -def int_arm64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic; -def int_arm64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic; - -def int_arm64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic; -def int_arm64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic; -def int_arm64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic; - -def int_arm64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic; -def int_arm64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic; -def int_arm64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic; - -def int_arm64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic; -def int_arm64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic; -def int_arm64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic; - -def int_arm64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic; -def int_arm64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic; -def int_arm64_neon_st4 : 
AdvSIMD_4Vec_Store_Intrinsic; - -def int_arm64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_Tbl1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_Tbl3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_Tbx1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; -} -def int_arm64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; -def int_arm64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; -def int_arm64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; -def int_arm64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; - -def int_arm64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; -def int_arm64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; -def int_arm64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; -def int_arm64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; - -let TargetPrefix = "arm64" in { - class Crypto_AES_DataKey_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - - class Crypto_AES_Data_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_5Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_1Hash_Intrinsic - : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the schedule - class Crypto_SHA_8Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 12 words of the schedule - class Crypto_SHA_12Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the hash and 4 of the schedule. 
- class Crypto_SHA_8Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; -} - -// AES -def int_arm64_crypto_aese : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesmc : Crypto_AES_Data_Intrinsic; -def int_arm64_crypto_aesimc : Crypto_AES_Data_Intrinsic; - -// SHA1 -def int_arm64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; - -def int_arm64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; -def int_arm64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; - -// SHA256 -def int_arm64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; -def int_arm64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; - -//===----------------------------------------------------------------------===// -// CRC32 - -let TargetPrefix = "arm64" in { - -def int_arm64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -def int_arm64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -} diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 36d93fee8a0c..7c81223b091f 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1304,15 +1304,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector load with broadcast let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_vbroadcast_ss : - GCCBuiltin<"__builtin_ia32_vbroadcastss">, - Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>; - def int_x86_avx_vbroadcast_sd_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastsd256">, - Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>; - def int_x86_avx_vbroadcast_ss_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastss256">, - Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>; def int_x86_avx_vbroadcastf128_pd_256 : GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">, Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>; @@ -1948,6 +1939,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
llvm_i32_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">, Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>; + def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">, + Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>; } //===----------------------------------------------------------------------===// @@ -3141,6 +3134,16 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], []>; + def int_x86_avx512_mask_lzcnt_d_512 : + GCCBuiltin<"__builtin_ia32_vplzcntd_512_mask">, + Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, + llvm_v16i32_ty, llvm_i16_ty], + []>; + def int_x86_avx512_mask_lzcnt_q_512 : + GCCBuiltin<"__builtin_ia32_vplzcntq_512_mask">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, + llvm_v8i64_ty, llvm_i8_ty], + []>; } // Vector blend diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index f9644aca6b47..4d940d599b9a 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -157,15 +157,6 @@ class LLVMContext { void emitError(const Instruction *I, const Twine &ErrorStr); void emitError(const Twine &ErrorStr); - /// emitOptimizationRemark - Emit an optimization remark message. \p PassName - /// is the name of the pass emitting the message. If -Rpass= is given - /// and \p PassName matches the regular expression in -Rpass, then the - /// remark will be emitted. \p Fn is the function triggering the remark, - /// \p DLoc is the debug location where the diagnostic is generated. - /// \p Msg is the message string to use. - void emitOptimizationRemark(const char *PassName, const Function &Fn, - const DebugLoc &DLoc, const Twine &Msg); - private: LLVMContext(LLVMContext&) LLVM_DELETED_FUNCTION; void operator=(LLVMContext&) LLVM_DELETED_FUNCTION; diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h index f4d29675ce02..f6065a4e21a6 100644 --- a/include/llvm/IR/LegacyPassManagers.h +++ b/include/llvm/IR/LegacyPassManagers.h @@ -441,7 +441,7 @@ class FPPassManager : public ModulePass, public PMDataManager { Pass *getAsPass() override { return this; } /// Pass Manager itself does not invalidate any analysis info. - void getAnalysisUsage(AnalysisUsage &Info) const override{ + void getAnalysisUsage(AnalysisUsage &Info) const override { Info.setPreservesAll(); } diff --git a/include/llvm/IR/LegacyPassNameParser.h b/include/llvm/IR/LegacyPassNameParser.h index b72fc4c1a11b..e2e4912ef64e 100644 --- a/include/llvm/IR/LegacyPassNameParser.h +++ b/include/llvm/IR/LegacyPassNameParser.h @@ -43,7 +43,7 @@ class PassNameParser : public PassRegistrationListener, public cl::parser { cl::Option *Opt; public: - PassNameParser() : Opt(nullptr) {} + PassNameParser(); virtual ~PassNameParser(); void initialize(cl::Option &O) { diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 132d2da1a611..e2b86f62fb8d 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -23,10 +23,9 @@ #include "llvm/IR/Metadata.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/DataTypes.h" -#include "llvm/Support/system_error.h" +#include namespace llvm { - class FunctionType; class GVMaterializer; class LLVMContext; @@ -298,10 +297,6 @@ class Module { /// registered in this LLVMContext. void getMDKindNames(SmallVectorImpl &Result) const; - - typedef DenseMap > - NumeredTypesMapTy; - /// Return the type with the specified name, or null if there is none by that /// name. 
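For the AVX-512 masked leading-zero-count intrinsics declared in IntrinsicsX86.td above, a minimal C++ sketch of emitting a call from IR-building code; the Intrinsic:: enumerator name is assumed to follow the usual int_x86_* to Intrinsic::x86_* mangling and is not part of this patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Src and PassThru are <16 x i32> values, Mask is an i16 write mask.
static Value *emitMaskedLzcnt512(Module &M, IRBuilder<> &Builder, Value *Src,
                                 Value *PassThru, Value *Mask) {
  Function *Lzcnt =
      Intrinsic::getDeclaration(&M, Intrinsic::x86_avx512_mask_lzcnt_d_512);
  Value *Args[] = {Src, PassThru, Mask};
  return Builder.CreateCall(Lzcnt, Args);
}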
StructType *getTypeByName(StringRef Name) const; @@ -458,12 +453,12 @@ class Module { void Dematerialize(GlobalValue *GV); /// Make sure all GlobalValues in this Module are fully read. - error_code materializeAll(); + std::error_code materializeAll(); /// Make sure all GlobalValues in this Module are fully read and clear the /// Materializer. If the module is corrupt, this DOES NOT clear the old /// Materializer. - error_code materializeAllPermanently(); + std::error_code materializeAllPermanently(); /// @} /// @name Direct access to the globals list, functions list, and symbol table diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index bc7696bdaf5b..848adae9ceca 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -39,6 +39,10 @@ class User : public Value { friend struct HungoffOperandTraits; virtual void anchor(); protected: + /// NumOperands - The number of values used by this User. + /// + unsigned NumOperands; + /// OperandList - This is a pointer to the array of Uses for this User. /// For nodes of fixed arity (e.g. a binary operator) this array will live /// prefixed to some derived class instance. For nodes of resizable variable @@ -46,13 +50,9 @@ class User : public Value { /// allocated and should be destroyed by the classes' virtual dtor. Use *OperandList; - /// NumOperands - The number of values used by this User. - /// - unsigned NumOperands; - void *operator new(size_t s, unsigned Us); User(Type *ty, unsigned vty, Use *OpList, unsigned NumOps) - : Value(ty, vty), OperandList(OpList), NumOperands(NumOps) {} + : Value(ty, vty), NumOperands(NumOps), OperandList(OpList) {} Use *allocHungoffUses(unsigned) const; void dropHungoffUses() { Use::zap(OperandList, OperandList + NumOperands, true); diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index 01586831bc4e..aae39ccbbfb1 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -67,6 +67,13 @@ typedef StringMapEntry ValueName; /// /// @brief LLVM Value Representation class Value { + Type *VTy; + Use *UseList; + + friend class ValueSymbolTable; // Allow ValueSymbolTable to directly mod Name. + friend class ValueHandleBase; + ValueName *Name; + const unsigned char SubclassID; // Subclass identifier (for isa/dyn_cast) unsigned char HasValueHandle : 1; // Has a ValueHandle pointing to this? protected: @@ -77,6 +84,11 @@ class Value { unsigned char SubclassOptionalData : 7; private: + /// SubclassData - This member is defined by this class, but is not used for + /// anything. Subclasses can use it to hold whatever state they find useful. + /// This field is initialized to zero by the ctor. + unsigned short SubclassData; + template // UseT == 'Use' or 'const Use' class use_iterator_impl : public std::iterator { @@ -167,18 +179,6 @@ class Value { unsigned getOperandNo() const { return UI->getOperandNo(); } }; - /// SubclassData - This member is defined by this class, but is not used for - /// anything. Subclasses can use it to hold whatever state they find useful. - /// This field is initialized to zero by the ctor. - unsigned short SubclassData; - - Type *VTy; - Use *UseList; - - friend class ValueSymbolTable; // Allow ValueSymbolTable to directly mod Name. 
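Since Module::materializeAll and materializeAllPermanently now return std::error_code, a minimal sketch of checking the result at a call site; the surrounding module setup (a Module with a GVMaterializer attached) is assumed.

#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>

// Returns true on success, printing the error message otherwise.
static bool materializeOrReport(llvm::Module &M) {
  if (std::error_code EC = M.materializeAllPermanently()) {
    llvm::errs() << "error materializing module: " << EC.message() << "\n";
    return false;
  }
  return true;
}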
- friend class ValueHandleBase; - ValueName *Name; - void operator=(const Value &) LLVM_DELETED_FUNCTION; Value(const Value &) LLVM_DELETED_FUNCTION; diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h index 1503aed621d5..f196f334b605 100644 --- a/include/llvm/IR/ValueMap.h +++ b/include/llvm/IR/ValueMap.h @@ -45,8 +45,10 @@ class ValueMapConstIterator; /// This class defines the default behavior for configurable aspects of /// ValueMap<>. User Configs should inherit from this class to be as compatible /// as possible with future versions of ValueMap. -template +template struct ValueMapConfig { + typedef MutexT mutex_type; + /// If FollowRAUW is true, the ValueMap will update mappings on RAUW. If it's /// false, the ValueMap will leave the original mapping in place. enum { FollowRAUW = true }; @@ -67,7 +69,7 @@ struct ValueMapConfig { /// and onDelete) and not inside other ValueMap methods. NULL means that no /// mutex is necessary. template - static sys::Mutex *getMutex(const ExtraDataT &/*Data*/) { return nullptr; } + static mutex_type *getMutex(const ExtraDataT &/*Data*/) { return nullptr; } }; /// See the file comment. @@ -212,7 +214,7 @@ class ValueMapCallbackVH : public CallbackVH { void deleted() override { // Make a copy that won't get changed even when *this is destroyed. ValueMapCallbackVH Copy(*this); - sys::Mutex *M = Config::getMutex(Copy.Map->Data); + typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data); if (M) M->acquire(); Config::onDelete(Copy.Map->Data, Copy.Unwrap()); // May destroy *this. @@ -225,7 +227,7 @@ class ValueMapCallbackVH : public CallbackVH { "Invalid RAUW on key of ValueMap<>"); // Make a copy that won't get changed even when *this is destroyed. ValueMapCallbackVH Copy(*this); - sys::Mutex *M = Config::getMutex(Copy.Map->Data); + typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data); if (M) M->acquire(); diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 8e536159db12..0c840f39f522 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -146,6 +146,8 @@ void initializeInstCountPass(PassRegistry&); void initializeInstNamerPass(PassRegistry&); void initializeInternalizePassPass(PassRegistry&); void initializeIntervalPartitionPass(PassRegistry&); +void initializeJumpInstrTableInfoPass(PassRegistry&); +void initializeJumpInstrTablesPass(PassRegistry&); void initializeJumpThreadingPass(PassRegistry&); void initializeLCSSAPass(PassRegistry&); void initializeLICMPass(PassRegistry&); @@ -272,6 +274,7 @@ void initializeSLPVectorizerPass(PassRegistry&); void initializeBBVectorizePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeStackMapLivenessPass(PassRegistry&); +void initializeLoadCombinePass(PassRegistry&); } #endif diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 2616ebd1fabe..b2309ffc2140 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -85,6 +85,8 @@ namespace { (void) llvm::createIndVarSimplifyPass(); (void) llvm::createInstructionCombiningPass(); (void) llvm::createInternalizePass(); + (void) llvm::createJumpInstrTableInfoPass(); + (void) llvm::createJumpInstrTablesPass(); (void) llvm::createLCSSAPass(); (void) llvm::createLICMPass(); (void) llvm::createLazyValueInfoPass(); diff --git a/include/llvm/MC/ConstantPools.h b/include/llvm/MC/ConstantPools.h new file mode 100644 index 000000000000..2819b757b8bf --- /dev/null +++ 
b/include/llvm/MC/ConstantPools.h @@ -0,0 +1,80 @@ +//===- ConstantPool.h - Keep track of assembler-generated ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ConstantPool and AssemblerConstantPools classes. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_MC_CONSTANTPOOL_H +#define LLVM_MC_CONSTANTPOOL_H + +#include "llvm/ADT/SmallVector.h" +namespace llvm { +class MCContext; +class MCExpr; +class MCSection; +class MCStreamer; +class MCSymbol; +// A class to keep track of assembler-generated constant pools that are use to +// implement the ldr-pseudo. +class ConstantPool { + typedef SmallVector, 4> EntryVecTy; + EntryVecTy Entries; + +public: + // Initialize a new empty constant pool + ConstantPool() {} + + // Add a new entry to the constant pool in the next slot. + // \param Value is the new entry to put in the constant pool. + // + // \returns a MCExpr that references the newly inserted value + const MCExpr *addEntry(const MCExpr *Value, MCContext &Context); + + // Emit the contents of the constant pool using the provided streamer. + void emitEntries(MCStreamer &Streamer); + + // Return true if the constant pool is empty + bool empty(); +}; + +class AssemblerConstantPools { + // Map type used to keep track of per-Section constant pools used by the + // ldr-pseudo opcode. The map associates a section to its constant pool. The + // constant pool is a vector of (label, value) pairs. When the ldr + // pseudo is parsed we insert a new (label, value) pair into the constant pool + // for the current section and add MCSymbolRefExpr to the new label as + // an opcode to the ldr. After we have parsed all the user input we + // output the (label, value) pairs in each constant pool at the end of the + // section. + // + // We use the MapVector for the map type to ensure stable iteration of + // the sections at the end of the parse. We need to iterate over the + // sections in a stable order to ensure that we have print the + // constant pools in a deterministic order when printing an assembly + // file. 
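A minimal sketch of how a target streamer might drive this interface for an ldr-pseudo, using only addEntry and emitAll as declared just below; the streamer class itself is hypothetical and not part of this patch.

#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCStreamer.h"

namespace {
// Hypothetical target streamer; only the constant-pool plumbing is shown.
class MyTargetStreamer {
  llvm::AssemblerConstantPools ConstantPools;

public:
  // Called when the parser sees "ldr rX, =expr": record the value in the
  // current section's pool and get back an expression referring to the label
  // of the newly created pool slot.
  const llvm::MCExpr *addConstantPoolEntry(llvm::MCStreamer &Streamer,
                                           const llvm::MCExpr *Expr) {
    return ConstantPools.addEntry(Streamer, Expr);
  }

  // Called after all input has been parsed: emit every per-section pool at
  // the end of its section.
  void finish(llvm::MCStreamer &Streamer) { ConstantPools.emitAll(Streamer); }
};
} // end anonymous namespace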
+ typedef MapVector ConstantPoolMapTy; + ConstantPoolMapTy ConstantPools; + +public: + AssemblerConstantPools() {} + ~AssemblerConstantPools() {} + + void emitAll(MCStreamer &Streamer); + void emitForCurrentSection(MCStreamer &Streamer); + const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr); + +private: + ConstantPool *getConstantPool(const MCSection *Section); + ConstantPool &getOrCreateConstantPool(const MCSection *Section); +}; +} // end namespace llvm + +#endif diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index f7d3be251e1b..55dc40afe5ab 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -23,546 +23,495 @@ #include namespace llvm { - class MCExpr; - class MCSection; - class MCStreamer; - class MCSymbol; - class MCContext; - - namespace ExceptionHandling { - enum ExceptionsType { None, DwarfCFI, SjLj, ARM, Win64 }; +class MCExpr; +class MCSection; +class MCStreamer; +class MCSymbol; +class MCContext; + +namespace WinEH { +enum class EncodingType { + ET_Invalid, /// Invalid + ET_Alpha, /// Windows Alpha + ET_Alpha64, /// Windows AXP64 + ET_ARM, /// Windows NT (Windows on ARM) + ET_CE, /// Windows CE ARM, PowerPC, SH3, SH4 + ET_Itanium, /// Windows x64, Windows Itanium (IA-64) + ET_MIPS = ET_Alpha, +}; +} + +namespace ExceptionHandling { +enum ExceptionsType { None, DwarfCFI, SjLj, ARM, Win64 }; +} + +namespace LCOMM { +enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment }; +} + +/// This class is intended to be used as a base class for asm +/// properties and features specific to the target. +class MCAsmInfo { +protected: + //===------------------------------------------------------------------===// + // Properties to be set by the target writer, used to configure asm printer. + // + + /// Pointer size in bytes. Default is 4. + unsigned PointerSize; + + /// Size of the stack slot reserved for callee-saved registers, in bytes. + /// Default is same as pointer size. + unsigned CalleeSaveStackSlotSize; + + /// True if target is little endian. Default is true. + bool IsLittleEndian; + + /// True if target stack grow up. Default is false. + bool StackGrowsUp; + + /// True if this target has the MachO .subsections_via_symbols directive. + /// Default is false. + bool HasSubsectionsViaSymbols; + + /// True if this is a MachO target that supports the macho-specific .zerofill + /// directive for emitting BSS Symbols. Default is false. + bool HasMachoZeroFillDirective; + + /// True if this is a MachO target that supports the macho-specific .tbss + /// directive for emitting thread local BSS Symbols. Default is false. + bool HasMachoTBSSDirective; + + /// True if the compiler should emit a ".reference .constructors_used" or + /// ".reference .destructors_used" directive after the a static ctor/dtor + /// list. This directive is only emitted in Static relocation model. Default + /// is false. + bool HasStaticCtorDtorReferenceInStaticMode; + + /// True if the linker has a bug and requires that the debug_line section be + /// of a minimum size. In practice such a linker requires a non-empty line + /// sequence if a file is present. Default to false. + bool LinkerRequiresNonEmptyDwarfLines; + + /// This is the maximum possible length of an instruction, which is needed to + /// compute the size of an inline asm. Defaults to 4. + unsigned MaxInstLength; + + /// Every possible instruction length is a multiple of this value. Factored + /// out in .debug_frame and .debug_line. Defaults to 1. 
+ unsigned MinInstAlignment; + + /// The '$' token, when not referencing an identifier or constant, refers to + /// the current PC. Defaults to false. + bool DollarIsPC; + + /// This string, if specified, is used to separate instructions from each + /// other when on the same line. Defaults to ';' + const char *SeparatorString; + + /// This indicates the comment character used by the assembler. Defaults to + /// "#" + const char *CommentString; + + /// This is appended to emitted labels. Defaults to ":" + const char *LabelSuffix; + + /// This is appended to emitted labels. Defaults to ":" + const char *DebugLabelSuffix; + + /// This prefix is used for globals like constant pool entries that are + /// completely private to the .s file and should not have names in the .o + /// file. Defaults to "L" + const char *PrivateGlobalPrefix; + + /// This prefix is used for symbols that should be passed through the + /// assembler but be removed by the linker. This is 'l' on Darwin, currently + /// used for some ObjC metadata. The default of "" meast that for this system + /// a plain private symbol should be used. Defaults to "". + const char *LinkerPrivateGlobalPrefix; + + /// If these are nonempty, they contain a directive to emit before and after + /// an inline assembly statement. Defaults to "#APP\n", "#NO_APP\n" + const char *InlineAsmStart; + const char *InlineAsmEnd; + + /// These are assembly directives that tells the assembler to interpret the + /// following instructions differently. Defaults to ".code16", ".code32", + /// ".code64". + const char *Code16Directive; + const char *Code32Directive; + const char *Code64Directive; + + /// Which dialect of an assembler variant to use. Defaults to 0 + unsigned AssemblerDialect; + + /// This is true if the assembler allows @ characters in symbol names. + /// Defaults to false. + bool AllowAtInName; + + /// This is true if data region markers should be printed as + /// ".data_region/.end_data_region" directives. If false, use "$d/$a" labels + /// instead. + bool UseDataRegionDirectives; + + //===--- Data Emission Directives -------------------------------------===// + + /// This should be set to the directive used to get some number of zero bytes + /// emitted to the current section. Common cases are "\t.zero\t" and + /// "\t.space\t". If this is set to null, the Data*bitsDirective's will be + /// used to emit zero bytes. Defaults to "\t.zero\t" + const char *ZeroDirective; + + /// This directive allows emission of an ascii string with the standard C + /// escape characters embedded into it. Defaults to "\t.ascii\t" + const char *AsciiDirective; + + /// If not null, this allows for special handling of zero terminated strings + /// on this target. This is commonly supported as ".asciz". If a target + /// doesn't support this, it can be set to null. Defaults to "\t.asciz\t" + const char *AscizDirective; + + /// These directives are used to output some unit of integer data to the + /// current section. If a data directive is set to null, smaller data + /// directives will be used to emit the large sizes. Defaults to "\t.byte\t", + /// "\t.short\t", "\t.long\t", "\t.quad\t" + const char *Data8bitsDirective; + const char *Data16bitsDirective; + const char *Data32bitsDirective; + const char *Data64bitsDirective; + + /// If non-null, a directive that is used to emit a word which should be + /// relocated as a 64-bit GP-relative offset, e.g. .gpdword on Mips. Defaults + /// to NULL. 
+ const char *GPRel64Directive; + + /// If non-null, a directive that is used to emit a word which should be + /// relocated as a 32-bit GP-relative offset, e.g. .gpword on Mips or .gprel32 + /// on Alpha. Defaults to NULL. + const char *GPRel32Directive; + + /// This is true if this target uses "Sun Style" syntax for section switching + /// ("#alloc,#write" etc) instead of the normal ELF syntax (,"a,w") in + /// .section directives. Defaults to false. + bool SunStyleELFSectionSwitchSyntax; + + /// This is true if this target uses ELF '.section' directive before the + /// '.bss' one. It's used for PPC/Linux which doesn't support the '.bss' + /// directive only. Defaults to false. + bool UsesELFSectionDirectiveForBSS; + + bool NeedsDwarfSectionOffsetDirective; + + //===--- Alignment Information ----------------------------------------===// + + /// If this is true (the default) then the asmprinter emits ".align N" + /// directives, where N is the number of bytes to align to. Otherwise, it + /// emits ".align log2(N)", e.g. 3 to align to an 8 byte boundary. Defaults + /// to true. + bool AlignmentIsInBytes; + + /// If non-zero, this is used to fill the executable space created as the + /// result of a alignment directive. Defaults to 0 + unsigned TextAlignFillValue; + + //===--- Global Variable Emission Directives --------------------------===// + + /// This is the directive used to declare a global entity. Defaults to NULL. + const char *GlobalDirective; + + /// True if the assembler supports the .set directive. Defaults to true. + bool HasSetDirective; + + /// False if the assembler requires that we use + /// \code + /// Lc = a - b + /// .long Lc + /// \endcode + // + /// instead of + // + /// \code + /// .long a - b + /// \endcode + /// + /// Defaults to true. + bool HasAggressiveSymbolFolding; + + /// True is .comm's and .lcomms optional alignment is to be specified in bytes + /// instead of log2(n). Defaults to true. + bool COMMDirectiveAlignmentIsInBytes; + + /// Describes if the .lcomm directive for the target supports an alignment + /// argument and how it is interpreted. Defaults to NoAlignment. + LCOMM::LCOMMType LCOMMDirectiveAlignmentType; + + /// True if the target has .type and .size directives, this is true for most + /// ELF targets. Defaults to true. + bool HasDotTypeDotSizeDirective; + + /// True if the target has a single parameter .file directive, this is true + /// for ELF targets. Defaults to true. + bool HasSingleParameterDotFile; + + /// True if the target has a .ident directive, this is true for ELF targets. + /// Defaults to false. + bool HasIdentDirective; + + /// True if this target supports the MachO .no_dead_strip directive. Defaults + /// to false. + bool HasNoDeadStrip; + + /// This directive, if non-null, is used to declare a global as being a weak + /// undefined symbol. Defaults to NULL. + const char *WeakRefDirective; + + /// True if we have a directive to declare a global as being a weak defined + /// symbol. Defaults to false. + bool HasWeakDefDirective; + + /// True if we have a directive to declare a global as being a weak defined + /// symbol that can be hidden (unexported). Defaults to false. + bool HasWeakDefCanBeHiddenDirective; + + /// True if we have a .linkonce directive. This is used on cygwin/mingw. + /// Defaults to false. + bool HasLinkOnceDirective; + + /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having + /// hidden visibility. Defaults to MCSA_Hidden. 
+ MCSymbolAttr HiddenVisibilityAttr; + + /// This attribute, if not MCSA_Invalid, is used to declare an undefined + /// symbol as having hidden visibility. Defaults to MCSA_Hidden. + MCSymbolAttr HiddenDeclarationVisibilityAttr; + + /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having + /// protected visibility. Defaults to MCSA_Protected + MCSymbolAttr ProtectedVisibilityAttr; + + //===--- Dwarf Emission Directives -----------------------------------===// + + /// True if target asm supports leb128 directives. Defaults to false. + bool HasLEB128; + + /// True if target supports emission of debugging information. Defaults to + /// false. + bool SupportsDebugInformation; + + /// Exception handling format for the target. Defaults to None. + ExceptionHandling::ExceptionsType ExceptionsType; + + /// Windows exception handling data (.pdata) encoding. Defaults to Invalid. + WinEH::EncodingType WinEHEncodingType; + + /// True if Dwarf2 output generally uses relocations for references to other + /// .debug_* sections. + bool DwarfUsesRelocationsAcrossSections; + + /// True if DWARF FDE symbol reference relocations should be replaced by an + /// absolute difference. + bool DwarfFDESymbolsUseAbsDiff; + + /// True if dwarf register numbers are printed instead of symbolic register + /// names in .cfi_* directives. Defaults to false. + bool DwarfRegNumForCFI; + + /// True if target uses parens to indicate the symbol variant instead of @. + /// For example, foo(plt) instead of foo@plt. Defaults to false. + bool UseParensForSymbolVariant; + + //===--- Prologue State ----------------------------------------------===// + + std::vector InitialFrameState; + + //===--- Integrated Assembler State ----------------------------------===// + + /// Should we use the integrated assembler? + /// The integrated assembler should be enabled by default (by the + /// constructors) when failing to parse a valid piece of assembly (inline + /// or otherwise) is considered a bug. It may then be overridden after + /// construction (see LLVMTargetMachine::initAsmInfo()). + bool UseIntegratedAssembler; + + /// Compress DWARF debug sections. Defaults to false. + bool CompressDebugSections; + +public: + explicit MCAsmInfo(); + virtual ~MCAsmInfo(); + + /// Get the pointer size in bytes. + unsigned getPointerSize() const { return PointerSize; } + + /// Get the callee-saved register stack slot + /// size in bytes. + unsigned getCalleeSaveStackSlotSize() const { + return CalleeSaveStackSlotSize; + } + + /// True if the target is little endian. + bool isLittleEndian() const { return IsLittleEndian; } + + /// True if target stack grow up. + bool isStackGrowthDirectionUp() const { return StackGrowsUp; } + + bool hasSubsectionsViaSymbols() const { return HasSubsectionsViaSymbols; } + + // Data directive accessors. + + const char *getData8bitsDirective() const { return Data8bitsDirective; } + const char *getData16bitsDirective() const { return Data16bitsDirective; } + const char *getData32bitsDirective() const { return Data32bitsDirective; } + const char *getData64bitsDirective() const { return Data64bitsDirective; } + const char *getGPRel64Directive() const { return GPRel64Directive; } + const char *getGPRel32Directive() const { return GPRel32Directive; } + + /// Targets can implement this method to specify a section to switch to if the + /// translation unit doesn't have any trampolines that require an executable + /// stack. 
+ virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const { + return nullptr; + } + + virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const; + + virtual const MCExpr *getExprForFDESymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const; + + bool usesSunStyleELFSectionSwitchSyntax() const { + return SunStyleELFSectionSwitchSyntax; } - namespace LCOMM { - enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment }; + bool usesELFSectionDirectiveForBSS() const { + return UsesELFSectionDirectiveForBSS; } - /// MCAsmInfo - This class is intended to be used as a base class for asm - /// properties and features specific to the target. - class MCAsmInfo { - protected: - //===------------------------------------------------------------------===// - // Properties to be set by the target writer, used to configure asm printer. - // - - /// PointerSize - Pointer size in bytes. - /// Default is 4. - unsigned PointerSize; - - /// CalleeSaveStackSlotSize - Size of the stack slot reserved for - /// callee-saved registers, in bytes. - /// Default is same as pointer size. - unsigned CalleeSaveStackSlotSize; - - /// IsLittleEndian - True if target is little endian. - /// Default is true. - bool IsLittleEndian; - - /// StackGrowsUp - True if target stack grow up. - /// Default is false. - bool StackGrowsUp; - - /// HasSubsectionsViaSymbols - True if this target has the MachO - /// .subsections_via_symbols directive. - bool HasSubsectionsViaSymbols; // Default is false. - - /// HasMachoZeroFillDirective - True if this is a MachO target that supports - /// the macho-specific .zerofill directive for emitting BSS Symbols. - bool HasMachoZeroFillDirective; // Default is false. - - /// HasMachoTBSSDirective - True if this is a MachO target that supports - /// the macho-specific .tbss directive for emitting thread local BSS Symbols - bool HasMachoTBSSDirective; // Default is false. - - /// HasStaticCtorDtorReferenceInStaticMode - True if the compiler should - /// emit a ".reference .constructors_used" or ".reference .destructors_used" - /// directive after the a static ctor/dtor list. This directive is only - /// emitted in Static relocation model. - bool HasStaticCtorDtorReferenceInStaticMode; // Default is false. - - /// LinkerRequiresNonEmptyDwarfLines - True if the linker has a bug and - /// requires that the debug_line section be of a minimum size. In practice - /// such a linker requires a non-empty line sequence if a file is present. - bool LinkerRequiresNonEmptyDwarfLines; // Default to false. - - /// MaxInstLength - This is the maximum possible length of an instruction, - /// which is needed to compute the size of an inline asm. - unsigned MaxInstLength; // Defaults to 4. - - /// MinInstAlignment - Every possible instruction length is a multiple of - /// this value. Factored out in .debug_frame and .debug_line. - unsigned MinInstAlignment; // Defaults to 1. - - /// DollarIsPC - The '$' token, when not referencing an identifier or - /// constant, refers to the current PC. - bool DollarIsPC; // Defaults to false. - - /// SeparatorString - This string, if specified, is used to separate - /// instructions from each other when on the same line. - const char *SeparatorString; // Defaults to ';' - - /// CommentString - This indicates the comment character used by the - /// assembler. - const char *CommentString; // Defaults to "#" - - /// LabelSuffix - This is appended to emitted labels. 
- const char *LabelSuffix; // Defaults to ":" - - /// LabelSuffix - This is appended to emitted labels. - const char *DebugLabelSuffix; // Defaults to ":" - - /// This prefix is used for globals like constant pool entries that are - /// completely private to the .s file and should not have names in the .o - /// file. - const char *PrivateGlobalPrefix; // Defaults to "L" - - /// This prefix is used for symbols that should be passed through the - /// assembler but be removed by the linker. This is 'l' on Darwin, - /// currently used for some ObjC metadata. - /// The default of "" meast that for this system a plain private symbol - /// should be used. - const char *LinkerPrivateGlobalPrefix; // Defaults to "". - - /// InlineAsmStart/End - If these are nonempty, they contain a directive to - /// emit before and after an inline assembly statement. - const char *InlineAsmStart; // Defaults to "#APP\n" - const char *InlineAsmEnd; // Defaults to "#NO_APP\n" - - /// Code16Directive, Code32Directive, Code64Directive - These are assembly - /// directives that tells the assembler to interpret the following - /// instructions differently. - const char *Code16Directive; // Defaults to ".code16" - const char *Code32Directive; // Defaults to ".code32" - const char *Code64Directive; // Defaults to ".code64" - - /// AssemblerDialect - Which dialect of an assembler variant to use. - unsigned AssemblerDialect; // Defaults to 0 - - /// \brief This is true if the assembler allows @ characters in symbol - /// names. Defaults to false. - bool AllowAtInName; - - /// UseDataRegionDirectives - This is true if data region markers should - /// be printed as ".data_region/.end_data_region" directives. If false, - /// use "$d/$a" labels instead. - bool UseDataRegionDirectives; - - //===--- Data Emission Directives -------------------------------------===// - - /// ZeroDirective - this should be set to the directive used to get some - /// number of zero bytes emitted to the current section. Common cases are - /// "\t.zero\t" and "\t.space\t". If this is set to null, the - /// Data*bitsDirective's will be used to emit zero bytes. - const char *ZeroDirective; // Defaults to "\t.zero\t" - - /// AsciiDirective - This directive allows emission of an ascii string with - /// the standard C escape characters embedded into it. - const char *AsciiDirective; // Defaults to "\t.ascii\t" - - /// AscizDirective - If not null, this allows for special handling of - /// zero terminated strings on this target. This is commonly supported as - /// ".asciz". If a target doesn't support this, it can be set to null. - const char *AscizDirective; // Defaults to "\t.asciz\t" - - /// DataDirectives - These directives are used to output some unit of - /// integer data to the current section. If a data directive is set to - /// null, smaller data directives will be used to emit the large sizes. - const char *Data8bitsDirective; // Defaults to "\t.byte\t" - const char *Data16bitsDirective; // Defaults to "\t.short\t" - const char *Data32bitsDirective; // Defaults to "\t.long\t" - const char *Data64bitsDirective; // Defaults to "\t.quad\t" - - /// GPRel64Directive - if non-null, a directive that is used to emit a word - /// which should be relocated as a 64-bit GP-relative offset, e.g. .gpdword - /// on Mips. - const char *GPRel64Directive; // Defaults to NULL. - - /// GPRel32Directive - if non-null, a directive that is used to emit a word - /// which should be relocated as a 32-bit GP-relative offset, e.g. .gpword - /// on Mips or .gprel32 on Alpha. 
- const char *GPRel32Directive; // Defaults to NULL. - - /// SunStyleELFSectionSwitchSyntax - This is true if this target uses "Sun - /// Style" syntax for section switching ("#alloc,#write" etc) instead of the - /// normal ELF syntax (,"a,w") in .section directives. - bool SunStyleELFSectionSwitchSyntax; // Defaults to false. - - /// UsesELFSectionDirectiveForBSS - This is true if this target uses ELF - /// '.section' directive before the '.bss' one. It's used for PPC/Linux - /// which doesn't support the '.bss' directive only. - bool UsesELFSectionDirectiveForBSS; // Defaults to false. - - bool NeedsDwarfSectionOffsetDirective; - - //===--- Alignment Information ----------------------------------------===// - - /// AlignmentIsInBytes - If this is true (the default) then the asmprinter - /// emits ".align N" directives, where N is the number of bytes to align to. - /// Otherwise, it emits ".align log2(N)", e.g. 3 to align to an 8 byte - /// boundary. - bool AlignmentIsInBytes; // Defaults to true - - /// TextAlignFillValue - If non-zero, this is used to fill the executable - /// space created as the result of a alignment directive. - unsigned TextAlignFillValue; // Defaults to 0 + bool needsDwarfSectionOffsetDirective() const { + return NeedsDwarfSectionOffsetDirective; + } + + // Accessors. - //===--- Global Variable Emission Directives --------------------------===// + bool hasMachoZeroFillDirective() const { return HasMachoZeroFillDirective; } + bool hasMachoTBSSDirective() const { return HasMachoTBSSDirective; } + bool hasStaticCtorDtorReferenceInStaticMode() const { + return HasStaticCtorDtorReferenceInStaticMode; + } + bool getLinkerRequiresNonEmptyDwarfLines() const { + return LinkerRequiresNonEmptyDwarfLines; + } + unsigned getMaxInstLength() const { return MaxInstLength; } + unsigned getMinInstAlignment() const { return MinInstAlignment; } + bool getDollarIsPC() const { return DollarIsPC; } + const char *getSeparatorString() const { return SeparatorString; } + + /// This indicates the column (zero-based) at which asm comments should be + /// printed. 
+ unsigned getCommentColumn() const { return 40; } + + const char *getCommentString() const { return CommentString; } + const char *getLabelSuffix() const { return LabelSuffix; } + + const char *getDebugLabelSuffix() const { return DebugLabelSuffix; } + const char *getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; } + bool hasLinkerPrivateGlobalPrefix() const { + return LinkerPrivateGlobalPrefix[0] != '\0'; + } + const char *getLinkerPrivateGlobalPrefix() const { + if (hasLinkerPrivateGlobalPrefix()) + return LinkerPrivateGlobalPrefix; + return getPrivateGlobalPrefix(); + } + const char *getInlineAsmStart() const { return InlineAsmStart; } + const char *getInlineAsmEnd() const { return InlineAsmEnd; } + const char *getCode16Directive() const { return Code16Directive; } + const char *getCode32Directive() const { return Code32Directive; } + const char *getCode64Directive() const { return Code64Directive; } + unsigned getAssemblerDialect() const { return AssemblerDialect; } + bool doesAllowAtInName() const { return AllowAtInName; } + bool doesSupportDataRegionDirectives() const { + return UseDataRegionDirectives; + } + const char *getZeroDirective() const { return ZeroDirective; } + const char *getAsciiDirective() const { return AsciiDirective; } + const char *getAscizDirective() const { return AscizDirective; } + bool getAlignmentIsInBytes() const { return AlignmentIsInBytes; } + unsigned getTextAlignFillValue() const { return TextAlignFillValue; } + const char *getGlobalDirective() const { return GlobalDirective; } + bool hasSetDirective() const { return HasSetDirective; } + bool hasAggressiveSymbolFolding() const { return HasAggressiveSymbolFolding; } + bool getCOMMDirectiveAlignmentIsInBytes() const { + return COMMDirectiveAlignmentIsInBytes; + } + LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const { + return LCOMMDirectiveAlignmentType; + } + bool hasDotTypeDotSizeDirective() const { return HasDotTypeDotSizeDirective; } + bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; } + bool hasIdentDirective() const { return HasIdentDirective; } + bool hasNoDeadStrip() const { return HasNoDeadStrip; } + const char *getWeakRefDirective() const { return WeakRefDirective; } + bool hasWeakDefDirective() const { return HasWeakDefDirective; } + bool hasWeakDefCanBeHiddenDirective() const { + return HasWeakDefCanBeHiddenDirective; + } + bool hasLinkOnceDirective() const { return HasLinkOnceDirective; } - /// GlobalDirective - This is the directive used to declare a global entity. - /// - const char *GlobalDirective; // Defaults to NULL. - - /// HasSetDirective - True if the assembler supports the .set directive. - bool HasSetDirective; // Defaults to true. - - /// HasAggressiveSymbolFolding - False if the assembler requires that we use - /// Lc = a - b - /// .long Lc - /// instead of - /// .long a - b - bool HasAggressiveSymbolFolding; // Defaults to true. 
+ MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr; } + MCSymbolAttr getHiddenDeclarationVisibilityAttr() const { + return HiddenDeclarationVisibilityAttr; + } + MCSymbolAttr getProtectedVisibilityAttr() const { + return ProtectedVisibilityAttr; + } + bool hasLEB128() const { return HasLEB128; } + bool doesSupportDebugInformation() const { return SupportsDebugInformation; } + bool doesSupportExceptionHandling() const { + return ExceptionsType != ExceptionHandling::None; + } + ExceptionHandling::ExceptionsType getExceptionHandlingType() const { + return ExceptionsType; + } + WinEH::EncodingType getWinEHEncodingType() const { + return WinEHEncodingType; + } + bool isExceptionHandlingDwarf() const { + return (ExceptionsType == ExceptionHandling::DwarfCFI || + ExceptionsType == ExceptionHandling::ARM || + ExceptionsType == ExceptionHandling::Win64); + } + bool doesDwarfUseRelocationsAcrossSections() const { + return DwarfUsesRelocationsAcrossSections; + } + bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; } + bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; } + bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; } - /// COMMDirectiveAlignmentIsInBytes - True is .comm's and .lcomms optional - /// alignment is to be specified in bytes instead of log2(n). - bool COMMDirectiveAlignmentIsInBytes; // Defaults to true; + void addInitialFrameState(const MCCFIInstruction &Inst) { + InitialFrameState.push_back(Inst); + } - /// LCOMMDirectiveAlignment - Describes if the .lcomm directive for the - /// target supports an alignment argument and how it is interpreted. - LCOMM::LCOMMType LCOMMDirectiveAlignmentType; // Defaults to NoAlignment. - - /// HasDotTypeDotSizeDirective - True if the target has .type and .size - /// directives, this is true for most ELF targets. - bool HasDotTypeDotSizeDirective; // Defaults to true. + const std::vector &getInitialFrameState() const { + return InitialFrameState; + } - /// HasSingleParameterDotFile - True if the target has a single parameter - /// .file directive, this is true for ELF targets. - bool HasSingleParameterDotFile; // Defaults to true. + /// Return true if assembly (inline or otherwise) should be parsed. + bool useIntegratedAssembler() const { return UseIntegratedAssembler; } - /// hasIdentDirective - True if the target has a .ident directive, this is - /// true for ELF targets. - bool HasIdentDirective; // Defaults to false. + /// Set whether assembly (inline or otherwise) should be parsed. + virtual void setUseIntegratedAssembler(bool Value) { + UseIntegratedAssembler = Value; + } - /// HasNoDeadStrip - True if this target supports the MachO .no_dead_strip - /// directive. - bool HasNoDeadStrip; // Defaults to false. + bool compressDebugSections() const { return CompressDebugSections; } - /// WeakRefDirective - This directive, if non-null, is used to declare a - /// global as being a weak undefined symbol. - const char *WeakRefDirective; // Defaults to NULL. - - /// True if we have a directive to declare a global as being a weak - /// defined symbol. - bool HasWeakDefDirective; // Defaults to false. - - /// True if we have a directive to declare a global as being a weak - /// defined symbol that can be hidden (unexported). - bool HasWeakDefCanBeHiddenDirective; // Defaults to false. - - /// True if we have a .linkonce directive. This is used on cygwin/mingw. - bool HasLinkOnceDirective; // Defaults to false. 
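As a point of reference, a minimal sketch of how a hypothetical backend overrides these protected defaults, including the new WinEH encoding field, in its MCAsmInfo subclass constructor; none of this is part of the patch.

#include "llvm/MC/MCAsmInfo.h"

namespace {
// Hypothetical 64-bit Windows-flavoured target; each assignment overrides one
// of the protected defaults documented in this header.
class MyTargetMCAsmInfo : public llvm::MCAsmInfo {
public:
  MyTargetMCAsmInfo() {
    PointerSize = 8;               // 64-bit pointers
    CalleeSaveStackSlotSize = 8;
    PrivateGlobalPrefix = ".L";
    SupportsDebugInformation = true;
    // Use the Windows EH description added in this header.
    ExceptionsType = llvm::ExceptionHandling::Win64;
    WinEHEncodingType = llvm::WinEH::EncodingType::ET_Itanium;
    UseIntegratedAssembler = true;
  }
};
} // end anonymous namespace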
- - /// HiddenVisibilityAttr - This attribute, if not MCSA_Invalid, is used to - /// declare a symbol as having hidden visibility. - MCSymbolAttr HiddenVisibilityAttr; // Defaults to MCSA_Hidden. - - /// HiddenDeclarationVisibilityAttr - This attribute, if not MCSA_Invalid, - /// is used to declare an undefined symbol as having hidden visibility. - MCSymbolAttr HiddenDeclarationVisibilityAttr; // Defaults to MCSA_Hidden. - - - /// ProtectedVisibilityAttr - This attribute, if not MCSA_Invalid, is used - /// to declare a symbol as having protected visibility. - MCSymbolAttr ProtectedVisibilityAttr; // Defaults to MCSA_Protected - - //===--- Dwarf Emission Directives -----------------------------------===// - - /// HasLEB128 - True if target asm supports leb128 directives. - bool HasLEB128; // Defaults to false. - - /// SupportsDebugInformation - True if target supports emission of debugging - /// information. - bool SupportsDebugInformation; // Defaults to false. - - /// SupportsExceptionHandling - True if target supports exception handling. - ExceptionHandling::ExceptionsType ExceptionsType; // Defaults to None - - /// DwarfUsesRelocationsAcrossSections - True if Dwarf2 output generally - /// uses relocations for references to other .debug_* sections. - bool DwarfUsesRelocationsAcrossSections; - - /// DwarfFDESymbolsUseAbsDiff - true if DWARF FDE symbol reference - /// relocations should be replaced by an absolute difference. - bool DwarfFDESymbolsUseAbsDiff; - - /// DwarfRegNumForCFI - True if dwarf register numbers are printed - /// instead of symbolic register names in .cfi_* directives. - bool DwarfRegNumForCFI; // Defaults to false; - - /// UseParensForSymbolVariant - True if target uses parens to indicate the - /// symbol variant instead of @. For example, foo(plt) instead of foo@plt. - bool UseParensForSymbolVariant; // Defaults to false; - - //===--- Prologue State ----------------------------------------------===// - - std::vector InitialFrameState; - - //===--- Integrated Assembler State ----------------------------------===// - /// Should we use the integrated assembler? - /// The integrated assembler should be enabled by default (by the - /// constructors) when failing to parse a valid piece of assembly (inline - /// or otherwise) is considered a bug. It may then be overridden after - /// construction (see LLVMTargetMachine::initAsmInfo()). - bool UseIntegratedAssembler; - - /// Compress DWARF debug sections. Defaults to false. - bool CompressDebugSections; - - public: - explicit MCAsmInfo(); - virtual ~MCAsmInfo(); - - /// getPointerSize - Get the pointer size in bytes. - unsigned getPointerSize() const { - return PointerSize; - } - - /// getCalleeSaveStackSlotSize - Get the callee-saved register stack slot - /// size in bytes. - unsigned getCalleeSaveStackSlotSize() const { - return CalleeSaveStackSlotSize; - } - - /// isLittleEndian - True if the target is little endian. - bool isLittleEndian() const { - return IsLittleEndian; - } - - /// isStackGrowthDirectionUp - True if target stack grow up. - bool isStackGrowthDirectionUp() const { - return StackGrowsUp; - } - - bool hasSubsectionsViaSymbols() const { return HasSubsectionsViaSymbols; } - - // Data directive accessors. 
- // - const char *getData8bitsDirective() const { - return Data8bitsDirective; - } - const char *getData16bitsDirective() const { - return Data16bitsDirective; - } - const char *getData32bitsDirective() const { - return Data32bitsDirective; - } - const char *getData64bitsDirective() const { - return Data64bitsDirective; - } - const char *getGPRel64Directive() const { return GPRel64Directive; } - const char *getGPRel32Directive() const { return GPRel32Directive; } - - /// getNonexecutableStackSection - Targets can implement this method to - /// specify a section to switch to if the translation unit doesn't have any - /// trampolines that require an executable stack. - virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const{ - return nullptr; - } - - virtual const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, - unsigned Encoding, - MCStreamer &Streamer) const; - - virtual const MCExpr * - getExprForFDESymbol(const MCSymbol *Sym, - unsigned Encoding, - MCStreamer &Streamer) const; - - bool usesSunStyleELFSectionSwitchSyntax() const { - return SunStyleELFSectionSwitchSyntax; - } - - bool usesELFSectionDirectiveForBSS() const { - return UsesELFSectionDirectiveForBSS; - } - - bool needsDwarfSectionOffsetDirective() const { - return NeedsDwarfSectionOffsetDirective; - } - - // Accessors. - // - bool hasMachoZeroFillDirective() const { return HasMachoZeroFillDirective; } - bool hasMachoTBSSDirective() const { return HasMachoTBSSDirective; } - bool hasStaticCtorDtorReferenceInStaticMode() const { - return HasStaticCtorDtorReferenceInStaticMode; - } - bool getLinkerRequiresNonEmptyDwarfLines() const { - return LinkerRequiresNonEmptyDwarfLines; - } - unsigned getMaxInstLength() const { - return MaxInstLength; - } - unsigned getMinInstAlignment() const { - return MinInstAlignment; - } - bool getDollarIsPC() const { - return DollarIsPC; - } - const char *getSeparatorString() const { - return SeparatorString; - } - - /// This indicates the column (zero-based) at which asm comments should be - /// printed. 
- unsigned getCommentColumn() const { - return 40; - } - - const char *getCommentString() const { - return CommentString; - } - const char *getLabelSuffix() const { - return LabelSuffix; - } - - const char *getDebugLabelSuffix() const { - return DebugLabelSuffix; - } - const char *getPrivateGlobalPrefix() const { - return PrivateGlobalPrefix; - } - bool hasLinkerPrivateGlobalPrefix() const { - return LinkerPrivateGlobalPrefix[0] != '\0'; - } - const char *getLinkerPrivateGlobalPrefix() const { - if (hasLinkerPrivateGlobalPrefix()) - return LinkerPrivateGlobalPrefix; - return getPrivateGlobalPrefix(); - } - const char *getInlineAsmStart() const { - return InlineAsmStart; - } - const char *getInlineAsmEnd() const { - return InlineAsmEnd; - } - const char *getCode16Directive() const { - return Code16Directive; - } - const char *getCode32Directive() const { - return Code32Directive; - } - const char *getCode64Directive() const { - return Code64Directive; - } - unsigned getAssemblerDialect() const { - return AssemblerDialect; - } - bool doesAllowAtInName() const { - return AllowAtInName; - } - bool doesSupportDataRegionDirectives() const { - return UseDataRegionDirectives; - } - const char *getZeroDirective() const { - return ZeroDirective; - } - const char *getAsciiDirective() const { - return AsciiDirective; - } - const char *getAscizDirective() const { - return AscizDirective; - } - bool getAlignmentIsInBytes() const { - return AlignmentIsInBytes; - } - unsigned getTextAlignFillValue() const { - return TextAlignFillValue; - } - const char *getGlobalDirective() const { - return GlobalDirective; - } - bool hasSetDirective() const { return HasSetDirective; } - bool hasAggressiveSymbolFolding() const { - return HasAggressiveSymbolFolding; - } - bool getCOMMDirectiveAlignmentIsInBytes() const { - return COMMDirectiveAlignmentIsInBytes; - } - LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const { - return LCOMMDirectiveAlignmentType; - } - bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;} - bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; } - bool hasIdentDirective() const { return HasIdentDirective; } - bool hasNoDeadStrip() const { return HasNoDeadStrip; } - const char *getWeakRefDirective() const { return WeakRefDirective; } - bool hasWeakDefDirective() const { return HasWeakDefDirective; } - bool hasWeakDefCanBeHiddenDirective() const { - return HasWeakDefCanBeHiddenDirective; - } - bool hasLinkOnceDirective() const { return HasLinkOnceDirective; } - - MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr;} - MCSymbolAttr getHiddenDeclarationVisibilityAttr() const { - return HiddenDeclarationVisibilityAttr; - } - MCSymbolAttr getProtectedVisibilityAttr() const { - return ProtectedVisibilityAttr; - } - bool hasLEB128() const { - return HasLEB128; - } - bool doesSupportDebugInformation() const { - return SupportsDebugInformation; - } - bool doesSupportExceptionHandling() const { - return ExceptionsType != ExceptionHandling::None; - } - ExceptionHandling::ExceptionsType getExceptionHandlingType() const { - return ExceptionsType; - } - bool isExceptionHandlingDwarf() const { - return - (ExceptionsType == ExceptionHandling::DwarfCFI || - ExceptionsType == ExceptionHandling::ARM || - ExceptionsType == ExceptionHandling::Win64); - } - bool doesDwarfUseRelocationsAcrossSections() const { - return DwarfUsesRelocationsAcrossSections; - } - bool doDwarfFDESymbolsUseAbsDiff() const { - return DwarfFDESymbolsUseAbsDiff; - } - bool 
useDwarfRegNumForCFI() const { - return DwarfRegNumForCFI; - } - bool useParensForSymbolVariant() const { - return UseParensForSymbolVariant; - } - - void addInitialFrameState(const MCCFIInstruction &Inst) { - InitialFrameState.push_back(Inst); - } - - const std::vector &getInitialFrameState() const { - return InitialFrameState; - } - - /// Return true if assembly (inline or otherwise) should be parsed. - bool useIntegratedAssembler() const { return UseIntegratedAssembler; } - - /// Set whether assembly (inline or otherwise) should be parsed. - virtual void setUseIntegratedAssembler(bool Value) { - UseIntegratedAssembler = Value; - } - - bool compressDebugSections() const { return CompressDebugSections; } - - void setCompressDebugSections(bool CompressDebugSections) { - this->CompressDebugSections = CompressDebugSections; - } - }; + void setCompressDebugSections(bool CompressDebugSections) { + this->CompressDebugSections = CompressDebugSections; + } +}; } #endif diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index 7557e7629bb9..2f9b32b984ec 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -273,9 +273,7 @@ namespace llvm { const MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, SectionKind Kind, - StringRef COMDATSymName, - int Selection, - const MCSectionCOFF *Assoc = nullptr); + StringRef COMDATSymName, int Selection); const MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index f28adfa86526..ca5cecbef0a2 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -262,6 +262,8 @@ class MCSymbolRefExpr : public MCExpr { VK_Mips_GOT_LO16, VK_Mips_CALL_HI16, VK_Mips_CALL_LO16, + VK_Mips_PCREL_HI16, + VK_Mips_PCREL_LO16, VK_COFF_IMGREL32 // symbol@imgrel (image-relative) }; diff --git a/include/llvm/MC/MCLinkerOptimizationHint.h b/include/llvm/MC/MCLinkerOptimizationHint.h index 3b0d933005dd..50fd527ffe2a 100644 --- a/include/llvm/MC/MCLinkerOptimizationHint.h +++ b/include/llvm/MC/MCLinkerOptimizationHint.h @@ -132,8 +132,19 @@ class MCLOHDirective { /// the given @p Layout. uint64_t getEmitSize(const MachObjectWriter &ObjWriter, const MCAsmLayout &Layout) const { - std::string Buffer; - raw_string_ostream OutStream(Buffer); + class raw_counting_ostream : public raw_ostream { + uint64_t Count; + + void write_impl(const char *, size_t size) override { Count += size; } + + uint64_t current_pos() const override { return Count; } + + public: + raw_counting_ostream() : Count(0) {} + ~raw_counting_ostream() { flush(); } + }; + + raw_counting_ostream OutStream; Emit_impl(OutStream, ObjWriter, Layout); return OutStream.tell(); } diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h index e7d5bbd5be67..12a7f0ee7bb5 100644 --- a/include/llvm/MC/MCMachObjectWriter.h +++ b/include/llvm/MC/MCMachObjectWriter.h @@ -111,6 +111,8 @@ class MachObjectWriter : public MCObjectWriter { /// @} + MachSymbolData *findSymbolData(const MCSymbol &Sym); + public: MachObjectWriter(MCMachObjectTargetWriter *MOTW, raw_ostream &_OS, bool _IsLittleEndian) diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 58f37d437c86..1a56040e4667 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -122,7 +122,6 @@ class MCObjectFileInfo { /// These are used for the Fission separate debug information files. 
const MCSection *DwarfInfoDWOSection; - const MCSection *DwarfTypesDWOSection; const MCSection *DwarfAbbrevDWOSection; const MCSection *DwarfStrDWOSection; const MCSection *DwarfLineDWOSection; @@ -271,9 +270,7 @@ class MCObjectFileInfo { return DwarfInfoDWOSection; } const MCSection *getDwarfTypesSection(uint64_t Hash) const; - const MCSection *getDwarfTypesDWOSection() const { - return DwarfTypesDWOSection; - } + const MCSection *getDwarfTypesDWOSection(uint64_t Hash) const; const MCSection *getDwarfAbbrevDWOSection() const { return DwarfAbbrevDWOSection; } diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h index f751786a8eba..9836795450ff 100644 --- a/include/llvm/MC/MCParser/MCAsmParser.h +++ b/include/llvm/MC/MCParser/MCAsmParser.h @@ -30,23 +30,24 @@ class SMRange; class SourceMgr; class Twine; +class InlineAsmIdentifierInfo { +public: + void *OpDecl; + bool IsVarDecl; + unsigned Length, Size, Type; + + void clear() { + OpDecl = nullptr; + IsVarDecl = false; + Length = 1; + Size = 0; + Type = 0; + } +}; + /// MCAsmParserSemaCallback - Generic Sema callback for assembly parser. class MCAsmParserSemaCallback { public: - typedef struct { - void *OpDecl; - bool IsVarDecl; - unsigned Length, Size, Type; - - void clear() { - OpDecl = nullptr; - IsVarDecl = false; - Length = 1; - Size = 0; - Type = 0; - } - } InlineAsmIdentifierInfo; - virtual ~MCAsmParserSemaCallback(); virtual void *LookupInlineAsmIdentifier(StringRef &LineBuf, InlineAsmIdentifierInfo &Info, @@ -56,9 +57,6 @@ class MCAsmParserSemaCallback { unsigned &Offset) = 0; }; -typedef MCAsmParserSemaCallback::InlineAsmIdentifierInfo - InlineAsmIdentifierInfo; - /// MCAsmParser - Generic assembler parser interface, for use by target specific /// assembly parsers. class MCAsmParser { diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h index a428f9efb2b5..d205e2aebfb0 100644 --- a/include/llvm/MC/MCSectionCOFF.h +++ b/include/llvm/MC/MCSectionCOFF.h @@ -42,24 +42,15 @@ class MCSymbol; /// it is a COMDAT section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0 mutable int Selection; - /// Assoc - This is name of the associated section, if it is a COMDAT - /// section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0 with an - /// associative Selection (IMAGE_COMDAT_SELECT_ASSOCIATIVE). 
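With the associated-section parameter gone, a COMDAT section is requested through the simplified MCContext::getCOFFSection overload alone; a short sketch with made-up section and symbol names.

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/COFF.h"

using namespace llvm;

// Ctx is an MCContext configured for a COFF target (assumed).
static const MCSectionCOFF *getFuncTextSection(MCContext &Ctx) {
  const unsigned Characteristics =
      COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE |
      COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_LNK_COMDAT;
  // No associated-section argument any more; the selection value alone
  // describes the COMDAT.
  return Ctx.getCOFFSection(".text$my_func", Characteristics,
                            SectionKind::getText(), "my_func",
                            COFF::IMAGE_COMDAT_SELECT_ANY);
}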
- mutable const MCSectionCOFF *Assoc; - private: friend class MCContext; MCSectionCOFF(StringRef Section, unsigned Characteristics, - const MCSymbol *COMDATSymbol, int Selection, - const MCSectionCOFF *Assoc, SectionKind K) + const MCSymbol *COMDATSymbol, int Selection, SectionKind K) : MCSection(SV_COFF, K), SectionName(Section), Characteristics(Characteristics), COMDATSymbol(COMDATSymbol), - Selection(Selection), Assoc(Assoc) { + Selection(Selection) { assert ((Characteristics & 0x00F00000) == 0 && "alignment must not be set upon section creation"); - assert ((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) == - (Assoc != nullptr) && - "associative COMDAT section must have an associated section"); } ~MCSectionCOFF(); @@ -76,11 +67,10 @@ class MCSymbol; return SectionName.str() + "_end"; } unsigned getCharacteristics() const { return Characteristics; } + const MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; } int getSelection() const { return Selection; } - const MCSectionCOFF *getAssocSection() const { return Assoc; } - void setSelection(int Selection, - const MCSectionCOFF *Assoc = nullptr) const; + void setSelection(int Selection) const; void PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS, const MCExpr *Subsection) const override; diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h index 18ef6c23f304..384cc1b880ca 100644 --- a/include/llvm/MC/MCTargetAsmParser.h +++ b/include/llvm/MC/MCTargetAsmParser.h @@ -14,6 +14,8 @@ #include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCTargetOptions.h" +#include + namespace llvm { class AsmToken; class MCInst; @@ -23,6 +25,8 @@ class SMLoc; class StringRef; template class SmallVectorImpl; +typedef SmallVectorImpl> OperandVector; + enum AsmRewriteKind { AOK_Delete = 0, // Rewrite should be ignored. AOK_Align, // Rewrite align as .align. @@ -131,8 +135,7 @@ class MCTargetAsmParser : public MCAsmParserExtension { /// ownership of them to the caller. /// \return True on failure. virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands) = 0; + SMLoc NameLoc, OperandVector &Operands) = 0; /// ParseDirective - Parse a target specific assembler directive /// @@ -156,17 +159,16 @@ class MCTargetAsmParser : public MCAsmParserExtension { /// /// On failure, the target parser is responsible for emitting a diagnostic /// explaining the match failure. - virtual bool - MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) = 0; + virtual bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) = 0; /// Allow a target to add special case operand matching for things that /// tblgen doesn't/can't handle effectively. For example, literal /// immediates on ARM. TableGen expects a token operand, but the parser /// will recognize them as immediates. 
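// Hedged sketch for the OperandVector typedef introduced above: the parser
// hooks now traffic in a SmallVectorImpl of std::unique_ptr<MCParsedAsmOperand>,
// so the vector owns the parsed operands.  The helper below is hypothetical
// and only shows the calling convention; the include set is indicative.
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

static void dumpParsedOperands(const llvm::OperandVector &Operands) {
  for (const std::unique_ptr<llvm::MCParsedAsmOperand> &Op : Operands)
    Op->print(llvm::errs()); // each target-specific operand implements print()
}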
- virtual unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + virtual unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) { return Match_InvalidOperand; } @@ -178,7 +180,7 @@ class MCTargetAsmParser : public MCAsmParserExtension { } virtual void convertToMapAndConstraints(unsigned Kind, - const SmallVectorImpl &Operands) = 0; + const OperandVector &Operands) = 0; virtual const MCExpr *applyModifierToExpr(const MCExpr *E, MCSymbolRefExpr::VariantKind, diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h index ad34958ae380..80cc8befb7a7 100644 --- a/include/llvm/MC/MCTargetOptions.h +++ b/include/llvm/MC/MCTargetOptions.h @@ -22,12 +22,13 @@ class MCTargetOptions { /// Enables AddressSanitizer instrumentation at machine level. bool SanitizeAddress : 1; - unsigned MCRelaxAll : 1; - unsigned MCNoExecStack : 1; - unsigned MCSaveTempLabels : 1; - unsigned MCUseDwarfDirectory : 1; - unsigned ShowMCEncoding : 1; - unsigned ShowMCInst : 1; + bool MCRelaxAll : 1; + bool MCNoExecStack : 1; + bool MCSaveTempLabels : 1; + bool MCUseDwarfDirectory : 1; + bool ShowMCEncoding : 1; + bool ShowMCInst : 1; + bool AsmVerbose : 1; MCTargetOptions(); }; @@ -39,7 +40,8 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { ARE_EQUAL(MCSaveTempLabels) && ARE_EQUAL(MCUseDwarfDirectory) && ARE_EQUAL(ShowMCEncoding) && - ARE_EQUAL(ShowMCInst)); + ARE_EQUAL(ShowMCInst) && + ARE_EQUAL(AsmVerbose)); #undef ARE_EQUAL } diff --git a/include/llvm/MC/MCTargetOptionsCommandFlags.h b/include/llvm/MC/MCTargetOptionsCommandFlags.h index 1edf8f75b139..17a117a2a3bd 100644 --- a/include/llvm/MC/MCTargetOptionsCommandFlags.h +++ b/include/llvm/MC/MCTargetOptionsCommandFlags.h @@ -33,30 +33,11 @@ cl::opt RelaxAll("mc-relax-all", cl::desc("When used with filetype=obj, " "relax all fixups in the emitted object file")); -cl::opt EnableDwarfDirectory( - "enable-dwarf-directory", cl::Hidden, - cl::desc("Use .file directives with an explicit directory.")); - -cl::opt NoExecStack("mc-no-exec-stack", - cl::desc("File doesn't need an exec stack")); - -cl::opt SaveTempLabels("L", cl::desc("Don't discard temporary labels")); - -cl::opt ShowMCEncoding("show-mc-encoding", cl::Hidden, - cl::desc("Show encoding in .s output")); -cl::opt ShowMCInst("show-mc-inst", cl::Hidden, - cl::desc("Show instruction structure in .s output")); - static inline MCTargetOptions InitMCTargetOptionsFromFlags() { MCTargetOptions Options; Options.SanitizeAddress = (AsmInstrumentation == MCTargetOptions::AsmInstrumentationAddress); Options.MCRelaxAll = RelaxAll; - Options.MCUseDwarfDirectory = EnableDwarfDirectory; - Options.MCNoExecStack = NoExecStack; - Options.MCSaveTempLabels = SaveTempLabels; - Options.ShowMCEncoding = ShowMCEncoding; - Options.ShowMCInst = ShowMCInst; return Options; } diff --git a/include/llvm/MC/MCWinCOFFObjectWriter.h b/include/llvm/MC/MCWinCOFFObjectWriter.h index 213481c9090c..dad7bb597039 100644 --- a/include/llvm/MC/MCWinCOFFObjectWriter.h +++ b/include/llvm/MC/MCWinCOFFObjectWriter.h @@ -30,6 +30,7 @@ namespace llvm { virtual unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection) const = 0; + virtual bool recordRelocation(const MCFixup &) const { return true; } }; /// \brief Construct a new Win COFF writer instance. 
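A minimal usage sketch for the MCTargetOptions changes above, assuming the default constructor clears all flags as it does in tree; the function below is not part of the patch.

#include "llvm/MC/MCTargetOptions.h"
#include <cassert>

static void exampleMCTargetOptions() {
  llvm::MCTargetOptions A, B; // default-constructed: all flags cleared
  A.AsmVerbose = true;        // flag added by this patch
  assert(!(A == B));          // AsmVerbose now participates in operator==
  B.AsmVerbose = true;
  assert(A == B);
}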
diff --git a/include/llvm/MC/MCWinCOFFStreamer.h b/include/llvm/MC/MCWinCOFFStreamer.h index b0a27cdbd75b..34e39bb0a636 100644 --- a/include/llvm/MC/MCWinCOFFStreamer.h +++ b/include/llvm/MC/MCWinCOFFStreamer.h @@ -65,6 +65,9 @@ class MCWinCOFFStreamer : public MCObjectStreamer { protected: const MCSymbol *CurSymbol; void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) override; + +private: + LLVM_ATTRIBUTE_NORETURN void FatalError(const Twine &Msg) const; }; } diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index 652b6597bebf..4fe44a7769ae 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -72,7 +72,7 @@ class Archive : public Binary { Child getNext() const; - error_code getName(StringRef &Result) const; + ErrorOr getName() const; StringRef getRawName() const { return getHeader()->getName(); } sys::TimeValue getLastModified() const { return getHeader()->getLastModified(); @@ -89,11 +89,11 @@ class Archive : public Binary { return StringRef(Data.data() + StartOfFile, getSize()); } - error_code getMemoryBuffer(std::unique_ptr &Result, - bool FullPath = false) const; + ErrorOr> + getMemoryBuffer(bool FullPath = false) const; - error_code getAsBinary(std::unique_ptr &Result, - LLVMContext *Context = nullptr) const; + ErrorOr> + getAsBinary(LLVMContext *Context = nullptr) const; }; class child_iterator { @@ -137,8 +137,8 @@ class Archive : public Binary { : Parent(p) , SymbolIndex(symi) , StringIndex(stri) {} - error_code getName(StringRef &Result) const; - error_code getMember(child_iterator &Result) const; + StringRef getName() const; + ErrorOr getMember() const; Symbol getNext() const; }; @@ -164,7 +164,7 @@ class Archive : public Binary { } }; - Archive(MemoryBuffer *source, error_code &ec); + Archive(MemoryBuffer *source, std::error_code &ec); static ErrorOr create(MemoryBuffer *Source); enum Kind { diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index bd9c67740dff..f0f2793a928c 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -353,65 +353,75 @@ class COFFObjectFile : public ObjectFile { uint32_t NumberOfImportDirectory; const export_directory_table_entry *ExportDirectory; - error_code getString(uint32_t offset, StringRef &Res) const; + std::error_code getString(uint32_t offset, StringRef &Res) const; const coff_symbol *toSymb(DataRefImpl Symb) const; const coff_section *toSec(DataRefImpl Sec) const; const coff_relocation *toRel(DataRefImpl Rel) const; - error_code initSymbolTablePtr(); - error_code initImportTablePtr(); - error_code initExportTablePtr(); + std::error_code initSymbolTablePtr(); + std::error_code initImportTablePtr(); + std::error_code initExportTablePtr(); protected: void moveSymbolNext(DataRefImpl &Symb) const override; - error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const override; - error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const override; - error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; + std::error_code getSymbolName(DataRefImpl Symb, + StringRef &Res) const override; + std::error_code getSymbolAddress(DataRefImpl Symb, + uint64_t &Res) const override; + std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; uint32_t getSymbolFlags(DataRefImpl Symb) const override; - error_code getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Res) const override; - error_code getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const override; + std::error_code 
getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Res) const override; + std::error_code getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const override; void moveSectionNext(DataRefImpl &Sec) const override; - error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; - error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const override; - error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override; - error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; - error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const override; - error_code isSectionText(DataRefImpl Sec, bool &Res) const override; - error_code isSectionData(DataRefImpl Sec, bool &Res) const override; - error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override; - error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override; - error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override; - error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const override; - error_code isSectionRequiredForExecution(DataRefImpl Sec, - bool &Res) const override; - error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, - bool &Result) const override; + std::error_code getSectionName(DataRefImpl Sec, + StringRef &Res) const override; + std::error_code getSectionAddress(DataRefImpl Sec, + uint64_t &Res) const override; + std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override; + std::error_code getSectionContents(DataRefImpl Sec, + StringRef &Res) const override; + std::error_code getSectionAlignment(DataRefImpl Sec, + uint64_t &Res) const override; + std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionReadOnlyData(DataRefImpl Sec, + bool &Res) const override; + std::error_code isSectionRequiredForExecution(DataRefImpl Sec, + bool &Res) const override; + std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, + bool &Result) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; - error_code getRelocationAddress(DataRefImpl Rel, - uint64_t &Res) const override; - error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const override; + std::error_code getRelocationAddress(DataRefImpl Rel, + uint64_t &Res) const override; + std::error_code getRelocationOffset(DataRefImpl Rel, + uint64_t &Res) const override; symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override; - error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const override; - error_code + std::error_code getRelocationType(DataRefImpl Rel, + uint64_t &Res) const override; + std::error_code getRelocationTypeName(DataRefImpl Rel, SmallVectorImpl &Result) const override; - error_code + std::error_code getRelocationValueString(DataRefImpl Rel, SmallVectorImpl &Result) const override; - error_code getLibraryNext(DataRefImpl LibData, - LibraryRef &Result) const override; - error_code getLibraryPath(DataRefImpl LibData, - StringRef &Result) const override; + std::error_code 
getLibraryNext(DataRefImpl LibData, + LibraryRef &Result) const override; + std::error_code getLibraryPath(DataRefImpl LibData, + StringRef &Result) const override; public: - COFFObjectFile(MemoryBuffer *Object, error_code &EC, bool BufferOwned = true); + COFFObjectFile(MemoryBuffer *Object, std::error_code &EC, + bool BufferOwned = true); basic_symbol_iterator symbol_begin_impl() const override; basic_symbol_iterator symbol_end_impl() const override; library_iterator needed_library_begin() const override; @@ -433,30 +443,33 @@ class COFFObjectFile : public ObjectFile { export_directory_iterator export_directory_begin() const; export_directory_iterator export_directory_end() const; - error_code getHeader(const coff_file_header *&Res) const; - error_code getCOFFHeader(const coff_file_header *&Res) const; - error_code getPE32Header(const pe32_header *&Res) const; - error_code getPE32PlusHeader(const pe32plus_header *&Res) const; - error_code getDataDirectory(uint32_t index, const data_directory *&Res) const; - error_code getSection(int32_t index, const coff_section *&Res) const; - error_code getSymbol(uint32_t index, const coff_symbol *&Res) const; + std::error_code getHeader(const coff_file_header *&Res) const; + std::error_code getCOFFHeader(const coff_file_header *&Res) const; + std::error_code getPE32Header(const pe32_header *&Res) const; + std::error_code getPE32PlusHeader(const pe32plus_header *&Res) const; + std::error_code getDataDirectory(uint32_t index, + const data_directory *&Res) const; + std::error_code getSection(int32_t index, const coff_section *&Res) const; + std::error_code getSymbol(uint32_t index, const coff_symbol *&Res) const; template - error_code getAuxSymbol(uint32_t index, const T *&Res) const { + std::error_code getAuxSymbol(uint32_t index, const T *&Res) const { const coff_symbol *s; - error_code ec = getSymbol(index, s); + std::error_code ec = getSymbol(index, s); Res = reinterpret_cast(s); return ec; } - error_code getSymbolName(const coff_symbol *symbol, StringRef &Res) const; + std::error_code getSymbolName(const coff_symbol *symbol, + StringRef &Res) const; ArrayRef getSymbolAuxData(const coff_symbol *symbol) const; - error_code getSectionName(const coff_section *Sec, StringRef &Res) const; - error_code getSectionContents(const coff_section *Sec, - ArrayRef &Res) const; + std::error_code getSectionName(const coff_section *Sec, StringRef &Res) const; + std::error_code getSectionContents(const coff_section *Sec, + ArrayRef &Res) const; - error_code getVaPtr(uint64_t VA, uintptr_t &Res) const; - error_code getRvaPtr(uint32_t Rva, uintptr_t &Res) const; - error_code getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const; + std::error_code getVaPtr(uint64_t VA, uintptr_t &Res) const; + std::error_code getRvaPtr(uint32_t Rva, uintptr_t &Res) const; + std::error_code getHintName(uint32_t Rva, uint16_t &Hint, + StringRef &Name) const; static inline bool classof(const Binary *v) { return v->isCOFF(); } }; @@ -471,12 +484,12 @@ class ImportDirectoryEntryRef { bool operator==(const ImportDirectoryEntryRef &Other) const; void moveNext(); - error_code getName(StringRef &Result) const; + std::error_code getName(StringRef &Result) const; - error_code + std::error_code getImportTableEntry(const import_directory_table_entry *&Result) const; - error_code + std::error_code getImportLookupEntry(const import_lookup_table_entry32 *&Result) const; private: @@ -496,11 +509,11 @@ class ExportDirectoryEntryRef { bool operator==(const ExportDirectoryEntryRef &Other) const; 
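// Hedged caller-side sketch for the std::error_code migration in COFF.h
// above; only the error type changes, the check-and-bail pattern does not.
// The helper name is ours, not from the patch.
#include "llvm/Object/COFF.h"
#include <system_error>

static unsigned numberOfCOFFSections(const llvm::object::COFFObjectFile &Obj) {
  const llvm::object::coff_file_header *Header = nullptr;
  if (std::error_code EC = Obj.getCOFFHeader(Header))
    return 0; // real code would report EC.message()
  return Header->NumberOfSections;
}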
void moveNext(); - error_code getDllName(StringRef &Result) const; - error_code getOrdinalBase(uint32_t &Result) const; - error_code getOrdinal(uint32_t &Result) const; - error_code getExportRVA(uint32_t &Result) const; - error_code getSymbolName(StringRef &Result) const; + std::error_code getDllName(StringRef &Result) const; + std::error_code getOrdinalBase(uint32_t &Result) const; + std::error_code getOrdinal(uint32_t &Result) const; + std::error_code getExportRVA(uint32_t &Result) const; + std::error_code getSymbolName(StringRef &Result) const; private: const export_directory_table_entry *ExportTable; diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index ee97d4e05d69..716b3b93c0ea 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -133,6 +133,7 @@ class ELFFile { typedef Elf_Vernaux_Impl Elf_Vernaux; typedef Elf_Versym_Impl Elf_Versym; typedef ELFEntityIterator Elf_Dyn_Iter; + typedef iterator_range Elf_Dyn_Range; typedef ELFEntityIterator Elf_Rela_Iter; typedef ELFEntityIterator Elf_Rel_Iter; typedef ELFEntityIterator Elf_Shdr_Iter; @@ -316,7 +317,7 @@ class ELFFile { std::pair getRelocationSymbol(const Elf_Shdr *RelSec, const RelT *Rel) const; - ELFFile(MemoryBuffer *Object, error_code &ec); + ELFFile(MemoryBuffer *Object, std::error_code &ec); bool isMipsELF64() const { return Header->e_machine == ELF::EM_MIPS && @@ -342,6 +343,9 @@ class ELFFile { /// \param NULLEnd use one past the first DT_NULL entry as the end instead of /// the section size. Elf_Dyn_Iter end_dynamic_table(bool NULLEnd = false) const; + Elf_Dyn_Range dynamic_table(bool NULLEnd = false) const { + return make_range(begin_dynamic_table(), end_dynamic_table(NULLEnd)); + } Elf_Sym_Iter begin_dynamic_symbols() const { if (DynSymRegion.Addr) @@ -617,16 +621,11 @@ typename ELFFile::uintX_t ELFFile::getStringTableIndex() const { } template -ELFFile::ELFFile(MemoryBuffer *Object, error_code &ec) - : Buf(Object), - SectionHeaderTable(nullptr), - dot_shstrtab_sec(nullptr), - dot_strtab_sec(nullptr), - dot_symtab_sec(nullptr), - SymbolTableSectionHeaderIndex(nullptr), - dot_gnu_version_sec(nullptr), - dot_gnu_version_r_sec(nullptr), - dot_gnu_version_d_sec(nullptr), +ELFFile::ELFFile(MemoryBuffer *Object, std::error_code &ec) + : Buf(Object), SectionHeaderTable(nullptr), dot_shstrtab_sec(nullptr), + dot_strtab_sec(nullptr), dot_symtab_sec(nullptr), + SymbolTableSectionHeaderIndex(nullptr), dot_gnu_version_sec(nullptr), + dot_gnu_version_r_sec(nullptr), dot_gnu_version_d_sec(nullptr), dt_soname(nullptr) { const uint64_t FileSize = Buf->getBufferSize(); @@ -744,7 +743,7 @@ ELFFile::ELFFile(MemoryBuffer *Object, error_code &ec) } } - ec = error_code::success(); + ec = std::error_code(); } // Get the symbol table index in the symtab section given a symbol @@ -823,17 +822,13 @@ ELFFile::end_dynamic_table(bool NULLEnd) const { template StringRef ELFFile::getLoadName() const { if (!dt_soname) { + dt_soname = ""; // Find the DT_SONAME entry - Elf_Dyn_Iter it = begin_dynamic_table(); - Elf_Dyn_Iter ie = end_dynamic_table(); - while (it != ie && it->getTag() != ELF::DT_SONAME) - ++it; - - if (it != ie) { - dt_soname = getDynamicString(it->getVal()); - } else { - dt_soname = ""; - } + for (const auto &Entry : dynamic_table()) + if (Entry.getTag() == ELF::DT_SONAME) { + dt_soname = getDynamicString(Entry.getVal()); + break; + } } return dt_soname; } diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index 302cabaf8ead..876206b38a1e 100644 --- 
a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -57,50 +57,63 @@ class ELFObjectFile : public ObjectFile { ELFFile EF; void moveSymbolNext(DataRefImpl &Symb) const override; - error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const override; - error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const override; - error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const override; - error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; + std::error_code getSymbolName(DataRefImpl Symb, + StringRef &Res) const override; + std::error_code getSymbolAddress(DataRefImpl Symb, + uint64_t &Res) const override; + std::error_code getSymbolAlignment(DataRefImpl Symb, + uint32_t &Res) const override; + std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; uint32_t getSymbolFlags(DataRefImpl Symb) const override; - error_code getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Res) const override; - error_code getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const override; + std::error_code getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Res) const override; + std::error_code getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const override; - error_code getLibraryNext(DataRefImpl Data, - LibraryRef &Result) const override; - error_code getLibraryPath(DataRefImpl Data, StringRef &Res) const override; + std::error_code getLibraryNext(DataRefImpl Data, + LibraryRef &Result) const override; + std::error_code getLibraryPath(DataRefImpl Data, + StringRef &Res) const override; void moveSectionNext(DataRefImpl &Sec) const override; - error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; - error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const override; - error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override; - error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; - error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const override; - error_code isSectionText(DataRefImpl Sec, bool &Res) const override; - error_code isSectionData(DataRefImpl Sec, bool &Res) const override; - error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override; - error_code isSectionRequiredForExecution(DataRefImpl Sec, - bool &Res) const override; - error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override; - error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override; - error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const override; - error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, - bool &Result) const override; + std::error_code getSectionName(DataRefImpl Sec, + StringRef &Res) const override; + std::error_code getSectionAddress(DataRefImpl Sec, + uint64_t &Res) const override; + std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override; + std::error_code getSectionContents(DataRefImpl Sec, + StringRef &Res) const override; + std::error_code getSectionAlignment(DataRefImpl Sec, + uint64_t &Res) const override; + std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionRequiredForExecution(DataRefImpl Sec, + bool &Res) const override; + std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override; + std::error_code 
isSectionZeroInit(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionReadOnlyData(DataRefImpl Sec, + bool &Res) const override; + std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, + bool &Result) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; section_iterator getRelocatedSection(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; - error_code getRelocationAddress(DataRefImpl Rel, - uint64_t &Res) const override; - error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const override; + std::error_code getRelocationAddress(DataRefImpl Rel, + uint64_t &Res) const override; + std::error_code getRelocationOffset(DataRefImpl Rel, + uint64_t &Res) const override; symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override; - error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const override; - error_code getRelocationTypeName(DataRefImpl Rel, - SmallVectorImpl &Result) const override; - error_code getRelocationValueString(DataRefImpl Rel, - SmallVectorImpl &Result) const override; + std::error_code getRelocationType(DataRefImpl Rel, + uint64_t &Res) const override; + std::error_code + getRelocationTypeName(DataRefImpl Rel, + SmallVectorImpl &Result) const override; + std::error_code + getRelocationValueString(DataRefImpl Rel, + SmallVectorImpl &Result) const override; uint64_t getROffset(DataRefImpl Rel) const; StringRef getRelocationTypeName(uint32_t Type) const; @@ -164,7 +177,8 @@ class ELFObjectFile : public ObjectFile { bool isDyldELFObject; public: - ELFObjectFile(MemoryBuffer *Object, error_code &EC, bool BufferOwned = true); + ELFObjectFile(MemoryBuffer *Object, std::error_code &EC, + bool BufferOwned = true); const Elf_Sym *getSymbol(DataRefImpl Symb) const; @@ -180,10 +194,9 @@ class ELFObjectFile : public ObjectFile { library_iterator needed_library_begin() const override; library_iterator needed_library_end() const override; - error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const; - error_code getSymbolVersion(SymbolRef Symb, StringRef &Version, - bool &IsDefault) const; - + std::error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const; + std::error_code getSymbolVersion(SymbolRef Symb, StringRef &Version, + bool &IsDefault) const; uint8_t getBytesInAddress() const override; StringRef getFileFormatName() const override; @@ -212,8 +225,8 @@ void ELFObjectFile::moveSymbolNext(DataRefImpl &Symb) const { } template -error_code ELFObjectFile::getSymbolName(DataRefImpl Symb, - StringRef &Result) const { +std::error_code ELFObjectFile::getSymbolName(DataRefImpl Symb, + StringRef &Result) const { ErrorOr Name = EF.getSymbolName(toELFSymIter(Symb)); if (!Name) return Name.getError(); @@ -222,9 +235,9 @@ error_code ELFObjectFile::getSymbolName(DataRefImpl Symb, } template -error_code ELFObjectFile::getSymbolVersion(SymbolRef SymRef, - StringRef &Version, - bool &IsDefault) const { +std::error_code ELFObjectFile::getSymbolVersion(SymbolRef SymRef, + StringRef &Version, + bool &IsDefault) const { DataRefImpl Symb = SymRef.getRawDataRefImpl(); const Elf_Sym *symb = getSymbol(Symb); ErrorOr Ver = @@ -236,8 +249,8 @@ error_code ELFObjectFile::getSymbolVersion(SymbolRef SymRef, } template -error_code ELFObjectFile::getSymbolAddress(DataRefImpl Symb, - uint64_t &Result) const { +std::error_code ELFObjectFile::getSymbolAddress(DataRefImpl Symb, + uint64_t &Result) 
const { const Elf_Sym *ESym = getSymbol(Symb); switch (EF.getSymbolTableIndex(ESym)) { case ELF::SHN_COMMON: @@ -265,8 +278,8 @@ error_code ELFObjectFile::getSymbolAddress(DataRefImpl Symb, } template -error_code ELFObjectFile::getSymbolAlignment(DataRefImpl Symb, - uint32_t &Res) const { +std::error_code ELFObjectFile::getSymbolAlignment(DataRefImpl Symb, + uint32_t &Res) const { Elf_Sym_Iter Sym = toELFSymIter(Symb); if (Sym->st_shndx == ELF::SHN_COMMON) Res = Sym->st_value; @@ -276,15 +289,16 @@ error_code ELFObjectFile::getSymbolAlignment(DataRefImpl Symb, } template -error_code ELFObjectFile::getSymbolSize(DataRefImpl Symb, - uint64_t &Result) const { +std::error_code ELFObjectFile::getSymbolSize(DataRefImpl Symb, + uint64_t &Result) const { Result = toELFSymIter(Symb)->st_size; return object_error::success; } template -error_code ELFObjectFile::getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Result) const { +std::error_code +ELFObjectFile::getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Result) const { const Elf_Sym *ESym = getSymbol(Symb); switch (ESym->getType()) { @@ -343,8 +357,9 @@ uint32_t ELFObjectFile::getSymbolFlags(DataRefImpl Symb) const { } template -error_code ELFObjectFile::getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const { +std::error_code +ELFObjectFile::getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const { const Elf_Sym *ESym = getSymbol(Symb); const Elf_Shdr *ESec = EF.getSection(ESym); if (!ESec) @@ -363,8 +378,8 @@ void ELFObjectFile::moveSectionNext(DataRefImpl &Sec) const { } template -error_code ELFObjectFile::getSectionName(DataRefImpl Sec, - StringRef &Result) const { +std::error_code ELFObjectFile::getSectionName(DataRefImpl Sec, + StringRef &Result) const { ErrorOr Name = EF.getSectionName(&*toELFShdrIter(Sec)); if (!Name) return Name.getError(); @@ -373,44 +388,46 @@ error_code ELFObjectFile::getSectionName(DataRefImpl Sec, } template -error_code ELFObjectFile::getSectionAddress(DataRefImpl Sec, - uint64_t &Result) const { +std::error_code ELFObjectFile::getSectionAddress(DataRefImpl Sec, + uint64_t &Result) const { Result = toELFShdrIter(Sec)->sh_addr; return object_error::success; } template -error_code ELFObjectFile::getSectionSize(DataRefImpl Sec, - uint64_t &Result) const { +std::error_code ELFObjectFile::getSectionSize(DataRefImpl Sec, + uint64_t &Result) const { Result = toELFShdrIter(Sec)->sh_size; return object_error::success; } template -error_code ELFObjectFile::getSectionContents(DataRefImpl Sec, - StringRef &Result) const { +std::error_code +ELFObjectFile::getSectionContents(DataRefImpl Sec, + StringRef &Result) const { Elf_Shdr_Iter EShdr = toELFShdrIter(Sec); Result = StringRef((const char *)base() + EShdr->sh_offset, EShdr->sh_size); return object_error::success; } template -error_code ELFObjectFile::getSectionAlignment(DataRefImpl Sec, - uint64_t &Result) const { +std::error_code +ELFObjectFile::getSectionAlignment(DataRefImpl Sec, + uint64_t &Result) const { Result = toELFShdrIter(Sec)->sh_addralign; return object_error::success; } template -error_code ELFObjectFile::isSectionText(DataRefImpl Sec, - bool &Result) const { +std::error_code ELFObjectFile::isSectionText(DataRefImpl Sec, + bool &Result) const { Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_EXECINSTR; return object_error::success; } template -error_code ELFObjectFile::isSectionData(DataRefImpl Sec, - bool &Result) const { +std::error_code ELFObjectFile::isSectionData(DataRefImpl Sec, + bool &Result) const { Elf_Shdr_Iter EShdr = 
toELFShdrIter(Sec); Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) && EShdr->sh_type == ELF::SHT_PROGBITS; @@ -418,8 +435,8 @@ error_code ELFObjectFile::isSectionData(DataRefImpl Sec, } template -error_code ELFObjectFile::isSectionBSS(DataRefImpl Sec, - bool &Result) const { +std::error_code ELFObjectFile::isSectionBSS(DataRefImpl Sec, + bool &Result) const { Elf_Shdr_Iter EShdr = toELFShdrIter(Sec); Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) && EShdr->sh_type == ELF::SHT_NOBITS; @@ -427,7 +444,7 @@ error_code ELFObjectFile::isSectionBSS(DataRefImpl Sec, } template -error_code +std::error_code ELFObjectFile::isSectionRequiredForExecution(DataRefImpl Sec, bool &Result) const { Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_ALLOC; @@ -435,31 +452,31 @@ ELFObjectFile::isSectionRequiredForExecution(DataRefImpl Sec, } template -error_code ELFObjectFile::isSectionVirtual(DataRefImpl Sec, - bool &Result) const { +std::error_code ELFObjectFile::isSectionVirtual(DataRefImpl Sec, + bool &Result) const { Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS; return object_error::success; } template -error_code ELFObjectFile::isSectionZeroInit(DataRefImpl Sec, - bool &Result) const { +std::error_code ELFObjectFile::isSectionZeroInit(DataRefImpl Sec, + bool &Result) const { Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS; return object_error::success; } template -error_code ELFObjectFile::isSectionReadOnlyData(DataRefImpl Sec, - bool &Result) const { +std::error_code ELFObjectFile::isSectionReadOnlyData(DataRefImpl Sec, + bool &Result) const { Elf_Shdr_Iter EShdr = toELFShdrIter(Sec); Result = !(EShdr->sh_flags & (ELF::SHF_WRITE | ELF::SHF_EXECINSTR)); return object_error::success; } template -error_code ELFObjectFile::sectionContainsSymbol(DataRefImpl Sec, - DataRefImpl Symb, - bool &Result) const { +std::error_code ELFObjectFile::sectionContainsSymbol(DataRefImpl Sec, + DataRefImpl Symb, + bool &Result) const { Elf_Sym_Iter ESym = toELFSymIter(Symb); uintX_t Index = ESym->st_shndx; @@ -553,8 +570,9 @@ ELFObjectFile::getRelocationSymbol(DataRefImpl Rel) const { } template -error_code ELFObjectFile::getRelocationAddress(DataRefImpl Rel, - uint64_t &Result) const { +std::error_code +ELFObjectFile::getRelocationAddress(DataRefImpl Rel, + uint64_t &Result) const { uint64_t ROffset = getROffset(Rel); const Elf_Ehdr *Header = EF.getHeader(); @@ -570,8 +588,9 @@ error_code ELFObjectFile::getRelocationAddress(DataRefImpl Rel, } template -error_code ELFObjectFile::getRelocationOffset(DataRefImpl Rel, - uint64_t &Result) const { +std::error_code +ELFObjectFile::getRelocationOffset(DataRefImpl Rel, + uint64_t &Result) const { assert(EF.getHeader()->e_type == ELF::ET_REL && "Only relocatable object files have relocation offsets"); Result = getROffset(Rel); @@ -592,8 +611,8 @@ uint64_t ELFObjectFile::getROffset(DataRefImpl Rel) const { } template -error_code ELFObjectFile::getRelocationType(DataRefImpl Rel, - uint64_t &Result) const { +std::error_code ELFObjectFile::getRelocationType(DataRefImpl Rel, + uint64_t &Result) const { const Elf_Shdr *sec = getRelSection(Rel); switch (sec->sh_type) { default: @@ -616,7 +635,7 @@ StringRef ELFObjectFile::getRelocationTypeName(uint32_t Type) const { } template -error_code ELFObjectFile::getRelocationTypeName( +std::error_code ELFObjectFile::getRelocationTypeName( DataRefImpl Rel, SmallVectorImpl &Result) const { const Elf_Shdr *sec = getRelSection(Rel); uint32_t type; @@ -638,8 +657,9 @@ error_code 
ELFObjectFile::getRelocationTypeName( } template -error_code ELFObjectFile::getRelocationAddend(DataRefImpl Rel, - int64_t &Result) const { +std::error_code +ELFObjectFile::getRelocationAddend(DataRefImpl Rel, + int64_t &Result) const { const Elf_Shdr *sec = getRelSection(Rel); switch (sec->sh_type) { default: @@ -656,7 +676,7 @@ error_code ELFObjectFile::getRelocationAddend(DataRefImpl Rel, } template -error_code ELFObjectFile::getRelocationValueString( +std::error_code ELFObjectFile::getRelocationValueString( DataRefImpl Rel, SmallVectorImpl &Result) const { const Elf_Shdr *sec = getRelSection(Rel); uint8_t type; @@ -754,7 +774,7 @@ ELFObjectFile::getRela(DataRefImpl Rela) const { } template -ELFObjectFile::ELFObjectFile(MemoryBuffer *Object, error_code &ec, +ELFObjectFile::ELFObjectFile(MemoryBuffer *Object, std::error_code &ec, bool BufferOwned) : ObjectFile(getELFType(static_cast(ELFT::TargetEndianness) == support::little, @@ -817,8 +837,8 @@ library_iterator ELFObjectFile::needed_library_begin() const { } template -error_code ELFObjectFile::getLibraryNext(DataRefImpl Data, - LibraryRef &Result) const { +std::error_code ELFObjectFile::getLibraryNext(DataRefImpl Data, + LibraryRef &Result) const { Elf_Dyn_Iter DI = toELFDynIter(Data); Elf_Dyn_Iter DE = EF.end_dynamic_table(); @@ -832,8 +852,8 @@ error_code ELFObjectFile::getLibraryNext(DataRefImpl Data, } template -error_code ELFObjectFile::getLibraryPath(DataRefImpl Data, - StringRef &Res) const { +std::error_code ELFObjectFile::getLibraryPath(DataRefImpl Data, + StringRef &Res) const { Res = EF.getDynamicString(toELFDynIter(Data)->getVal()); return object_error::success; } @@ -931,8 +951,8 @@ unsigned ELFObjectFile::getArch() const { /// FIXME: Maybe we should have a base ElfObjectFile that is not a template /// and make these member functions? -inline error_code getELFRelocationAddend(const RelocationRef R, - int64_t &Addend) { +inline std::error_code getELFRelocationAddend(const RelocationRef R, + int64_t &Addend) { const ObjectFile *Obj = R.getObjectFile(); DataRefImpl DRI = R.getRawDataRefImpl(); // Little-endian 32-bit @@ -975,9 +995,10 @@ getELFDynamicSymbolIterators(SymbolicFile *Obj) { /// This is a generic interface for retrieving GNU symbol version /// information from an ELFObjectFile. -inline error_code GetELFSymbolVersion(const ObjectFile *Obj, - const SymbolRef &Sym, StringRef &Version, - bool &IsDefault) { +inline std::error_code GetELFSymbolVersion(const ObjectFile *Obj, + const SymbolRef &Sym, + StringRef &Version, + bool &IsDefault) { // Little-endian 32-bit if (const ELF32LEObjectFile *ELFObj = dyn_cast(Obj)) return ELFObj->getSymbolVersion(Sym, Version, IsDefault); diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h index 524e55b07e1c..42eeb0ef752c 100644 --- a/include/llvm/Object/ELFYAML.h +++ b/include/llvm/Object/ELFYAML.h @@ -44,6 +44,7 @@ LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_REL) // Just use 64, since it can hold 32-bit values too. LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_SHF) LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STT) +LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STV) // For now, hardcode 64 bits everywhere that 32 or 64 would be needed // since 64-bit can hold 32-bit values too. 
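A hedged caller-side sketch for the ELF relocation helpers above after the std::error_code migration; the wrapper function below is hypothetical.

#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <system_error>

static int64_t addendOrZero(const llvm::object::RelocationRef &R) {
  int64_t Addend = 0;
  if (std::error_code EC = llvm::object::getELFRelocationAddend(R, Addend)) {
    llvm::errs() << "no addend: " << EC.message() << "\n";
    return 0;
  }
  return Addend;
}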
@@ -62,6 +63,7 @@ struct Symbol { StringRef Section; llvm::yaml::Hex64 Value; llvm::yaml::Hex64 Size; + ELF_STV Visibility; }; struct LocalGlobalWeakSymbols { std::vector Local; @@ -76,7 +78,6 @@ struct Section { ELF_SHF Flags; llvm::yaml::Hex64 Address; StringRef Link; - StringRef Info; llvm::yaml::Hex64 AddressAlign; Section(SectionKind Kind) : Kind(Kind) {} virtual ~Section(); @@ -96,6 +97,7 @@ struct Relocation { StringRef Symbol; }; struct RelocationSection : Section { + StringRef Info; std::vector Relocations; RelocationSection() : Section(SectionKind::Relocation) {} static bool classof(const Section *S) { @@ -167,6 +169,11 @@ struct ScalarEnumerationTraits { static void enumeration(IO &IO, ELFYAML::ELF_STT &Value); }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, ELFYAML::ELF_STV &Value); +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, ELFYAML::ELF_REL &Value); diff --git a/include/llvm/Object/Error.h b/include/llvm/Object/Error.h index 779c747461a8..701da1272cd5 100644 --- a/include/llvm/Object/Error.h +++ b/include/llvm/Object/Error.h @@ -14,38 +14,32 @@ #ifndef LLVM_OBJECT_ERROR_H #define LLVM_OBJECT_ERROR_H -#include "llvm/Support/system_error.h" +#include namespace llvm { namespace object { -const error_category &object_category(); +const std::error_category &object_category(); -struct object_error { - enum Impl { - success = 0, - arch_not_found, - invalid_file_type, - parse_failed, - unexpected_eof - }; - Impl V; - - object_error(Impl V) : V(V) {} - operator Impl() const { return V; } +enum class object_error { + success = 0, + arch_not_found, + invalid_file_type, + parse_failed, + unexpected_eof }; -inline error_code make_error_code(object_error e) { - return error_code(static_cast(e), object_category()); +inline std::error_code make_error_code(object_error e) { + return std::error_code(static_cast(e), object_category()); } } // end namespace object. -template <> struct is_error_code_enum : std::true_type {}; +} // end namespace llvm. +namespace std { template <> -struct is_error_code_enum : std::true_type {}; - -} // end namespace llvm. 
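// Hedged sketch for the Object/Error.h rewrite above: object_error is now
// an enum class hooked into std::error_code via std::is_error_code_enum,
// so errors are built and tested with the standard machinery.  The helper
// names below are illustrative only.
#include "llvm/Object/Error.h"
#include <system_error>

static std::error_code exampleParseFailure() {
  return llvm::object::make_error_code(
      llvm::object::object_error::parse_failed);
}

static bool succeeded(std::error_code EC) {
  return !EC; // object_error::success maps to the zero error value
}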
+struct is_error_code_enum : std::true_type {}; +} #endif diff --git a/include/llvm/Object/IRObjectFile.h b/include/llvm/Object/IRObjectFile.h index 78f5b2b64f23..c87fe1519acc 100644 --- a/include/llvm/Object/IRObjectFile.h +++ b/include/llvm/Object/IRObjectFile.h @@ -27,10 +27,11 @@ class IRObjectFile : public SymbolicFile { std::unique_ptr Mang; public: - IRObjectFile(MemoryBuffer *Object, error_code &EC, LLVMContext &Context, + IRObjectFile(MemoryBuffer *Object, std::error_code &EC, LLVMContext &Context, bool BufferOwned); void moveSymbolNext(DataRefImpl &Symb) const override; - error_code printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override; + std::error_code printSymbolName(raw_ostream &OS, + DataRefImpl Symb) const override; uint32_t getSymbolFlags(DataRefImpl Symb) const override; const GlobalValue &getSymbolGV(DataRefImpl Symb) const; basic_symbol_iterator symbol_begin_impl() const override; diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index 710ad7ef1569..6e1ab253ec9b 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -40,9 +40,9 @@ class DiceRef { void moveNext(); - error_code getOffset(uint32_t &Result) const; - error_code getLength(uint16_t &Result) const; - error_code getKind(uint16_t &Result) const; + std::error_code getOffset(uint32_t &Result) const; + std::error_code getLength(uint16_t &Result) const; + std::error_code getKind(uint16_t &Result) const; DataRefImpl getRawDataRefImpl() const; const ObjectFile *getObjectFile() const; @@ -57,53 +57,74 @@ class MachOObjectFile : public ObjectFile { }; MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, bool Is64Bits, - error_code &EC, bool BufferOwned = true); + std::error_code &EC, bool BufferOwned = true); void moveSymbolNext(DataRefImpl &Symb) const override; - error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const override; - error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const override; - error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const override; - error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; - error_code getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Res) const override; + std::error_code getSymbolName(DataRefImpl Symb, + StringRef &Res) const override; + + // MachO specific. 
+ std::error_code getIndirectName(DataRefImpl Symb, StringRef &Res) const; + + std::error_code getSymbolAddress(DataRefImpl Symb, + uint64_t &Res) const override; + std::error_code getSymbolAlignment(DataRefImpl Symb, + uint32_t &Res) const override; + std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override; + std::error_code getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Res) const override; uint32_t getSymbolFlags(DataRefImpl Symb) const override; - error_code getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const override; + std::error_code getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const override; void moveSectionNext(DataRefImpl &Sec) const override; - error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; - error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const override; - error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override; - error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; - error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const override; - error_code isSectionText(DataRefImpl Sec, bool &Res) const override; - error_code isSectionData(DataRefImpl Sec, bool &Res) const override; - error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override; - error_code isSectionRequiredForExecution(DataRefImpl Sec, - bool &Res) const override; - error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override; - error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override; - error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const override; - error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, - bool &Result) const override; + std::error_code getSectionName(DataRefImpl Sec, + StringRef &Res) const override; + std::error_code getSectionAddress(DataRefImpl Sec, + uint64_t &Res) const override; + std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override; + std::error_code getSectionContents(DataRefImpl Sec, + StringRef &Res) const override; + std::error_code getSectionAlignment(DataRefImpl Sec, + uint64_t &Res) const override; + std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionRequiredForExecution(DataRefImpl Sec, + bool &Res) const override; + std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override; + std::error_code isSectionReadOnlyData(DataRefImpl Sec, + bool &Res) const override; + std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, + bool &Result) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; - error_code getRelocationAddress(DataRefImpl Rel, - uint64_t &Res) const override; - error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const override; + std::error_code getRelocationAddress(DataRefImpl Rel, + uint64_t &Res) const override; + std::error_code getRelocationOffset(DataRefImpl Rel, + uint64_t &Res) const override; symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override; - error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const override; - error_code 
getRelocationTypeName(DataRefImpl Rel, - SmallVectorImpl &Result) const override; - error_code getRelocationValueString(DataRefImpl Rel, - SmallVectorImpl &Result) const override; - error_code getRelocationHidden(DataRefImpl Rel, bool &Result) const override; + std::error_code getRelocationType(DataRefImpl Rel, + uint64_t &Res) const override; + std::error_code + getRelocationTypeName(DataRefImpl Rel, + SmallVectorImpl &Result) const override; + std::error_code + getRelocationValueString(DataRefImpl Rel, + SmallVectorImpl &Result) const override; + std::error_code getRelocationHidden(DataRefImpl Rel, + bool &Result) const override; + + std::error_code getLibraryNext(DataRefImpl LibData, + LibraryRef &Res) const override; + std::error_code getLibraryPath(DataRefImpl LibData, + StringRef &Res) const override; - error_code getLibraryNext(DataRefImpl LibData, - LibraryRef &Res) const override; - error_code getLibraryPath(DataRefImpl LibData, StringRef &Res) const override; + // MachO specific. + std::error_code getLibraryShortNameByIndex(unsigned Index, StringRef &Res); // TODO: Would be useful to have an iterator based version // of the load command interface too. @@ -198,6 +219,9 @@ class MachOObjectFile : public ObjectFile { bool is64Bit() const; void ReadULEB128s(uint64_t Index, SmallVectorImpl &Out) const; + static StringRef guessLibraryShortName(StringRef Name, bool &isFramework, + StringRef &Suffix); + static Triple::ArchType getArch(uint32_t CPUType); static bool classof(const Binary *v) { @@ -207,6 +231,10 @@ class MachOObjectFile : public ObjectFile { private: typedef SmallVector SectionList; SectionList Sections; + typedef SmallVector LibraryList; + LibraryList Libraries; + typedef SmallVector LibraryShortName; + LibraryShortName LibrariesShortNames; const char *SymtabLoadCmd; const char *DysymtabLoadCmd; const char *DataInCodeLoadCmd; @@ -234,7 +262,7 @@ inline void DiceRef::moveNext() { // the OwningObject ObjectFile is a MachOObjectFile a static_cast<> is used for // the methods that get the values of the fields of the reference. 
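// Hedged sketch for the new MachOObjectFile::guessLibraryShortName helper
// declared above; it only shows the calling convention, the actual
// trimming rules live in the implementation.  The wrapper is ours.
#include "llvm/ADT/StringRef.h"
#include "llvm/Object/MachO.h"

static llvm::StringRef libraryShortName(llvm::StringRef InstallName) {
  bool IsFramework = false;
  llvm::StringRef Suffix;
  return llvm::object::MachOObjectFile::guessLibraryShortName(InstallName,
                                                              IsFramework,
                                                              Suffix);
}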
-inline error_code DiceRef::getOffset(uint32_t &Result) const { +inline std::error_code DiceRef::getOffset(uint32_t &Result) const { const MachOObjectFile *MachOOF = static_cast(OwningObject); MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl); @@ -242,7 +270,7 @@ inline error_code DiceRef::getOffset(uint32_t &Result) const { return object_error::success; } -inline error_code DiceRef::getLength(uint16_t &Result) const { +inline std::error_code DiceRef::getLength(uint16_t &Result) const { const MachOObjectFile *MachOOF = static_cast(OwningObject); MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl); @@ -250,7 +278,7 @@ inline error_code DiceRef::getLength(uint16_t &Result) const { return object_error::success; } -inline error_code DiceRef::getKind(uint16_t &Result) const { +inline std::error_code DiceRef::getKind(uint16_t &Result) const { const MachOObjectFile *MachOOF = static_cast(OwningObject); MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl); diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h index d27c824a87a0..47e93c26b46a 100644 --- a/include/llvm/Object/MachOUniversal.h +++ b/include/llvm/Object/MachOUniversal.h @@ -53,9 +53,9 @@ class MachOUniversalBinary : public Binary { ObjectForArch getNext() const { return ObjectForArch(Parent, Index + 1); } uint32_t getCPUType() const { return Header.cputype; } - error_code getAsObjectFile(std::unique_ptr &Result) const; + std::error_code getAsObjectFile(std::unique_ptr &Result) const; - error_code getAsArchive(std::unique_ptr &Result) const; + std::error_code getAsArchive(std::unique_ptr &Result) const; }; class object_iterator { @@ -79,7 +79,7 @@ class MachOUniversalBinary : public Binary { } }; - MachOUniversalBinary(MemoryBuffer *Source, error_code &ec); + MachOUniversalBinary(MemoryBuffer *Source, std::error_code &ec); static ErrorOr create(MemoryBuffer *Source); object_iterator begin_objects() const { @@ -96,8 +96,8 @@ class MachOUniversalBinary : public Binary { return V->isMachOUniversalBinary(); } - error_code getObjectForArch(Triple::ArchType Arch, - std::unique_ptr &Result) const; + std::error_code getObjectForArch(Triple::ArchType Arch, + std::unique_ptr &Result) const; }; } diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 10209b90c05a..152bb7e10c1f 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -46,26 +46,26 @@ class RelocationRef { void moveNext(); - error_code getAddress(uint64_t &Result) const; - error_code getOffset(uint64_t &Result) const; + std::error_code getAddress(uint64_t &Result) const; + std::error_code getOffset(uint64_t &Result) const; symbol_iterator getSymbol() const; - error_code getType(uint64_t &Result) const; + std::error_code getType(uint64_t &Result) const; /// @brief Indicates whether this relocation should hidden when listing /// relocations, usually because it is the trailing part of a multipart /// relocation that will be printed as part of the leading relocation. - error_code getHidden(bool &Result) const; + std::error_code getHidden(bool &Result) const; /// @brief Get a string that represents the type of this relocation. /// /// This is for display purposes only. - error_code getTypeName(SmallVectorImpl &Result) const; + std::error_code getTypeName(SmallVectorImpl &Result) const; /// @brief Get a string that represents the calculation of the value of this /// relocation. /// /// This is for display purposes only. 
- error_code getValueString(SmallVectorImpl &Result) const; + std::error_code getValueString(SmallVectorImpl &Result) const; DataRefImpl getRawDataRefImpl() const; const ObjectFile *getObjectFile() const; @@ -92,24 +92,24 @@ class SectionRef { void moveNext(); - error_code getName(StringRef &Result) const; - error_code getAddress(uint64_t &Result) const; - error_code getSize(uint64_t &Result) const; - error_code getContents(StringRef &Result) const; + std::error_code getName(StringRef &Result) const; + std::error_code getAddress(uint64_t &Result) const; + std::error_code getSize(uint64_t &Result) const; + std::error_code getContents(StringRef &Result) const; /// @brief Get the alignment of this section as the actual value (not log 2). - error_code getAlignment(uint64_t &Result) const; + std::error_code getAlignment(uint64_t &Result) const; // FIXME: Move to the normalization layer when it's created. - error_code isText(bool &Result) const; - error_code isData(bool &Result) const; - error_code isBSS(bool &Result) const; - error_code isRequiredForExecution(bool &Result) const; - error_code isVirtual(bool &Result) const; - error_code isZeroInit(bool &Result) const; - error_code isReadOnlyData(bool &Result) const; + std::error_code isText(bool &Result) const; + std::error_code isData(bool &Result) const; + std::error_code isBSS(bool &Result) const; + std::error_code isRequiredForExecution(bool &Result) const; + std::error_code isVirtual(bool &Result) const; + std::error_code isZeroInit(bool &Result) const; + std::error_code isReadOnlyData(bool &Result) const; - error_code containsSymbol(SymbolRef S, bool &Result) const; + std::error_code containsSymbol(SymbolRef S, bool &Result) const; relocation_iterator relocation_begin() const; relocation_iterator relocation_end() const; @@ -141,18 +141,18 @@ class SymbolRef : public BasicSymbolRef { SymbolRef(DataRefImpl SymbolP, const ObjectFile *Owner); - error_code getName(StringRef &Result) const; + std::error_code getName(StringRef &Result) const; /// Returns the symbol virtual address (i.e. address at which it will be /// mapped). - error_code getAddress(uint64_t &Result) const; + std::error_code getAddress(uint64_t &Result) const; /// @brief Get the alignment of this symbol as the actual value (not log 2). - error_code getAlignment(uint32_t &Result) const; - error_code getSize(uint64_t &Result) const; - error_code getType(SymbolRef::Type &Result) const; + std::error_code getAlignment(uint32_t &Result) const; + std::error_code getSize(uint64_t &Result) const; + std::error_code getType(SymbolRef::Type &Result) const; /// @brief Get section this symbol is defined in reference to. Result is /// end_sections() if it is undefined or is an absolute symbol. - error_code getSection(section_iterator &Result) const; + std::error_code getSection(section_iterator &Result) const; const ObjectFile *getObject() const; }; @@ -190,10 +190,10 @@ class LibraryRef { bool operator==(const LibraryRef &Other) const; bool operator<(const LibraryRef &Other) const; - error_code getNext(LibraryRef &Result) const; + std::error_code getNext(LibraryRef &Result) const; // Get the path to this library, as stored in the object file. - error_code getPath(StringRef &Result) const; + std::error_code getPath(StringRef &Result) const; DataRefImpl getRawDataRefImpl() const; }; @@ -223,35 +223,49 @@ class ObjectFile : public SymbolicFile { // Implementations assume that the DataRefImpl is valid and has not been // modified externally. It's UB otherwise. 
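// Hedged caller-side sketch for the SymbolRef accessors above after the
// std::error_code migration; the helper name is ours, not from the patch.
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <system_error>

static void printSymbol(const llvm::object::SymbolRef &Sym) {
  llvm::StringRef Name;
  uint64_t Addr = 0;
  if (std::error_code EC = Sym.getName(Name)) {
    llvm::errs() << "getName failed: " << EC.message() << "\n";
    return;
  }
  if (std::error_code EC = Sym.getAddress(Addr)) {
    llvm::errs() << "getAddress failed: " << EC.message() << "\n";
    return;
  }
  llvm::outs() << Name << " @ " << Addr << "\n";
}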
friend class SymbolRef; - virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const = 0; - error_code printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override; - virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const = 0; - virtual error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const; - virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const = 0; - virtual error_code getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Res) const = 0; - virtual error_code getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const = 0; + virtual std::error_code getSymbolName(DataRefImpl Symb, + StringRef &Res) const = 0; + std::error_code printSymbolName(raw_ostream &OS, + DataRefImpl Symb) const override; + virtual std::error_code getSymbolAddress(DataRefImpl Symb, + uint64_t &Res) const = 0; + virtual std::error_code getSymbolAlignment(DataRefImpl Symb, + uint32_t &Res) const; + virtual std::error_code getSymbolSize(DataRefImpl Symb, + uint64_t &Res) const = 0; + virtual std::error_code getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Res) const = 0; + virtual std::error_code getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const = 0; // Same as above for SectionRef. friend class SectionRef; virtual void moveSectionNext(DataRefImpl &Sec) const = 0; - virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const = 0; - virtual error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const =0; - virtual error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const = 0; - virtual error_code getSectionContents(DataRefImpl Sec, StringRef &Res)const=0; - virtual error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res)const=0; - virtual error_code isSectionText(DataRefImpl Sec, bool &Res) const = 0; - virtual error_code isSectionData(DataRefImpl Sec, bool &Res) const = 0; - virtual error_code isSectionBSS(DataRefImpl Sec, bool &Res) const = 0; - virtual error_code isSectionRequiredForExecution(DataRefImpl Sec, - bool &Res) const = 0; + virtual std::error_code getSectionName(DataRefImpl Sec, + StringRef &Res) const = 0; + virtual std::error_code getSectionAddress(DataRefImpl Sec, + uint64_t &Res) const = 0; + virtual std::error_code getSectionSize(DataRefImpl Sec, + uint64_t &Res) const = 0; + virtual std::error_code getSectionContents(DataRefImpl Sec, + StringRef &Res) const = 0; + virtual std::error_code getSectionAlignment(DataRefImpl Sec, + uint64_t &Res) const = 0; + virtual std::error_code isSectionText(DataRefImpl Sec, bool &Res) const = 0; + virtual std::error_code isSectionData(DataRefImpl Sec, bool &Res) const = 0; + virtual std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const = 0; + virtual std::error_code isSectionRequiredForExecution(DataRefImpl Sec, + bool &Res) const = 0; // A section is 'virtual' if its contents aren't present in the object image. 
- virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const = 0; - virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const = 0; - virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const =0; - virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, - bool &Result) const = 0; + virtual std::error_code isSectionVirtual(DataRefImpl Sec, + bool &Res) const = 0; + virtual std::error_code isSectionZeroInit(DataRefImpl Sec, + bool &Res) const = 0; + virtual std::error_code isSectionReadOnlyData(DataRefImpl Sec, + bool &Res) const = 0; + virtual std::error_code sectionContainsSymbol(DataRefImpl Sec, + DataRefImpl Symb, + bool &Result) const = 0; virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0; virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0; virtual section_iterator getRelocatedSection(DataRefImpl Sec) const; @@ -259,26 +273,31 @@ class ObjectFile : public SymbolicFile { // Same as above for RelocationRef. friend class RelocationRef; virtual void moveRelocationNext(DataRefImpl &Rel) const = 0; - virtual error_code getRelocationAddress(DataRefImpl Rel, - uint64_t &Res) const =0; - virtual error_code getRelocationOffset(DataRefImpl Rel, - uint64_t &Res) const =0; + virtual std::error_code getRelocationAddress(DataRefImpl Rel, + uint64_t &Res) const = 0; + virtual std::error_code getRelocationOffset(DataRefImpl Rel, + uint64_t &Res) const = 0; virtual symbol_iterator getRelocationSymbol(DataRefImpl Rel) const = 0; - virtual error_code getRelocationType(DataRefImpl Rel, - uint64_t &Res) const = 0; - virtual error_code getRelocationTypeName(DataRefImpl Rel, - SmallVectorImpl &Result) const = 0; - virtual error_code getRelocationValueString(DataRefImpl Rel, - SmallVectorImpl &Result) const = 0; - virtual error_code getRelocationHidden(DataRefImpl Rel, bool &Result) const { + virtual std::error_code getRelocationType(DataRefImpl Rel, + uint64_t &Res) const = 0; + virtual std::error_code + getRelocationTypeName(DataRefImpl Rel, + SmallVectorImpl &Result) const = 0; + virtual std::error_code + getRelocationValueString(DataRefImpl Rel, + SmallVectorImpl &Result) const = 0; + virtual std::error_code getRelocationHidden(DataRefImpl Rel, + bool &Result) const { Result = false; return object_error::success; } // Same for LibraryRef friend class LibraryRef; - virtual error_code getLibraryNext(DataRefImpl Lib, LibraryRef &Res) const = 0; - virtual error_code getLibraryPath(DataRefImpl Lib, StringRef &Res) const = 0; + virtual std::error_code getLibraryNext(DataRefImpl Lib, + LibraryRef &Res) const = 0; + virtual std::error_code getLibraryPath(DataRefImpl Lib, + StringRef &Res) const = 0; public: typedef iterator_range symbol_iterator_range; @@ -339,27 +358,27 @@ class ObjectFile : public SymbolicFile { inline SymbolRef::SymbolRef(DataRefImpl SymbolP, const ObjectFile *Owner) : BasicSymbolRef(SymbolP, Owner) {} -inline error_code SymbolRef::getName(StringRef &Result) const { +inline std::error_code SymbolRef::getName(StringRef &Result) const { return getObject()->getSymbolName(getRawDataRefImpl(), Result); } -inline error_code SymbolRef::getAddress(uint64_t &Result) const { +inline std::error_code SymbolRef::getAddress(uint64_t &Result) const { return getObject()->getSymbolAddress(getRawDataRefImpl(), Result); } -inline error_code SymbolRef::getAlignment(uint32_t &Result) const { +inline std::error_code SymbolRef::getAlignment(uint32_t &Result) const { return 
getObject()->getSymbolAlignment(getRawDataRefImpl(), Result); } -inline error_code SymbolRef::getSize(uint64_t &Result) const { +inline std::error_code SymbolRef::getSize(uint64_t &Result) const { return getObject()->getSymbolSize(getRawDataRefImpl(), Result); } -inline error_code SymbolRef::getSection(section_iterator &Result) const { +inline std::error_code SymbolRef::getSection(section_iterator &Result) const { return getObject()->getSymbolSection(getRawDataRefImpl(), Result); } -inline error_code SymbolRef::getType(SymbolRef::Type &Result) const { +inline std::error_code SymbolRef::getType(SymbolRef::Type &Result) const { return getObject()->getSymbolType(getRawDataRefImpl(), Result); } @@ -391,55 +410,56 @@ inline void SectionRef::moveNext() { return OwningObject->moveSectionNext(SectionPimpl); } -inline error_code SectionRef::getName(StringRef &Result) const { +inline std::error_code SectionRef::getName(StringRef &Result) const { return OwningObject->getSectionName(SectionPimpl, Result); } -inline error_code SectionRef::getAddress(uint64_t &Result) const { +inline std::error_code SectionRef::getAddress(uint64_t &Result) const { return OwningObject->getSectionAddress(SectionPimpl, Result); } -inline error_code SectionRef::getSize(uint64_t &Result) const { +inline std::error_code SectionRef::getSize(uint64_t &Result) const { return OwningObject->getSectionSize(SectionPimpl, Result); } -inline error_code SectionRef::getContents(StringRef &Result) const { +inline std::error_code SectionRef::getContents(StringRef &Result) const { return OwningObject->getSectionContents(SectionPimpl, Result); } -inline error_code SectionRef::getAlignment(uint64_t &Result) const { +inline std::error_code SectionRef::getAlignment(uint64_t &Result) const { return OwningObject->getSectionAlignment(SectionPimpl, Result); } -inline error_code SectionRef::isText(bool &Result) const { +inline std::error_code SectionRef::isText(bool &Result) const { return OwningObject->isSectionText(SectionPimpl, Result); } -inline error_code SectionRef::isData(bool &Result) const { +inline std::error_code SectionRef::isData(bool &Result) const { return OwningObject->isSectionData(SectionPimpl, Result); } -inline error_code SectionRef::isBSS(bool &Result) const { +inline std::error_code SectionRef::isBSS(bool &Result) const { return OwningObject->isSectionBSS(SectionPimpl, Result); } -inline error_code SectionRef::isRequiredForExecution(bool &Result) const { +inline std::error_code SectionRef::isRequiredForExecution(bool &Result) const { return OwningObject->isSectionRequiredForExecution(SectionPimpl, Result); } -inline error_code SectionRef::isVirtual(bool &Result) const { +inline std::error_code SectionRef::isVirtual(bool &Result) const { return OwningObject->isSectionVirtual(SectionPimpl, Result); } -inline error_code SectionRef::isZeroInit(bool &Result) const { +inline std::error_code SectionRef::isZeroInit(bool &Result) const { return OwningObject->isSectionZeroInit(SectionPimpl, Result); } -inline error_code SectionRef::isReadOnlyData(bool &Result) const { +inline std::error_code SectionRef::isReadOnlyData(bool &Result) const { return OwningObject->isSectionReadOnlyData(SectionPimpl, Result); } -inline error_code SectionRef::containsSymbol(SymbolRef S, bool &Result) const { +inline std::error_code SectionRef::containsSymbol(SymbolRef S, + bool &Result) const { return OwningObject->sectionContainsSymbol(SectionPimpl, S.getRawDataRefImpl(), Result); } @@ -474,11 +494,11 @@ inline void RelocationRef::moveNext() { return 
OwningObject->moveRelocationNext(RelocationPimpl); } -inline error_code RelocationRef::getAddress(uint64_t &Result) const { +inline std::error_code RelocationRef::getAddress(uint64_t &Result) const { return OwningObject->getRelocationAddress(RelocationPimpl, Result); } -inline error_code RelocationRef::getOffset(uint64_t &Result) const { +inline std::error_code RelocationRef::getOffset(uint64_t &Result) const { return OwningObject->getRelocationOffset(RelocationPimpl, Result); } @@ -486,21 +506,21 @@ inline symbol_iterator RelocationRef::getSymbol() const { return OwningObject->getRelocationSymbol(RelocationPimpl); } -inline error_code RelocationRef::getType(uint64_t &Result) const { +inline std::error_code RelocationRef::getType(uint64_t &Result) const { return OwningObject->getRelocationType(RelocationPimpl, Result); } -inline error_code RelocationRef::getTypeName(SmallVectorImpl &Result) - const { +inline std::error_code +RelocationRef::getTypeName(SmallVectorImpl &Result) const { return OwningObject->getRelocationTypeName(RelocationPimpl, Result); } -inline error_code RelocationRef::getValueString(SmallVectorImpl &Result) - const { +inline std::error_code +RelocationRef::getValueString(SmallVectorImpl &Result) const { return OwningObject->getRelocationValueString(RelocationPimpl, Result); } -inline error_code RelocationRef::getHidden(bool &Result) const { +inline std::error_code RelocationRef::getHidden(bool &Result) const { return OwningObject->getRelocationHidden(RelocationPimpl, Result); } @@ -525,11 +545,11 @@ inline bool LibraryRef::operator<(const LibraryRef &Other) const { return LibraryPimpl < Other.LibraryPimpl; } -inline error_code LibraryRef::getNext(LibraryRef &Result) const { +inline std::error_code LibraryRef::getNext(LibraryRef &Result) const { return OwningObject->getLibraryNext(LibraryPimpl, Result); } -inline error_code LibraryRef::getPath(StringRef &Result) const { +inline std::error_code LibraryRef::getPath(StringRef &Result) const { return OwningObject->getLibraryPath(LibraryPimpl, Result); } diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h index a3aaf17f1d66..5ca245057a55 100644 --- a/include/llvm/Object/RelocVisitor.h +++ b/include/llvm/Object/RelocVisitor.h @@ -253,12 +253,14 @@ class RelocVisitor { /// PPC64 ELF RelocToApply visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) { - int64_t Addend = getAddend64BE(R); + int64_t Addend; + getELFRelocationAddend(R, Addend); uint32_t Res = (Value + Addend) & 0xFFFFFFFF; return RelocToApply(Res, 4); } RelocToApply visitELF_PPC64_ADDR64(RelocationRef R, uint64_t Value) { - int64_t Addend = getAddend64BE(R); + int64_t Addend; + getELFRelocationAddend(R, Addend); return RelocToApply(Value + Addend, 8); } diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h index 28400e1ab896..113373694556 100644 --- a/include/llvm/Object/SymbolicFile.h +++ b/include/llvm/Object/SymbolicFile.h @@ -86,7 +86,8 @@ class BasicSymbolRef { SF_Weak = 1U << 2, // Weak symbol SF_Absolute = 1U << 3, // Absolute symbol SF_Common = 1U << 4, // Symbol has common linkage - SF_FormatSpecific = 1U << 5 // Specific to the object file format + SF_Indirect = 1U << 5, // Symbol is an alias to another symbol + SF_FormatSpecific = 1U << 6 // Specific to the object file format // (e.g. 
section symbols) }; @@ -98,7 +99,7 @@ class BasicSymbolRef { void moveNext(); - error_code printName(raw_ostream &OS) const; + std::error_code printName(raw_ostream &OS) const; /// Get symbol flags (bitwise OR of SymbolRef::Flags) uint32_t getFlags() const; @@ -119,8 +120,8 @@ class SymbolicFile : public Binary { // virtual interface. virtual void moveSymbolNext(DataRefImpl &Symb) const = 0; - virtual error_code printSymbolName(raw_ostream &OS, - DataRefImpl Symb) const = 0; + virtual std::error_code printSymbolName(raw_ostream &OS, + DataRefImpl Symb) const = 0; virtual uint32_t getSymbolFlags(DataRefImpl Symb) const = 0; @@ -173,7 +174,7 @@ inline void BasicSymbolRef::moveNext() { return OwningObject->moveSymbolNext(SymbolPimpl); } -inline error_code BasicSymbolRef::printName(raw_ostream &OS) const { +inline std::error_code BasicSymbolRef::printName(raw_ostream &OS) const { return OwningObject->printSymbolName(OS, SymbolPimpl); } diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h index c2b9f95956e8..22d57edb74e4 100644 --- a/include/llvm/Pass.h +++ b/include/llvm/Pass.h @@ -29,6 +29,7 @@ #ifndef LLVM_PASS_H #define LLVM_PASS_H +#include "llvm/PassRegistry.h" #include "llvm/Support/Compiler.h" #include @@ -82,13 +83,14 @@ enum PassKind { class Pass { AnalysisResolver *Resolver; // Used to resolve analysis const void *PassID; + mutable const PassInfo *PI; PassKind Kind; void operator=(const Pass&) LLVM_DELETED_FUNCTION; Pass(const Pass &) LLVM_DELETED_FUNCTION; public: explicit Pass(PassKind K, char &pid) - : Resolver(nullptr), PassID(&pid), Kind(K) { } + : Resolver(nullptr), PassID(&pid), PI(nullptr), Kind(K) { } virtual ~Pass(); @@ -105,6 +107,13 @@ class Pass { return PassID; } + /// getPassInfo - Return the PassInfo associated with this pass. + const PassInfo *getPassInfo() const { + if (!PI) + PI = PassRegistry::getPassRegistry()->getPassInfo(PassID); + return PI; + } + /// doInitialization - Virtual method overridden by subclasses to do /// any necessary initialization before any pass is run. /// diff --git a/include/llvm/PassInfo.h b/include/llvm/PassInfo.h new file mode 100644 index 000000000000..d53daf1cf72c --- /dev/null +++ b/include/llvm/PassInfo.h @@ -0,0 +1,147 @@ +//===- llvm/PassInfo.h - Pass Info class ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines and implements the PassInfo class. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_PASSINFO_H +#define LLVM_PASSINFO_H + +#include +#include + +namespace llvm { + +class Pass; +class TargetMachine; + +//===--------------------------------------------------------------------------- +/// PassInfo class - An instance of this class exists for every pass known by +/// the system, and can be obtained from a live Pass by calling its +/// getPassInfo() method. These objects are set up by the RegisterPass<> +/// template. +/// +class PassInfo { +public: + typedef Pass* (*NormalCtor_t)(); + typedef Pass *(*TargetMachineCtor_t)(TargetMachine *); + +private: + const char *const PassName; // Nice name for Pass + const char *const PassArgument; // Command Line argument to run this pass + const void *PassID; + const bool IsCFGOnlyPass; // Pass only looks at the CFG. + const bool IsAnalysis; // True if an analysis pass. 
+ const bool IsAnalysisGroup; // True if an analysis group. + std::vector ItfImpl;// Interfaces implemented by this pass + + NormalCtor_t NormalCtor; + TargetMachineCtor_t TargetMachineCtor; + +public: + /// PassInfo ctor - Do not call this directly, this should only be invoked + /// through RegisterPass. + PassInfo(const char *name, const char *arg, const void *pi, + NormalCtor_t normal, bool isCFGOnly, bool is_analysis, + TargetMachineCtor_t machine = nullptr) + : PassName(name), PassArgument(arg), PassID(pi), + IsCFGOnlyPass(isCFGOnly), + IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal), + TargetMachineCtor(machine) {} + /// PassInfo ctor - Do not call this directly, this should only be invoked + /// through RegisterPass. This version is for use by analysis groups; it + /// does not auto-register the pass. + PassInfo(const char *name, const void *pi) + : PassName(name), PassArgument(""), PassID(pi), + IsCFGOnlyPass(false), + IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr), + TargetMachineCtor(nullptr) {} + + /// getPassName - Return the friendly name for the pass, never returns null + /// + const char *getPassName() const { return PassName; } + + /// getPassArgument - Return the command line option that may be passed to + /// 'opt' that will cause this pass to be run. This will return null if there + /// is no argument. + /// + const char *getPassArgument() const { return PassArgument; } + + /// getTypeInfo - Return the id object for the pass... + /// TODO : Rename + const void *getTypeInfo() const { return PassID; } + + /// Return true if this PassID implements the specified ID pointer. + bool isPassID(const void *IDPtr) const { + return PassID == IDPtr; + } + + /// isAnalysisGroup - Return true if this is an analysis group, not a normal + /// pass. + /// + bool isAnalysisGroup() const { return IsAnalysisGroup; } + bool isAnalysis() const { return IsAnalysis; } + + /// isCFGOnlyPass - return true if this pass only looks at the CFG for the + /// function. + bool isCFGOnlyPass() const { return IsCFGOnlyPass; } + + /// getNormalCtor - Return a pointer to a function, that when called, creates + /// an instance of the pass and returns it. This pointer may be null if there + /// is no default constructor for the pass. + /// + NormalCtor_t getNormalCtor() const { + return NormalCtor; + } + void setNormalCtor(NormalCtor_t Ctor) { + NormalCtor = Ctor; + } + + /// getTargetMachineCtor - Return a pointer to a function, that when called + /// with a TargetMachine, creates an instance of the pass and returns it. + /// This pointer may be null if there is no constructor with a TargetMachine + /// for the pass. + /// + TargetMachineCtor_t getTargetMachineCtor() const { return TargetMachineCtor; } + void setTargetMachineCtor(TargetMachineCtor_t Ctor) { + TargetMachineCtor = Ctor; + } + + /// createPass() - Use this method to create an instance of this pass. + Pass *createPass() const { + assert((!isAnalysisGroup() || NormalCtor) && + "No default implementation found for analysis group!"); + assert(NormalCtor && + "Cannot call createPass on PassInfo without default ctor!"); + return NormalCtor(); + } + + /// addInterfaceImplemented - This method is called when this pass is + /// registered as a member of an analysis group with the RegisterAnalysisGroup + /// template. 
+ /// + void addInterfaceImplemented(const PassInfo *ItfPI) { + ItfImpl.push_back(ItfPI); + } + + /// getInterfacesImplemented - Return a list of all of the analysis group + /// interfaces implemented by this pass. + /// + const std::vector &getInterfacesImplemented() const { + return ItfImpl; + } + +private: + void operator=(const PassInfo &) LLVM_DELETED_FUNCTION; + PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION; +}; + +} + +#endif diff --git a/include/llvm/PassRegistry.h b/include/llvm/PassRegistry.h index 7f2a01426e0d..1558c51bde48 100644 --- a/include/llvm/PassRegistry.h +++ b/include/llvm/PassRegistry.h @@ -18,8 +18,14 @@ #define LLVM_PASSREGISTRY_H #include "llvm-c/Core.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/PassInfo.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/RWMutex.h" +#include namespace llvm { @@ -33,11 +39,26 @@ struct PassRegistrationListener; /// threads simultaneously, you will need to use a separate PassRegistry on /// each thread. class PassRegistry { - mutable void *pImpl; - void *getImpl() const; + mutable sys::SmartRWMutex Lock; + + /// PassInfoMap - Keep track of the PassInfo object for each registered pass. + typedef DenseMap MapType; + MapType PassInfoMap; + + typedef StringMap StringMapType; + StringMapType PassInfoStringMap; + + /// AnalysisGroupInfo - Keep track of information for each analysis group. + struct AnalysisGroupInfo { + SmallPtrSet Implementations; + }; + DenseMap AnalysisGroupInfoMap; + + std::vector> ToFree; + std::vector Listeners; public: - PassRegistry() : pImpl(nullptr) { } + PassRegistry() { } ~PassRegistry(); /// getPassRegistry - Access the global registry object, which is diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h index 8efb45f55a24..449bc9281084 100644 --- a/include/llvm/PassSupport.h +++ b/include/llvm/PassSupport.h @@ -23,6 +23,7 @@ #include "Pass.h" #include "llvm/InitializePasses.h" +#include "llvm/PassInfo.h" #include "llvm/PassRegistry.h" #include "llvm/Support/Atomic.h" #include "llvm/Support/Valgrind.h" @@ -31,120 +32,6 @@ namespace llvm { class TargetMachine; -//===--------------------------------------------------------------------------- -/// PassInfo class - An instance of this class exists for every pass known by -/// the system, and can be obtained from a live Pass by calling its -/// getPassInfo() method. These objects are set up by the RegisterPass<> -/// template, defined below. -/// -class PassInfo { -public: - typedef Pass* (*NormalCtor_t)(); - typedef Pass *(*TargetMachineCtor_t)(TargetMachine *); - -private: - const char *const PassName; // Nice name for Pass - const char *const PassArgument; // Command Line argument to run this pass - const void *PassID; - const bool IsCFGOnlyPass; // Pass only looks at the CFG. - const bool IsAnalysis; // True if an analysis pass. - const bool IsAnalysisGroup; // True if an analysis group. - std::vector ItfImpl;// Interfaces implemented by this pass - - NormalCtor_t NormalCtor; - TargetMachineCtor_t TargetMachineCtor; - -public: - /// PassInfo ctor - Do not call this directly, this should only be invoked - /// through RegisterPass. 
- PassInfo(const char *name, const char *arg, const void *pi, - NormalCtor_t normal, bool isCFGOnly, bool is_analysis, - TargetMachineCtor_t machine = nullptr) - : PassName(name), PassArgument(arg), PassID(pi), - IsCFGOnlyPass(isCFGOnly), - IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal), - TargetMachineCtor(machine) {} - /// PassInfo ctor - Do not call this directly, this should only be invoked - /// through RegisterPass. This version is for use by analysis groups; it - /// does not auto-register the pass. - PassInfo(const char *name, const void *pi) - : PassName(name), PassArgument(""), PassID(pi), - IsCFGOnlyPass(false), - IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr), - TargetMachineCtor(nullptr) {} - - /// getPassName - Return the friendly name for the pass, never returns null - /// - const char *getPassName() const { return PassName; } - - /// getPassArgument - Return the command line option that may be passed to - /// 'opt' that will cause this pass to be run. This will return null if there - /// is no argument. - /// - const char *getPassArgument() const { return PassArgument; } - - /// getTypeInfo - Return the id object for the pass... - /// TODO : Rename - const void *getTypeInfo() const { return PassID; } - - /// Return true if this PassID implements the specified ID pointer. - bool isPassID(const void *IDPtr) const { - return PassID == IDPtr; - } - - /// isAnalysisGroup - Return true if this is an analysis group, not a normal - /// pass. - /// - bool isAnalysisGroup() const { return IsAnalysisGroup; } - bool isAnalysis() const { return IsAnalysis; } - - /// isCFGOnlyPass - return true if this pass only looks at the CFG for the - /// function. - bool isCFGOnlyPass() const { return IsCFGOnlyPass; } - - /// getNormalCtor - Return a pointer to a function, that when called, creates - /// an instance of the pass and returns it. This pointer may be null if there - /// is no default constructor for the pass. - /// - NormalCtor_t getNormalCtor() const { - return NormalCtor; - } - void setNormalCtor(NormalCtor_t Ctor) { - NormalCtor = Ctor; - } - - /// getTargetMachineCtor - Return a pointer to a function, that when called - /// with a TargetMachine, creates an instance of the pass and returns it. - /// This pointer may be null if there is no constructor with a TargetMachine - /// for the pass. - /// - TargetMachineCtor_t getTargetMachineCtor() const { return TargetMachineCtor; } - void setTargetMachineCtor(TargetMachineCtor_t Ctor) { - TargetMachineCtor = Ctor; - } - - /// createPass() - Use this method to create an instance of this pass. - Pass *createPass() const; - - /// addInterfaceImplemented - This method is called when this pass is - /// registered as a member of an analysis group with the RegisterAnalysisGroup - /// template. - /// - void addInterfaceImplemented(const PassInfo *ItfPI) { - ItfImpl.push_back(ItfPI); - } - - /// getInterfacesImplemented - Return a list of all of the analysis group - /// interfaces implemented by this pass. 
- /// - const std::vector &getInterfacesImplemented() const { - return ItfImpl; - } - -private: - void operator=(const PassInfo &) LLVM_DELETED_FUNCTION; - PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION; -}; #define CALL_ONCE_INITIALIZATION(function) \ static volatile sys::cas_flag initialized = 0; \ @@ -325,19 +212,12 @@ struct RegisterAnalysisGroup : public RegisterAGBase { /// clients that are interested in which passes get registered and unregistered /// at runtime (which can be because of the RegisterPass constructors being run /// as the program starts up, or may be because a shared object just got -/// loaded). Deriving from the PassRegistrationListener class automatically -/// registers your object to receive callbacks indicating when passes are loaded -/// and removed. +/// loaded). /// struct PassRegistrationListener { - /// PassRegistrationListener ctor - Add the current object to the list of - /// PassRegistrationListeners... - PassRegistrationListener(); - - /// dtor - Remove object from list of listeners... - /// - virtual ~PassRegistrationListener(); + PassRegistrationListener() {} + virtual ~PassRegistrationListener() {} /// Callback functions - These functions are invoked whenever a pass is loaded /// or removed from the current executable. diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 8457678c4ec6..eafb76886c83 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -16,14 +16,12 @@ #ifndef LLVM_PROFILEDATA_INSTRPROF_H_ #define LLVM_PROFILEDATA_INSTRPROF_H_ -#include "llvm/Support/system_error.h" +#include namespace llvm { +const std::error_category &instrprof_category(); -const error_category &instrprof_category(); - -struct instrprof_error { - enum ErrorType { +enum class instrprof_error { success = 0, eof, bad_magic, @@ -37,21 +35,17 @@ struct instrprof_error { hash_mismatch, count_mismatch, counter_overflow - }; - ErrorType V; - - instrprof_error(ErrorType V) : V(V) {} - operator ErrorType() const { return V; } }; -inline error_code make_error_code(instrprof_error E) { - return error_code(static_cast(E), instrprof_category()); +inline std::error_code make_error_code(instrprof_error E) { + return std::error_code(static_cast(E), instrprof_category()); } -template <> struct is_error_code_enum : std::true_type {}; -template <> struct is_error_code_enum - : std::true_type {}; - } // end namespace llvm +namespace std { +template <> +struct is_error_code_enum : std::true_type {}; +} + #endif // LLVM_PROFILEDATA_INSTRPROF_H_ diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index 3e18c76c57df..7a5a71dc6a31 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -60,28 +60,29 @@ class InstrProfIterator : public std::iterator &Result); + static std::error_code create(std::string Path, + std::unique_ptr &Result); }; /// Reader for the simple text based instrprof format. @@ -122,9 +123,9 @@ class TextInstrProfReader : public InstrProfReader { : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, '#') {} /// Read the header. - error_code readHeader() override { return success(); } + std::error_code readHeader() override { return success(); } /// Read a single record. - error_code readNextRecord(InstrProfRecord &Record) override; + std::error_code readNextRecord(InstrProfRecord &Record) override; }; /// Reader for the raw instrprof binary format from runtime. 
@@ -167,23 +168,23 @@ class RawInstrProfReader : public InstrProfReader { const char *NamesStart; const char *ProfileEnd; - RawInstrProfReader(const TextInstrProfReader &) LLVM_DELETED_FUNCTION; - RawInstrProfReader &operator=(const TextInstrProfReader &) + RawInstrProfReader(const RawInstrProfReader &) LLVM_DELETED_FUNCTION; + RawInstrProfReader &operator=(const RawInstrProfReader &) LLVM_DELETED_FUNCTION; public: RawInstrProfReader(std::unique_ptr DataBuffer) : DataBuffer(std::move(DataBuffer)) { } static bool hasFormat(const MemoryBuffer &DataBuffer); - error_code readHeader() override; - error_code readNextRecord(InstrProfRecord &Record) override; + std::error_code readHeader() override; + std::error_code readNextRecord(InstrProfRecord &Record) override; private: - error_code readNextHeader(const char *CurrentPos); - error_code readHeader(const RawHeader &Header); + std::error_code readNextHeader(const char *CurrentPos); + std::error_code readHeader(const RawHeader &Header); template IntT swap(IntT Int) const { - return ShouldSwapBytes ? sys::SwapByteOrder(Int) : Int; + return ShouldSwapBytes ? sys::getSwappedBytes(Int) : Int; } const uint64_t *getCounter(IntPtrT CounterPtr) const { ptrdiff_t Offset = (swap(CounterPtr) - CountersDelta) / sizeof(uint64_t); @@ -281,19 +282,19 @@ class IndexedInstrProfReader : public InstrProfReader { static bool hasFormat(const MemoryBuffer &DataBuffer); /// Read the file header. - error_code readHeader() override; + std::error_code readHeader() override; /// Read a single record. - error_code readNextRecord(InstrProfRecord &Record) override; + std::error_code readNextRecord(InstrProfRecord &Record) override; /// Fill Counts with the profile data for the given function name. - error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash, - std::vector &Counts); + std::error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash, + std::vector &Counts); /// Return the maximum of all known function counts. uint64_t getMaximumFunctionCount() { return MaxFunctionCount; } /// Factory method to create an indexed reader. - static error_code create(std::string Path, - std::unique_ptr &Result); + static std::error_code + create(std::string Path, std::unique_ptr &Result); }; } // end namespace llvm diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h index fa37bf116d7d..6e68bee30eb8 100644 --- a/include/llvm/ProfileData/InstrProfWriter.h +++ b/include/llvm/ProfileData/InstrProfWriter.h @@ -38,8 +38,9 @@ class InstrProfWriter { /// Add function counts for the given function. If there are already counts /// for this function and the hash and number of counts match, each counter is /// summed. - error_code addFunctionCounts(StringRef FunctionName, uint64_t FunctionHash, - ArrayRef Counters); + std::error_code addFunctionCounts(StringRef FunctionName, + uint64_t FunctionHash, + ArrayRef Counters); /// Ensure that all data is written to disk. 
void write(raw_fd_ostream &OS); }; diff --git a/include/llvm/Support/ARMBuildAttributes.h b/include/llvm/Support/ARMBuildAttributes.h index 69732fc041e7..16312004c871 100644 --- a/include/llvm/Support/ARMBuildAttributes.h +++ b/include/llvm/Support/ARMBuildAttributes.h @@ -146,6 +146,19 @@ enum { AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations) AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted + // Tag_ABI_PCS_RW_data, (=15), uleb128 + AddressRWPCRel = 1, // Address RW static data PC-relative + AddressRWSBRel = 2, // Address RW static data SB-relative + AddressRWNone = 3, // No RW static data permitted + + // Tag_ABI_PCS_RO_data, (=14), uleb128 + AddressROPCRel = 1, // Address RO static data PC-relative + AddressRONone = 2, // No RO static data permitted + + // Tag_ABI_PCS_GOT_use, (=17), uleb128 + AddressDirect = 1, // Address imported data directly + AddressGOT = 2, // Address imported data indirectly (via GOT) + // Tag_ABI_FP_denormal, (=20), uleb128 PreserveFPSign = 2, // sign when flushed-to-zero is preserved diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h new file mode 100644 index 000000000000..31efbdc4ed59 --- /dev/null +++ b/include/llvm/Support/ARMWinEH.h @@ -0,0 +1,384 @@ +//===-- llvm/Support/WinARMEH.h - Windows on ARM EH Constants ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_WINARMEH_H +#define LLVM_SUPPORT_WINARMEH_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Endian.h" + +namespace llvm { +namespace ARM { +namespace WinEH { +enum class RuntimeFunctionFlag { + RFF_Unpacked, /// unpacked entry + RFF_Packed, /// packed entry + RFF_PackedFragment, /// packed entry representing a fragment + RFF_Reserved, /// reserved +}; + +enum class ReturnType { + RT_POP, /// return via pop {pc} (L flag must be set) + RT_B, /// 16-bit branch + RT_BW, /// 32-bit branch + RT_NoEpilogue, /// no epilogue (fragment) +}; + +/// RuntimeFunction - An entry in the table of procedure data (.pdata) +/// +/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +---------------------------------------------------------------+ +/// | Function Start RVA | +/// +-------------------+-+-+-+-----+-+---+---------------------+---+ +/// | Stack Adjust |C|L|R| Reg |H|Ret| Function Length |Flg| +/// +-------------------+-+-+-+-----+-+---+---------------------+---+ +/// +/// Flag : 2-bit field with the following meanings: +/// - 00 = packed unwind data not used; remaining bits point to .xdata record +/// - 01 = packed unwind data +/// - 10 = packed unwind data, function assumed to have no prologue; useful +/// for function fragments that are discontiguous with the start of the +/// function +/// - 11 = reserved +/// Function Length : 11-bit field providing the length of the entire function +/// in bytes, divided by 2; if the function is greater than +/// 4KB, a full .xdata record must be used instead +/// Ret : 2-bit field indicating how the function returns +/// - 00 = return via pop {pc} (the L bit must be set) +/// - 01 = return via 16-bit branch +/// - 10 = return via 32-bit branch +/// - 11 = no epilogue; useful for function fragments that may only contain a +/// prologue but the epilogue is elsewhere +/// H : 1-bit
flag indicating whether the function "homes" the integer parameter +/// registers (r0-r3), allocating 16-bytes on the stack +/// Reg : 3-bit field indicating the index of the last saved non-volatile +/// register. If the R bit is set to 0, then only integer registers are +/// saved (r4-rN, where N is 4 + Reg). If the R bit is set to 1, then +/// only floating-point registers are being saved (d8-dN, where N is +/// 8 + Reg). The special case of the R bit being set to 1 and Reg equal +/// to 7 indicates that no registers are saved. +/// R : 1-bit flag indicating whether the non-volatile registers are integer or +/// floating-point. 0 indicates integer, 1 indicates floating-point. The +/// special case of the R-flag being set and Reg being set to 7 indicates +/// that no non-volatile registers are saved. +/// L : 1-bit flag indicating whether the function saves/restores the link +/// register (LR) +/// C : 1-bit flag indicating whether the function includes extra instructions +/// to setup a frame chain for fast walking. If this flag is set, r11 is +/// implicitly added to the list of saved non-volatile integer registers. +/// Stack Adjust : 10-bit field indicating the number of bytes of stack that are +/// allocated for this function. Only values between 0x000 and +/// 0x3f3 can be directly encoded. If the value is 0x3f4 or +/// greater, then the low 4 bits have special meaning as follows: +/// - Bit 0-1 +/// indicate the number of words' of adjustment (1-4), minus 1 +/// - Bit 2 +/// indicates if the prologue combined adjustment into push +/// - Bit 3 +/// indicates if the epilogue combined adjustment into pop +/// +/// RESTRICTIONS: +/// - IF C is SET: +/// + L flag must be set since frame chaining requires r11 and lr +/// + r11 must NOT be included in the set of registers described by Reg +/// - IF Ret is 0: +/// + L flag must be set + +// NOTE: RuntimeFunction is meant to be a simple class that provides raw access +// to all fields in the structure. The accessor methods reflect the names of +// the bitfields that they correspond to. Although some obvious simplifications +// are possible via merging of methods, it would prevent the use of this class +// to fully inspect the contents of the data structure which is particularly +// useful for scenarios such as llvm-readobj to aid in testing. 
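To make the packed .pdata layout documented above concrete, here is a minimal illustrative sketch (not taken from this patch; the table pointer, entry count, and function name are hypothetical) of walking a table with the RuntimeFunction wrapper and the StackAdjustment helper declared below:

#include "llvm/Support/ARMWinEH.h"

using namespace llvm;
using namespace llvm::ARM::WinEH;

// Walk a .pdata table; each RuntimeFunction entry is two little-endian words.
void dumpPackedEntries(const support::ulittle32_t *Table, size_t NumEntries) {
  for (size_t I = 0; I != NumEntries; ++I) {
    RuntimeFunction RF(&Table[2 * I]);
    if (RF.Flag() != RuntimeFunctionFlag::RFF_Packed &&
        RF.Flag() != RuntimeFunctionFlag::RFF_PackedFragment)
      continue; // unpacked entries point at a full .xdata record instead
    uint32_t LengthInBytes = RF.FunctionLength(); // stored field is length / 2
    uint16_t AdjustWords = StackAdjustment(RF);   // decodes the >= 0x3f4 cases
    bool SavesLR = RF.L();
    (void)LengthInBytes; (void)AdjustWords; (void)SavesLR;
  }
}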
+ +class RuntimeFunction { +public: + const support::ulittle32_t BeginAddress; + const support::ulittle32_t UnwindData; + + RuntimeFunction(const support::ulittle32_t *Data) + : BeginAddress(Data[0]), UnwindData(Data[1]) {} + + RuntimeFunction(const support::ulittle32_t BeginAddress, + const support::ulittle32_t UnwindData) + : BeginAddress(BeginAddress), UnwindData(UnwindData) {} + + RuntimeFunctionFlag Flag() const { + return RuntimeFunctionFlag(UnwindData & 0x3); + } + + uint32_t ExceptionInformationRVA() const { + assert(Flag() == RuntimeFunctionFlag::RFF_Unpacked && + "unpacked form required for this operation"); + return (UnwindData & ~0x3); + } + + uint32_t PackedUnwindData() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return (UnwindData & ~0x3); + } + uint32_t FunctionLength() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return (((UnwindData & 0x00001ffc) >> 2) << 1); + } + ReturnType Ret() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + assert(((UnwindData & 0x00006000) || L()) && "L must be set to 1"); + return ReturnType((UnwindData & 0x00006000) >> 13); + } + bool H() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x00008000) >> 15); + } + uint8_t Reg() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x00070000) >> 16); + } + bool R() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x00080000) >> 19); + } + bool L() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0x00100000) >> 20); + } + bool C() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + assert(((~UnwindData & 0x00200000) || L()) && + "L flag must be set, chaining requires r11 and LR"); + assert(((~UnwindData & 0x00200000) || (Reg() < 7) || R()) && + "r11 must not be included in Reg; C implies r11"); + return ((UnwindData & 0x00200000) >> 21); + } + uint16_t StackAdjust() const { + assert((Flag() == RuntimeFunctionFlag::RFF_Packed || + Flag() == RuntimeFunctionFlag::RFF_PackedFragment) && + "packed form required for this operation"); + return ((UnwindData & 0xffc00000) >> 22); + } +}; + +/// PrologueFolding - pseudo-flag derived from Stack Adjust indicating that the +/// prologue has stack adjustment combined into the push +inline bool PrologueFolding(const RuntimeFunction &RF) { + return RF.StackAdjust() >= 0x3f4 && (RF.StackAdjust() & 0x4); +} +/// Epilogue - pseudo-flag derived from Stack Adjust indicating that the +/// epilogue has stack adjustment combined into the pop +inline bool EpilogueFolding(const RuntimeFunction &RF) { + return RF.StackAdjust() >= 0x3f4 && (RF.StackAdjust() & 0x8); 
+} +/// StackAdjustment - calculated stack adjustment in words. The stack +/// adjustment should be determined via this function to account for the special +/// handling of the special encoding when the value is ≥ 0x3f4. +inline uint16_t StackAdjustment(const RuntimeFunction &RF) { + uint16_t Adjustment = RF.StackAdjust(); + if (Adjustment >= 0x3f4) + return (Adjustment & 0x3) ? ((Adjustment & 0x3) << 2) - 1 : 0; + return Adjustment; +} + +/// SavedRegisterMask - Utility function to calculate the set of saved general +/// purpose (r0-r15) and VFP (d0-d31) registers. +std::pair SavedRegisterMask(const RuntimeFunction &RF); + +/// ExceptionDataRecord - An entry in the table of exception data (.xdata) +/// +/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +-------+---------+-+-+-+---+-----------------------------------+ +/// | C Wrd | Epi Cnt |F|E|X|Ver| Function Length | +/// +-------+--------+'-'-'-'---'---+-------------------------------+ +/// | Reserved |Ex. Code Words| (Extended Epilogue Count) | +/// +-------+--------+--------------+-------------------------------+ +/// +/// Function Length : 18-bit field indicating the total length of the function +/// in bytes divided by 2. If a function is larger than +/// 512KB, then multiple pdata and xdata records must be used. +/// Vers : 2-bit field describing the version of the remaining structure. Only +/// version 0 is currently defined (values 1-3 are not permitted). +/// X : 1-bit field indicating the presence of exception data +/// E : 1-bit field indicating that the single epilogue is packed into the +/// header +/// F : 1-bit field indicating that the record describes a function fragment +/// (implies that no prologue is present, and prologue processing should be +/// skipped) +/// Epilogue Count : 5-bit field that differs in meaning based on the E field. +/// +/// If E is set, then this field specifies the index of the +/// first unwind code describing the (only) epilogue. +/// +/// Otherwise, this field indicates the number of exception +/// scopes. If more than 31 scopes exist, then this field and +/// the Code Words field must both be set to 0 to indicate that +/// an extension word is required. +/// Code Words : 4-bit field that specifies the number of 32-bit words needed to +/// contain all the unwind codes. If more than 15 words (63 code +/// bytes) are required, then this field and the Epilogue Count +/// field must both be set to 0 to indicate that an extension word +/// is required. +/// Extended Epilogue Count, Extended Code Words : +/// Valid only if Epilog Count and Code Words are both +/// set to 0. Provides an 8-bit extended code word +/// count and 16-bits for epilogue count +/// +/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +----------------+------+---+---+-------------------------------+ +/// | Ep Start Idx | Cond |Res| Epilogue Start Offset | +/// +----------------+------+---+-----------------------------------+ +/// +/// If the E bit is unset in the header, the header is followed by a series of +/// epilogue scopes, which are sorted by their offset.
+/// +/// Epilogue Start Offset: 18-bit field encoding the offset of epilogue relative +/// to the start of the function in bytes divided by two +/// Res : 2-bit field reserved for future expansion (must be set to 0) +/// Condition : 4-bit field providing the condition under which the epilogue is +/// executed. Unconditional epilogues should set this field to 0xe. +/// Epilogues must be entirely conditional or unconditional, and in +/// Thumb-2 mode. The epilogue begins with the first instruction +/// after the IT opcode. +/// Epilogue Start Index : 8-bit field indicating the byte index of the first +/// unwind code describing the epilogue +/// +/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +---------------+---------------+---------------+---------------+ +/// | Unwind Code 3 | Unwind Code 2 | Unwind Code 1 | Unwind Code 0 | +/// +---------------+---------------+---------------+---------------+ +/// +/// Following the epilogue scopes, the byte code describing the unwinding +/// follows. This is padded to align up to word alignment. Bytes are stored in +/// little endian. +/// +/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +---------------------------------------------------------------+ +/// | Exception Handler RVA (requires X = 1) | +/// +---------------------------------------------------------------+ +/// | (possibly followed by data required for exception handler) | +/// +---------------------------------------------------------------+ +/// +/// If the X bit is set in the header, the unwind byte code is followed by the +/// exception handler information. This consists of one Exception Handler RVA +/// which is the address of the exception handler, followed immediately by the +/// variable length data associated with the exception handler. +/// + +struct EpilogueScope { + const support::ulittle32_t ES; + + EpilogueScope(const support::ulittle32_t Data) : ES(Data) {} + uint32_t EpilogueStartOffset() const { + return (ES & 0x0003ffff); + } + uint8_t Res() const { + return ((ES & 0x000c0000) >> 18); + } + uint8_t Condition() const { + return ((ES & 0x00f00000) >> 20); + } + uint8_t EpilogueStartIndex() const { + return ((ES & 0xff000000) >> 24); + } +}; + +struct ExceptionDataRecord; +inline size_t HeaderWords(const ExceptionDataRecord &XR); + +struct ExceptionDataRecord { + const support::ulittle32_t *Data; + + ExceptionDataRecord(const support::ulittle32_t *Data) : Data(Data) {} + + uint32_t FunctionLength() const { + return (Data[0] & 0x0003ffff); + } + + uint8_t Vers() const { + return (Data[0] & 0x000C0000) >> 18; + } + + bool X() const { + return ((Data[0] & 0x00100000) >> 20); + } + + bool E() const { + return ((Data[0] & 0x00200000) >> 21); + } + + bool F() const { + return ((Data[0] & 0x00400000) >> 22); + } + + uint8_t EpilogueCount() const { + if (HeaderWords(*this) == 1) + return (Data[0] & 0x0f800000) >> 23; + return Data[1] & 0x0000ffff; + } + + uint8_t CodeWords() const { + if (HeaderWords(*this) == 1) + return (Data[0] & 0xf0000000) >> 28; + return (Data[1] & 0x00ff0000) >> 16; + } + + ArrayRef EpilogueScopes() const { + assert(E() == 0 && "epilogue scopes are only present when the E bit is 0"); + size_t Offset = HeaderWords(*this); + return ArrayRef(&Data[Offset], EpilogueCount()); + } + + ArrayRef UnwindByteCode() const { + const size_t Offset = HeaderWords(*this) + + (E() ? 
0 : EpilogueCount()); + const support::ulittle8_t *ByteCode = + reinterpret_cast(&Data[Offset]); + return ArrayRef(ByteCode, + CodeWords() * sizeof(uint32_t)); + } + + uint32_t ExceptionHandlerRVA() const { + assert(X() && "Exception Handler RVA is only valid if the X bit is set"); + return Data[HeaderWords(*this) + EpilogueCount() + CodeWords()]; + } + + uint32_t ExceptionHandlerParameter() const { + assert(X() && "Exception Handler RVA is only valid if the X bit is set"); + return Data[HeaderWords(*this) + EpilogueCount() + CodeWords() + 1]; + } +}; + +inline size_t HeaderWords(const ExceptionDataRecord &XR) { + return (XR.Data[0] & 0xff800000) ? 1 : 2; +} +} +} +} + +#endif + diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index 1edcd45bc3bb..25bf32ade5f8 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -61,6 +61,12 @@ #define LLVM_MSC_PREREQ(version) 0 #endif +#ifndef _MSC_VER +#define LLVM_NOEXCEPT noexcept +#else +#define LLVM_NOEXCEPT +#endif + /// \brief Does the compiler support r-value reference *this? /// /// Sadly, this is separate from just r-value reference support because GCC diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h index 282036619c49..a184d0df2132 100644 --- a/include/llvm/Support/ConvertUTF.h +++ b/include/llvm/Support/ConvertUTF.h @@ -136,7 +136,19 @@ ConversionResult ConvertUTF8toUTF16 ( const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF8toUTF32 ( +/** + * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an + * incomplete code unit sequence, returns \c sourceExhausted. + */ +ConversionResult ConvertUTF8toUTF32Partial( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +/** + * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an + * incomplete code unit sequence, returns \c sourceIllegal. + */ +ConversionResult ConvertUTF8toUTF32( const UTF8** sourceStart, const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); diff --git a/include/llvm/Support/DataTypes.h.cmake b/include/llvm/Support/DataTypes.h.cmake index a26070cdc9fe..1f0c8eba5e11 100644 --- a/include/llvm/Support/DataTypes.h.cmake +++ b/include/llvm/Support/DataTypes.h.cmake @@ -37,6 +37,16 @@ #include #endif +#ifdef HAVE_INTTYPES_H +#include +#endif + +#ifdef HAVE_STDINT_H +#include +#else +#error "Compiler must provide an implementation of stdint.h" +#endif + #ifndef _MSC_VER /* Note that this header's correct operation depends on __STDC_LIMIT_MACROS @@ -55,14 +65,6 @@ /* Note that includes , if this is a C99 system. */ #include -#ifdef HAVE_INTTYPES_H -#include -#endif - -#ifdef HAVE_STDINT_H -#include -#endif - #ifdef _AIX #include "llvm/Support/AIXDataTypesFix.h" #endif @@ -77,11 +79,6 @@ typedef u_int64_t uint64_t; #endif #else /* _MSC_VER */ -/* Visual C++ doesn't provide standard integer headers, but it does provide - built-in data types. 
*/ -#ifdef HAVE_STDINT_H -#include -#endif #include #include #include @@ -90,93 +87,21 @@ typedef u_int64_t uint64_t; #else #include #endif -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef short int16_t; -typedef unsigned short uint16_t; -typedef signed char int8_t; -typedef unsigned char uint8_t; + #if defined(_WIN64) - typedef signed __int64 ssize_t; +typedef signed __int64 ssize_t; #else - typedef signed int ssize_t; -#endif -#ifndef INT8_MAX -# define INT8_MAX 127 -#endif -#ifndef INT8_MIN -# define INT8_MIN -128 -#endif -#ifndef UINT8_MAX -# define UINT8_MAX 255 -#endif -#ifndef INT16_MAX -# define INT16_MAX 32767 -#endif -#ifndef INT16_MIN -# define INT16_MIN -32768 -#endif -#ifndef UINT16_MAX -# define UINT16_MAX 65535 -#endif -#ifndef INT32_MAX -# define INT32_MAX 2147483647 -#endif -#ifndef INT32_MIN -/* MSC treats -2147483648 as -(2147483648U). */ -# define INT32_MIN (-INT32_MAX - 1) -#endif -#ifndef UINT32_MAX -# define UINT32_MAX 4294967295U -#endif -/* Certain compatibility updates to VC++ introduce the `cstdint' - * header, which defines the INT*_C macros. On default installs they - * are absent. */ -#ifndef INT8_C -# define INT8_C(C) C##i8 -#endif -#ifndef UINT8_C -# define UINT8_C(C) C##ui8 -#endif -#ifndef INT16_C -# define INT16_C(C) C##i16 -#endif -#ifndef UINT16_C -# define UINT16_C(C) C##ui16 -#endif -#ifndef INT32_C -# define INT32_C(C) C##i32 -#endif -#ifndef UINT32_C -# define UINT32_C(C) C##ui32 -#endif -#ifndef INT64_C -# define INT64_C(C) C##i64 -#endif -#ifndef UINT64_C -# define UINT64_C(C) C##ui64 -#endif - -#ifndef PRId64 -# define PRId64 "I64d" -#endif -#ifndef PRIi64 -# define PRIi64 "I64i" -#endif -#ifndef PRIo64 -# define PRIo64 "I64o" -#endif -#ifndef PRIu64 -# define PRIu64 "I64u" -#endif -#ifndef PRIx64 -# define PRIx64 "I64x" -#endif -#ifndef PRIX64 -# define PRIX64 "I64X" -#endif +typedef signed int ssize_t; +#endif /* _WIN64 */ + +#ifndef HAVE_INTTYPES_H +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIo64 "I64o" +#define PRIu64 "I64u" +#define PRIx64 "I64x" +#define PRIX64 "I64X" +#endif /* HAVE_INTTYPES_H */ #endif /* _MSC_VER */ diff --git a/include/llvm/Support/DataTypes.h.in b/include/llvm/Support/DataTypes.h.in index 7fc9b725244e..09cfcdf3b56b 100644 --- a/include/llvm/Support/DataTypes.h.in +++ b/include/llvm/Support/DataTypes.h.in @@ -37,6 +37,16 @@ #include #endif +#ifdef HAVE_INTTYPES_H +#include +#endif + +#ifdef HAVE_STDINT_H +#include +#else +#error "Compiler must provide an implementation of stdint.h" +#endif + #ifndef _MSC_VER /* Note that this header's correct operation depends on __STDC_LIMIT_MACROS @@ -55,14 +65,6 @@ /* Note that includes , if this is a C99 system. */ #include -#ifdef HAVE_INTTYPES_H -#include -#endif - -#ifdef HAVE_STDINT_H -#include -#endif - #ifdef _AIX #include "llvm/Support/AIXDataTypesFix.h" #endif @@ -77,8 +79,6 @@ typedef u_int64_t uint64_t; #endif #else /* _MSC_VER */ -/* Visual C++ doesn't provide standard integer headers, but it does provide - built-in data types. 
*/ #include #include #include @@ -87,94 +87,21 @@ typedef u_int64_t uint64_t; #else #include #endif -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef short int16_t; -typedef unsigned short uint16_t; -typedef signed char int8_t; -typedef unsigned char uint8_t; + #if defined(_WIN64) - typedef signed __int64 ssize_t; +typedef signed __int64 ssize_t; #else - typedef signed int ssize_t; -#endif - -#ifndef INT8_MAX -# define INT8_MAX 127 -#endif -#ifndef INT8_MIN -# define INT8_MIN -128 -#endif -#ifndef UINT8_MAX -# define UINT8_MAX 255 -#endif -#ifndef INT16_MAX -# define INT16_MAX 32767 -#endif -#ifndef INT16_MIN -# define INT16_MIN -32768 -#endif -#ifndef UINT16_MAX -# define UINT16_MAX 65535 -#endif -#ifndef INT32_MAX -# define INT32_MAX 2147483647 -#endif -#ifndef INT32_MIN -/* MSC treats -2147483648 as -(2147483648U). */ -# define INT32_MIN (-INT32_MAX - 1) -#endif -#ifndef UINT32_MAX -# define UINT32_MAX 4294967295U -#endif -/* Certain compatibility updates to VC++ introduce the `cstdint' - * header, which defines the INT*_C macros. On default installs they - * are absent. */ -#ifndef INT8_C -# define INT8_C(C) C##i8 -#endif -#ifndef UINT8_C -# define UINT8_C(C) C##ui8 -#endif -#ifndef INT16_C -# define INT16_C(C) C##i16 -#endif -#ifndef UINT16_C -# define UINT16_C(C) C##ui16 -#endif -#ifndef INT32_C -# define INT32_C(C) C##i32 -#endif -#ifndef UINT32_C -# define UINT32_C(C) C##ui32 -#endif -#ifndef INT64_C -# define INT64_C(C) C##i64 -#endif -#ifndef UINT64_C -# define UINT64_C(C) C##ui64 -#endif - -#ifndef PRId64 -# define PRId64 "I64d" -#endif -#ifndef PRIi64 -# define PRIi64 "I64i" -#endif -#ifndef PRIo64 -# define PRIo64 "I64o" -#endif -#ifndef PRIu64 -# define PRIu64 "I64u" -#endif -#ifndef PRIx64 -# define PRIx64 "I64x" -#endif -#ifndef PRIX64 -# define PRIX64 "I64X" -#endif +typedef signed int ssize_t; +#endif /* _WIN64 */ + +#ifndef HAVE_INTTYPES_H +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIo64 "I64o" +#define PRIu64 "I64u" +#define PRIx64 "I64x" +#define PRIX64 "I64X" +#endif /* HAVE_INTTYPES_H */ #endif /* _MSC_VER */ diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index 0b3e55b91524..31b34ffa703e 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -124,6 +124,8 @@ enum { }; // Machine architectures +// See current registered ELF machine architectures at: +// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html enum { EM_NONE = 0, // No machine EM_M32 = 1, // AT&T WE 32100 @@ -287,7 +289,26 @@ enum { EM_RL78 = 197, // Renesas RL78 family EM_VIDEOCORE5 = 198, // Broadcom VideoCore V processor EM_78KOR = 199, // Renesas 78KOR family - EM_56800EX = 200 // Freescale 56800EX Digital Signal Controller (DSC) + EM_56800EX = 200, // Freescale 56800EX Digital Signal Controller (DSC) + EM_BA1 = 201, // Beyond BA1 CPU architecture + EM_BA2 = 202, // Beyond BA2 CPU architecture + EM_XCORE = 203, // XMOS xCORE processor family + EM_MCHP_PIC = 204, // Microchip 8-bit PIC(r) family + EM_INTEL205 = 205, // Reserved by Intel + EM_INTEL206 = 206, // Reserved by Intel + EM_INTEL207 = 207, // Reserved by Intel + EM_INTEL208 = 208, // Reserved by Intel + EM_INTEL209 = 209, // Reserved by Intel + EM_KM32 = 210, // KM211 KM32 32-bit processor + EM_KMX32 = 211, // KM211 KMX32 32-bit processor + EM_KMX16 = 212, // KM211 KMX16 16-bit processor + EM_KMX8 = 213, // KM211 KMX8 8-bit processor + EM_KVARC = 214, // KM211 KVARC processor + EM_CDP = 215, // 
Paneve CDP architecture family + EM_COGE = 216, // Cognitive Smart Memory Processor + EM_COOL = 217, // iCelero CoolEngine + EM_NORC = 218, // Nanoradio Optimized RISC + EM_CSR_KALIMBA = 219 // CSR Kalimba architecture family }; // Object file classes. diff --git a/include/llvm/Support/Endian.h b/include/llvm/Support/Endian.h index 2c5ab74aa367..455d0fc241f2 100644 --- a/include/llvm/Support/Endian.h +++ b/include/llvm/Support/Endian.h @@ -38,7 +38,7 @@ namespace endian { template inline value_type byte_swap(value_type value) { if (endian != native && sys::IsBigEndianHost != (endian == big)) - return sys::SwapByteOrder(value); + sys::swapByteOrder(value); return value; } diff --git a/include/llvm/Support/Errc.h b/include/llvm/Support/Errc.h new file mode 100644 index 000000000000..80bfe2ac2ee5 --- /dev/null +++ b/include/llvm/Support/Errc.h @@ -0,0 +1,86 @@ +//===- llvm/Support/Errc.h - Defines the llvm::errc enum --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// While std::error_code works OK on all platforms we use, there are some +// problems with std::errc that can be avoided by using our own +// enumeration: +// +// * std::errc is a namespace in some implementations. That means that ADL +// doesn't work and it is sometimes necessary to write std::make_error_code +// or in templates: +// using std::make_error_code; +// make_error_code(...); +// +// with this enum it is safe to always just use make_error_code. +// +// * Some implementations define fewer names than others. This header has +// the intersection of all the ones we support. +// +// * std::errc is just marked with is_error_condition_enum. This means that +// common patterns like AnErrorCode == errc::no_such_file_or_directory take +// 4 virtual calls instead of two comparisons. 
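To make the rationale above concrete, here is a minimal usage sketch (the helper functions are hypothetical and not part of this header): because std::is_error_code_enum is specialized for llvm::errc below, an unqualified make_error_code call is found by ADL, and an == test against a std::error_code is an ordinary error_code comparison.

#include "llvm/Support/Errc.h"

// Build an error_code without naming std::make_error_code; ADL finds
// llvm::make_error_code from the llvm::errc argument.
std::error_code fileMissing() {
  return make_error_code(llvm::errc::no_such_file_or_directory);
}

// The enum converts implicitly to std::error_code, so this compares the
// category pointer and value directly rather than going through
// error_condition's virtual equivalence machinery.
bool isFileMissing(std::error_code EC) {
  return EC == llvm::errc::no_such_file_or_directory;
}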
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_ERRC_H +#define LLVM_SUPPORT_ERRC_H + +#include + +namespace llvm { +enum class errc { + argument_list_too_long = int(std::errc::argument_list_too_long), + argument_out_of_domain = int(std::errc::argument_out_of_domain), + bad_address = int(std::errc::bad_address), + bad_file_descriptor = int(std::errc::bad_file_descriptor), + broken_pipe = int(std::errc::broken_pipe), + device_or_resource_busy = int(std::errc::device_or_resource_busy), + directory_not_empty = int(std::errc::directory_not_empty), + executable_format_error = int(std::errc::executable_format_error), + file_exists = int(std::errc::file_exists), + file_too_large = int(std::errc::file_too_large), + filename_too_long = int(std::errc::filename_too_long), + function_not_supported = int(std::errc::function_not_supported), + illegal_byte_sequence = int(std::errc::illegal_byte_sequence), + inappropriate_io_control_operation = + int(std::errc::inappropriate_io_control_operation), + interrupted = int(std::errc::interrupted), + invalid_argument = int(std::errc::invalid_argument), + invalid_seek = int(std::errc::invalid_seek), + io_error = int(std::errc::io_error), + is_a_directory = int(std::errc::is_a_directory), + no_child_process = int(std::errc::no_child_process), + no_lock_available = int(std::errc::no_lock_available), + no_space_on_device = int(std::errc::no_space_on_device), + no_such_device_or_address = int(std::errc::no_such_device_or_address), + no_such_device = int(std::errc::no_such_device), + no_such_file_or_directory = int(std::errc::no_such_file_or_directory), + no_such_process = int(std::errc::no_such_process), + not_a_directory = int(std::errc::not_a_directory), + not_enough_memory = int(std::errc::not_enough_memory), + operation_not_permitted = int(std::errc::operation_not_permitted), + permission_denied = int(std::errc::permission_denied), + read_only_file_system = int(std::errc::read_only_file_system), + resource_deadlock_would_occur = int(std::errc::resource_deadlock_would_occur), + resource_unavailable_try_again = + int(std::errc::resource_unavailable_try_again), + result_out_of_range = int(std::errc::result_out_of_range), + too_many_files_open_in_system = int(std::errc::too_many_files_open_in_system), + too_many_files_open = int(std::errc::too_many_files_open), + too_many_links = int(std::errc::too_many_links) +}; + +inline std::error_code make_error_code(errc E) { + return std::error_code(static_cast(E), std::generic_category()); +} +} + +namespace std { +template <> struct is_error_code_enum : std::true_type {}; +} +#endif diff --git a/include/llvm/Support/ErrorHandling.h b/include/llvm/Support/ErrorHandling.h index ac3a4d86469a..9afd52d1abc7 100644 --- a/include/llvm/Support/ErrorHandling.h +++ b/include/llvm/Support/ErrorHandling.h @@ -30,9 +30,6 @@ namespace llvm { /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. /// - /// If you are using llvm_start_multithreaded, you should register the handler - /// before doing that. - /// /// If no error handler is installed the default is to print the error message /// to stderr, and call exit(1). If an error handler is installed then it is /// the handler's responsibility to log the message, it will no longer be @@ -50,8 +47,6 @@ namespace llvm { void *user_data = nullptr); /// Restores default error handling behaviour. 
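[editor's note] The new llvm::errc enum above is intended to be reached through plain make_error_code calls. A minimal sketch (not part of the patch) of the ADL point made in the header comment: because make_error_code(errc) lives in namespace llvm, the unqualified call resolves without a using-declaration, and the result is an ordinary std::error_code that still compares against the enum.

#include "llvm/Support/Errc.h"
#include <system_error>

static std::error_code checkPresent(bool Exists) {
  if (!Exists)
    return make_error_code(llvm::errc::no_such_file_or_directory); // found by ADL
  return std::error_code(); // success
}

static bool isMissing(std::error_code EC) {
  // Compares value and category directly, as the header comment describes.
  return EC == llvm::errc::no_such_file_or_directory;
}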
- /// This must not be called between llvm_start_multithreaded() and - /// llvm_stop_multithreaded(). void remove_fatal_error_handler(); /// ScopedFatalErrorHandler - This is a simple helper class which just diff --git a/include/llvm/Support/ErrorOr.h b/include/llvm/Support/ErrorOr.h index becd95780e44..0742a2d06f71 100644 --- a/include/llvm/Support/ErrorOr.h +++ b/include/llvm/Support/ErrorOr.h @@ -18,8 +18,8 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/Support/AlignOf.h" -#include "llvm/Support/system_error.h" #include +#include #include namespace llvm { @@ -94,15 +94,16 @@ class ErrorOr { public: template - ErrorOr(E ErrorCode, typename std::enable_if::value || - is_error_condition_enum::value, - void *>::type = 0) + ErrorOr(E ErrorCode, + typename std::enable_if::value || + std::is_error_condition_enum::value, + void *>::type = 0) : HasError(true) { - new (getErrorStorage()) error_code(make_error_code(ErrorCode)); + new (getErrorStorage()) std::error_code(make_error_code(ErrorCode)); } - ErrorOr(llvm::error_code EC) : HasError(true) { - new (getErrorStorage()) error_code(EC); + ErrorOr(std::error_code EC) : HasError(true) { + new (getErrorStorage()) std::error_code(EC); } ErrorOr(T Val) : HasError(false) { @@ -162,8 +163,8 @@ class ErrorOr { reference get() { return *getStorage(); } const_reference get() const { return const_cast >(this)->get(); } - error_code getError() const { - return HasError ? *getErrorStorage() : error_code::success(); + std::error_code getError() const { + return HasError ? *getErrorStorage() : std::error_code(); } pointer operator ->() { @@ -184,7 +185,7 @@ class ErrorOr { } else { // Get other's error. HasError = true; - new (getErrorStorage()) error_code(Other.getError()); + new (getErrorStorage()) std::error_code(Other.getError()); } } @@ -216,7 +217,7 @@ class ErrorOr { } else { // Get other's error. HasError = true; - new (getErrorStorage()) error_code(Other.getError()); + new (getErrorStorage()) std::error_code(Other.getError()); } } @@ -247,28 +248,29 @@ class ErrorOr { return reinterpret_cast(TStorage.buffer); } - error_code *getErrorStorage() { + std::error_code *getErrorStorage() { assert(HasError && "Cannot get error when a value exists!"); - return reinterpret_cast(ErrorStorage.buffer); + return reinterpret_cast(ErrorStorage.buffer); } - const error_code *getErrorStorage() const { + const std::error_code *getErrorStorage() const { return const_cast *>(this)->getErrorStorage(); } union { AlignedCharArrayUnion TStorage; - AlignedCharArrayUnion ErrorStorage; + AlignedCharArrayUnion ErrorStorage; }; bool HasError : 1; }; -template -typename std::enable_if::value || - is_error_condition_enum::value, bool>::type -operator ==(ErrorOr &Err, E Code) { - return error_code(Err) == Code; +template +typename std::enable_if::value || + std::is_error_condition_enum::value, + bool>::type +operator==(ErrorOr &Err, E Code) { + return std::error_code(Err) == Code; } } // end namespace llvm diff --git a/include/llvm/Support/FEnv.h b/include/llvm/Support/FEnv.h deleted file mode 100644 index 8560ee0a8afe..000000000000 --- a/include/llvm/Support/FEnv.h +++ /dev/null @@ -1,56 +0,0 @@ -//===- llvm/Support/FEnv.h - Host floating-point exceptions ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
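[editor's note] With ErrorOr now backed by std::error_code, client code reads as sketched below; parseValue() is a hypothetical helper used only to show the calling convention, not something introduced by this patch.

#include "llvm/Support/ErrorOr.h"
#include <system_error>

static llvm::ErrorOr<int> parseValue(bool Ok) {
  if (!Ok)
    return std::errc::invalid_argument; // accepted via is_error_condition_enum
  return 42;
}

static int getOrZero(bool Ok) {
  llvm::ErrorOr<int> V = parseValue(Ok);
  if (std::error_code EC = V.getError())
    return 0; // EC.message() describes the failure
  return *V;
}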
-// -//===----------------------------------------------------------------------===// -// -// This file provides an operating system independent interface to -// floating-point exception interfaces. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_FENV_H -#define LLVM_SUPPORT_FENV_H - -#include "llvm/Config/config.h" -#include -#ifdef HAVE_FENV_H -#include -#endif - -// FIXME: Clang's #include handling apparently doesn't work for libstdc++'s -// fenv.h; see PR6907 for details. -#if defined(__clang__) && defined(_GLIBCXX_FENV_H) -#undef HAVE_FENV_H -#endif - -namespace llvm { -namespace sys { - -/// llvm_fenv_clearexcept - Clear the floating-point exception state. -static inline void llvm_fenv_clearexcept() { -#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT - feclearexcept(FE_ALL_EXCEPT); -#endif - errno = 0; -} - -/// llvm_fenv_testexcept - Test if a floating-point exception was raised. -static inline bool llvm_fenv_testexcept() { - int errno_val = errno; - if (errno_val == ERANGE || errno_val == EDOM) - return true; -#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT && HAVE_DECL_FE_INEXACT - if (fetestexcept(FE_ALL_EXCEPT & ~FE_INEXACT)) - return true; -#endif - return false; -} - -} // End sys namespace -} // End llvm namespace - -#endif diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h index a8a48fa3fece..0a9a97995156 100644 --- a/include/llvm/Support/FileOutputBuffer.h +++ b/include/llvm/Support/FileOutputBuffer.h @@ -20,8 +20,6 @@ #include "llvm/Support/FileSystem.h" namespace llvm { -class error_code; - /// FileOutputBuffer - This interface provides simple way to create an in-memory /// buffer which will be written to a file. During the lifetime of these /// objects, the content or existence of the specified file is undefined. That @@ -39,9 +37,9 @@ class FileOutputBuffer { /// Factory method to create an OutputBuffer object which manages a read/write /// buffer of the specified size. When committed, the buffer will be written /// to the file at the specified path. - static error_code create(StringRef FilePath, size_t Size, - std::unique_ptr &Result, - unsigned Flags = 0); + static std::error_code create(StringRef FilePath, size_t Size, + std::unique_ptr &Result, + unsigned Flags = 0); /// Returns a pointer to the start of the buffer. uint8_t *getBufferStart() { @@ -68,7 +66,7 @@ class FileOutputBuffer { /// is called, the file is deleted in the destructor. The optional parameter /// is used if it turns out you want the file size to be smaller than /// initially requested. - error_code commit(int64_t NewSmallerSize = -1); + std::error_code commit(int64_t NewSmallerSize = -1); /// If this object was previously committed, the destructor just deletes /// this object. If this object was not committed, the destructor diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index 806a3e376ccc..6abe90446b47 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -33,11 +33,11 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeValue.h" -#include "llvm/Support/system_error.h" #include #include #include #include +#include #include #include @@ -49,26 +49,18 @@ namespace llvm { namespace sys { namespace fs { -/// An "enum class" enumeration for the file system's view of the type. 
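[editor's note] FileOutputBuffer keeps the same create/commit shape above and only changes the error type. A minimal usage sketch, assuming an arbitrary payload:

#include "llvm/Support/FileOutputBuffer.h"
#include <cstring>
#include <memory>

static std::error_code writeBlob(llvm::StringRef Path, llvm::StringRef Data) {
  std::unique_ptr<llvm::FileOutputBuffer> Buf;
  if (std::error_code EC = llvm::FileOutputBuffer::create(Path, Data.size(), Buf))
    return EC;
  std::memcpy(Buf->getBufferStart(), Data.data(), Data.size());
  return Buf->commit(); // flushes the buffer to Path
}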
-struct file_type { - enum Impl { - status_error, - file_not_found, - regular_file, - directory_file, - symlink_file, - block_file, - character_file, - fifo_file, - socket_file, - type_unknown - }; - - file_type(Impl V) : V(V) {} - operator Impl() const { return V; } - -private: - Impl V; +/// An enumeration for the file system's view of the type. +enum class file_type { + status_error, + file_not_found, + regular_file, + directory_file, + symlink_file, + block_file, + character_file, + fifo_file, + socket_file, + type_unknown }; /// space_info - Self explanatory. @@ -282,7 +274,7 @@ struct file_magic { /// @param path A path that is modified to be an absolute path. /// @returns errc::success if \a path has been made absolute, otherwise a /// platform specific error_code. -error_code make_absolute(SmallVectorImpl &path); +std::error_code make_absolute(SmallVectorImpl &path); /// @brief Normalize path separators in \a Path /// @@ -290,7 +282,7 @@ error_code make_absolute(SmallVectorImpl &path); /// This is particularly useful when cross-compiling Windows on Linux, but is /// safe to invoke on Windows, which accepts both characters as a path /// separator. -error_code normalize_separators(SmallVectorImpl &Path); +std::error_code normalize_separators(SmallVectorImpl &Path); /// @brief Create all the non-existent directories in path. /// @@ -298,7 +290,8 @@ error_code normalize_separators(SmallVectorImpl &Path); /// @returns errc::success if is_directory(path), otherwise a platform /// specific error_code. If IgnoreExisting is false, also returns /// error if the directory already existed. -error_code create_directories(const Twine &path, bool IgnoreExisting = true); +std::error_code create_directories(const Twine &path, + bool IgnoreExisting = true); /// @brief Create the directory in path. /// @@ -306,7 +299,7 @@ error_code create_directories(const Twine &path, bool IgnoreExisting = true); /// @returns errc::success if is_directory(path), otherwise a platform /// specific error_code. If IgnoreExisting is false, also returns /// error if the directory already existed. -error_code create_directory(const Twine &path, bool IgnoreExisting = true); +std::error_code create_directory(const Twine &path, bool IgnoreExisting = true); /// @brief Create a link from \a from to \a to. /// @@ -319,14 +312,14 @@ error_code create_directory(const Twine &path, bool IgnoreExisting = true); /// @param from The path to hard link from. This is created. /// @returns errc::success if the link was created, otherwise a platform /// specific error_code. -error_code create_link(const Twine &to, const Twine &from); +std::error_code create_link(const Twine &to, const Twine &from); /// @brief Get the current path. /// /// @param result Holds the current path on return. /// @returns errc::success if the current path has been stored in result, /// otherwise a platform specific error_code. -error_code current_path(SmallVectorImpl &result); +std::error_code current_path(SmallVectorImpl &result); /// @brief Remove path. Equivalent to POSIX remove(). /// @@ -334,13 +327,13 @@ error_code current_path(SmallVectorImpl &result); /// @returns errc::success if path has been removed or didn't exist, otherwise a /// platform specific error code. If IgnoreNonExisting is false, also /// returns error if the file didn't exist. -error_code remove(const Twine &path, bool IgnoreNonExisting = true); +std::error_code remove(const Twine &path, bool IgnoreNonExisting = true); /// @brief Rename \a from to \a to. 
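[editor's note] The directory-creation helpers above now hand back std::error_code directly; a short sketch of the intended call pattern (the path is arbitrary):

#include "llvm/Support/FileSystem.h"

static std::error_code ensureOutputDir(const llvm::Twine &Path) {
  // Creates every missing component; an already-existing directory is not an
  // error because IgnoreExisting defaults to true.
  return llvm::sys::fs::create_directories(Path);
}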
Files are renamed as if by POSIX rename(). /// /// @param from The path to rename from. /// @param to The path to rename to. This is created. -error_code rename(const Twine &from, const Twine &to); +std::error_code rename(const Twine &from, const Twine &to); /// @brief Resize path to size. File is resized as if by POSIX truncate(). /// @@ -348,7 +341,7 @@ error_code rename(const Twine &from, const Twine &to); /// @param size Size to resize to. /// @returns errc::success if \a path has been resized to \a size, otherwise a /// platform specific error_code. -error_code resize_file(const Twine &path, uint64_t size); +std::error_code resize_file(const Twine &path, uint64_t size); /// @} /// @name Physical Observers @@ -368,7 +361,7 @@ bool exists(file_status status); /// it does not. Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code exists(const Twine &path, bool &result); +std::error_code exists(const Twine &path, bool &result); /// @brief Simpler version of exists for clients that don't need to /// differentiate between an error and false. @@ -410,7 +403,7 @@ bool equivalent(file_status A, file_status B); /// inode (or equivalent). /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code equivalent(const Twine &A, const Twine &B, bool &result); +std::error_code equivalent(const Twine &A, const Twine &B, bool &result); /// @brief Simpler version of equivalent for clients that don't need to /// differentiate between an error and false. @@ -432,7 +425,7 @@ bool is_directory(file_status status); /// Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code is_directory(const Twine &path, bool &result); +std::error_code is_directory(const Twine &path, bool &result); /// @brief Simpler version of is_directory for clients that don't need to /// differentiate between an error and false. @@ -454,7 +447,7 @@ bool is_regular_file(file_status status); /// Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code is_regular_file(const Twine &path, bool &result); +std::error_code is_regular_file(const Twine &path, bool &result); /// @brief Simpler version of is_regular_file for clients that don't need to /// differentiate between an error and false. @@ -480,7 +473,7 @@ bool is_other(file_status status); /// file, or a symlink, false if it does not. Undefined otherwise. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code is_other(const Twine &path, bool &result); +std::error_code is_other(const Twine &path, bool &result); /// @brief Get file status as if by POSIX stat(). /// @@ -488,10 +481,10 @@ error_code is_other(const Twine &path, bool &result); /// @param result Set to the file status. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code status(const Twine &path, file_status &result); +std::error_code status(const Twine &path, file_status &result); /// @brief A version for when a file descriptor is already available. -error_code status(int FD, file_status &Result); +std::error_code status(int FD, file_status &Result); /// @brief Get file size. 
/// @@ -499,13 +492,13 @@ error_code status(int FD, file_status &Result); /// @param Result Set to the size of the file in \a Path. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -inline error_code file_size(const Twine &Path, uint64_t &Result) { +inline std::error_code file_size(const Twine &Path, uint64_t &Result) { file_status Status; - error_code EC = status(Path, Status); + std::error_code EC = status(Path, Status); if (EC) return EC; Result = Status.getSize(); - return error_code::success(); + return std::error_code(); } /// @brief Set the file modification and access time. @@ -513,7 +506,7 @@ inline error_code file_size(const Twine &Path, uint64_t &Result) { /// @returns errc::success if the file times were successfully set, otherwise a /// platform specific error_code or errc::not_supported on platforms /// where the functionality isn't available. -error_code setLastModificationAndAccessTime(int FD, TimeValue Time); +std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time); /// @brief Is status available? /// @@ -527,7 +520,7 @@ bool status_known(file_status s); /// @param result Set to true if status() != status_error. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code status_known(const Twine &path, bool &result); +std::error_code status_known(const Twine &path, bool &result); /// @brief Create a uniquely named file. /// @@ -550,13 +543,13 @@ error_code status_known(const Twine &path, bool &result); /// @param ResultPath Set to the opened file's absolute path. /// @returns errc::success if Result{FD,Path} have been successfully set, /// otherwise a platform specific error_code. -error_code createUniqueFile(const Twine &Model, int &ResultFD, - SmallVectorImpl &ResultPath, - unsigned Mode = all_read | all_write); +std::error_code createUniqueFile(const Twine &Model, int &ResultFD, + SmallVectorImpl &ResultPath, + unsigned Mode = all_read | all_write); /// @brief Simpler version for clients that don't want an open file. -error_code createUniqueFile(const Twine &Model, - SmallVectorImpl &ResultPath); +std::error_code createUniqueFile(const Twine &Model, + SmallVectorImpl &ResultPath); /// @brief Create a file in the system temporary directory. /// @@ -566,16 +559,16 @@ error_code createUniqueFile(const Twine &Model, /// /// This should be used for things like a temporary .s that is removed after /// running the assembler. -error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, - int &ResultFD, - SmallVectorImpl &ResultPath); +std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, + int &ResultFD, + SmallVectorImpl &ResultPath); /// @brief Simpler version for clients that don't want an open file. 
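[editor's note] file_size() above shows the typical shape of these wrappers: call status(), return its error code on failure, otherwise extract the field and return a default-constructed (success) std::error_code. A hedged caller-side sketch:

#include "llvm/Support/FileSystem.h"

static bool isNonEmpty(const llvm::Twine &Path) {
  uint64_t Size;
  if (llvm::sys::fs::file_size(Path, Size))
    return false; // treat errors as "not known to be non-empty"
  return Size != 0;
}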
-error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, - SmallVectorImpl &ResultPath); +std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, + SmallVectorImpl &ResultPath); -error_code createUniqueDirectory(const Twine &Prefix, - SmallVectorImpl &ResultPath); +std::error_code createUniqueDirectory(const Twine &Prefix, + SmallVectorImpl &ResultPath); enum OpenFlags : unsigned { F_None = 0, @@ -606,31 +599,10 @@ inline OpenFlags &operator|=(OpenFlags &A, OpenFlags B) { return A; } -error_code openFileForWrite(const Twine &Name, int &ResultFD, OpenFlags Flags, - unsigned Mode = 0666); - -error_code openFileForRead(const Twine &Name, int &ResultFD); - -/// @brief Are \a path's first bytes \a magic? -/// -/// @param path Input path. -/// @param magic Byte sequence to compare \a path's first len(magic) bytes to. -/// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. -error_code has_magic(const Twine &path, const Twine &magic, bool &result); +std::error_code openFileForWrite(const Twine &Name, int &ResultFD, + OpenFlags Flags, unsigned Mode = 0666); -/// @brief Get \a path's first \a len bytes. -/// -/// @param path Input path. -/// @param len Number of magic bytes to get. -/// @param result Set to the first \a len bytes in the file pointed to by -/// \a path. Or the entire file if file_size(path) < len, in which -/// case result.size() returns the size of the file. -/// @returns errc::success if result has been successfully set, -/// errc::value_too_large if len is larger then the file pointed to by -/// \a path, otherwise a platform specific error_code. -error_code get_magic(const Twine &path, uint32_t len, - SmallVectorImpl &result); +std::error_code openFileForRead(const Twine &Name, int &ResultFD); /// @brief Identify the type of a binary file based on how magical it is. file_magic identify_magic(StringRef magic); @@ -641,9 +613,9 @@ file_magic identify_magic(StringRef magic); /// @param result Set to the type of file, or file_magic::unknown. /// @returns errc::success if result has been successfully set, otherwise a /// platform specific error_code. -error_code identify_magic(const Twine &path, file_magic &result); +std::error_code identify_magic(const Twine &path, file_magic &result); -error_code getUniqueID(const Twine Path, UniqueID &Result); +std::error_code getUniqueID(const Twine Path, UniqueID &Result); /// This class represents a memory mapped file. It is based on /// boost::iostreams::mapped_file. @@ -670,7 +642,7 @@ class mapped_file_region { void *FileMappingHandle; #endif - error_code init(int FD, bool CloseFD, uint64_t Offset); + std::error_code init(int FD, bool CloseFD, uint64_t Offset); public: typedef char char_type; @@ -692,21 +664,14 @@ class mapped_file_region { /// mapped_file_region::alignment(). /// \param ec This is set to errc::success if the map was constructed /// successfully. Otherwise it is set to a platform dependent error. - mapped_file_region(const Twine &path, - mapmode mode, - uint64_t length, - uint64_t offset, - error_code &ec); + mapped_file_region(const Twine &path, mapmode mode, uint64_t length, + uint64_t offset, std::error_code &ec); /// \param fd An open file descriptor to map. mapped_file_region takes /// ownership if closefd is true. It must have been opended in the correct /// mode. 
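[editor's note] openFileForWrite/openFileForRead above keep their signatures apart from the error type. A sketch of opening for append, assuming the F_Append member of OpenFlags (the enum body is truncated in the hunk above):

#include "llvm/Support/FileSystem.h"

static std::error_code openLogForAppend(const llvm::Twine &Path, int &FD) {
  using namespace llvm::sys::fs;
  return openFileForWrite(Path, FD, F_Append, /*Mode=*/0644);
}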
- mapped_file_region(int fd, - bool closefd, - mapmode mode, - uint64_t length, - uint64_t offset, - error_code &ec); + mapped_file_region(int fd, bool closefd, mapmode mode, uint64_t length, + uint64_t offset, std::error_code &ec); ~mapped_file_region(); @@ -722,30 +687,6 @@ class mapped_file_region { static int alignment(); }; -/// @brief Memory maps the contents of a file -/// -/// @param path Path to file to map. -/// @param file_offset Byte offset in file where mapping should begin. -/// @param size Byte length of range of the file to map. -/// @param map_writable If true, the file will be mapped in r/w such -/// that changes to the mapped buffer will be flushed back -/// to the file. If false, the file will be mapped read-only -/// and the buffer will be read-only. -/// @param result Set to the start address of the mapped buffer. -/// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. -error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, - bool map_writable, void *&result); - - -/// @brief Memory unmaps the contents of a file -/// -/// @param base Pointer to the start of the buffer. -/// @param size Byte length of the range to unmmap. -/// @returns errc::success if result has been successfully set, otherwise a -/// platform specific error_code. -error_code unmap_file_pages(void *base, size_t size); - /// Return the path to the main executable, given the value of argv[0] from /// program startup and the address of main itself. In extremis, this function /// may fail and return an empty path. @@ -777,7 +718,7 @@ class directory_entry { void replace_filename(const Twine &filename, file_status st = file_status()); const std::string &path() const { return Path; } - error_code status(file_status &result) const; + std::error_code status(file_status &result) const; bool operator==(const directory_entry& rhs) const { return Path == rhs.Path; } bool operator!=(const directory_entry& rhs) const { return !(*this == rhs); } @@ -790,9 +731,9 @@ class directory_entry { namespace detail { struct DirIterState; - error_code directory_iterator_construct(DirIterState&, StringRef); - error_code directory_iterator_increment(DirIterState&); - error_code directory_iterator_destruct(DirIterState&); + std::error_code directory_iterator_construct(DirIterState &, StringRef); + std::error_code directory_iterator_increment(DirIterState &); + std::error_code directory_iterator_destruct(DirIterState &); /// DirIterState - Keeps state for the directory_iterator. It is reference /// counted in order to preserve InputIterator semantics on copy. @@ -816,14 +757,14 @@ class directory_iterator { IntrusiveRefCntPtr State; public: - explicit directory_iterator(const Twine &path, error_code &ec) { + explicit directory_iterator(const Twine &path, std::error_code &ec) { State = new detail::DirIterState; SmallString<128> path_storage; ec = detail::directory_iterator_construct(*State, path.toStringRef(path_storage)); } - explicit directory_iterator(const directory_entry &de, error_code &ec) { + explicit directory_iterator(const directory_entry &de, std::error_code &ec) { State = new detail::DirIterState; ec = detail::directory_iterator_construct(*State, de.path()); } @@ -832,7 +773,7 @@ class directory_iterator { directory_iterator() : State(nullptr) {} // No operator++ because we need error_code. 
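[editor's note] The mapped_file_region constructors now report failure through a std::error_code out-parameter. A hedged sketch; the readonly mapmode value and the const_data() accessor are assumed from the rest of the class, which this hunk does not show:

#include "llvm/Support/FileSystem.h"

static bool mapFirstBytes(const llvm::Twine &Path, uint64_t Length) {
  std::error_code EC;
  llvm::sys::fs::mapped_file_region Map(
      Path, llvm::sys::fs::mapped_file_region::readonly, Length,
      /*offset=*/0, EC);
  if (EC)
    return false;
  // Map.const_data() now points at Length bytes of the file.
  return true;
}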
- directory_iterator &increment(error_code &ec) { + directory_iterator &increment(std::error_code &ec) { ec = directory_iterator_increment(*State); return *this; } @@ -878,14 +819,14 @@ class recursive_directory_iterator { public: recursive_directory_iterator() {} - explicit recursive_directory_iterator(const Twine &path, error_code &ec) - : State(new detail::RecDirIterState) { + explicit recursive_directory_iterator(const Twine &path, std::error_code &ec) + : State(new detail::RecDirIterState) { State->Stack.push(directory_iterator(path, ec)); if (State->Stack.top() == directory_iterator()) State.reset(); } // No operator++ because we need error_code. - recursive_directory_iterator &increment(error_code &ec) { + recursive_directory_iterator &increment(std::error_code &ec) { const directory_iterator end_itr; if (State->HasNoPushRequest) @@ -934,7 +875,7 @@ class recursive_directory_iterator { assert(State->Level > 0 && "Cannot pop an iterator with level < 1"); const directory_iterator end_itr; - error_code ec; + std::error_code ec; do { if (ec) report_fatal_error("Error incrementing directory iterator."); diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h index a62801f16658..b713cc72e852 100644 --- a/include/llvm/Support/Format.h +++ b/include/llvm/Support/Format.h @@ -36,23 +36,23 @@ namespace llvm { -/// format_object_base - This is a helper class used for handling formatted -/// output. It is the abstract base class of a templated derived class. +/// This is a helper class used for handling formatted output. It is the +/// abstract base class of a templated derived class. class format_object_base { protected: const char *Fmt; virtual void home(); // Out of line virtual method. - /// snprint - Call snprintf() for this object, on the given buffer and size. + /// Call snprintf() for this object, on the given buffer and size. virtual int snprint(char *Buffer, unsigned BufferSize) const = 0; public: format_object_base(const char *fmt) : Fmt(fmt) {} virtual ~format_object_base() {} - /// print - Format the object into the specified buffer. On success, this - /// returns the length of the formatted string. If the buffer is too small, - /// this returns a length to retry with, which will be larger than BufferSize. + /// Format the object into the specified buffer. On success, this returns + /// the length of the formatted string. If the buffer is too small, this + /// returns a length to retry with, which will be larger than BufferSize. unsigned print(char *Buffer, unsigned BufferSize) const { assert(BufferSize && "Invalid buffer size!"); @@ -61,21 +61,23 @@ class format_object_base { // VC++ and old GlibC return negative on overflow, just double the size. if (N < 0) - return BufferSize*2; + return BufferSize * 2; - // Other impls yield number of bytes needed, not including the final '\0'. + // Other implementations yield number of bytes needed, not including the + // final '\0'. if (unsigned(N) >= BufferSize) - return N+1; + return N + 1; // Otherwise N is the length of output (not including the final '\0'). return N; } }; -/// format_object1 - This is a templated helper class used by the format -/// function that captures the object to be formated and the format string. When -/// actually printed, this synthesizes the string into a temporary buffer -/// provided and returns whether or not it is big enough. +/// These are templated helper classes used by the format function that +/// capture the object to be formated and the format string. 
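[editor's note] Directory iteration keeps its explicit increment(ec) form; only the error type changes. A sketch of the usual loop, assuming the iterator's equality comparison against a default-constructed end iterator:

#include "llvm/Support/FileSystem.h"

static unsigned countEntries(const llvm::Twine &Path) {
  unsigned N = 0;
  std::error_code EC;
  for (llvm::sys::fs::directory_iterator I(Path, EC), E; I != E && !EC;
       I.increment(EC))
    ++N; // I->path() names the current entry
  return N;
}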
When actually +/// printed, this synthesizes the string into a temporary buffer provided and +/// returns whether or not it is big enough. + template class format_object1 : public format_object_base { T Val; @@ -89,10 +91,6 @@ class format_object1 : public format_object_base { } }; -/// format_object2 - This is a templated helper class used by the format -/// function that captures the object to be formated and the format string. When -/// actually printed, this synthesizes the string into a temporary buffer -/// provided and returns whether or not it is big enough. template class format_object2 : public format_object_base { T1 Val1; @@ -107,10 +105,6 @@ class format_object2 : public format_object_base { } }; -/// format_object3 - This is a templated helper class used by the format -/// function that captures the object to be formated and the format string. When -/// actually printed, this synthesizes the string into a temporary buffer -/// provided and returns whether or not it is big enough. template class format_object3 : public format_object_base { T1 Val1; @@ -126,10 +120,6 @@ class format_object3 : public format_object_base { } }; -/// format_object4 - This is a templated helper class used by the format -/// function that captures the object to be formated and the format string. When -/// actually printed, this synthesizes the string into a temporary buffer -/// provided and returns whether or not it is big enough. template class format_object4 : public format_object_base { T1 Val1; @@ -147,10 +137,6 @@ class format_object4 : public format_object_base { } }; -/// format_object5 - This is a templated helper class used by the format -/// function that captures the object to be formated and the format string. When -/// actually printed, this synthesizes the string into a temporary buffer -/// provided and returns whether or not it is big enough. template class format_object5 : public format_object_base { T1 Val1; @@ -170,47 +156,52 @@ class format_object5 : public format_object_base { } }; -/// This is a helper function that is used to produce formatted output. +template +class format_object6 : public format_object_base { + T1 Val1; + T2 Val2; + T3 Val3; + T4 Val4; + T5 Val5; + T6 Val6; +public: + format_object6(const char *Fmt, const T1 &Val1, const T2 &Val2, + const T3 &Val3, const T4 &Val4, const T5 &Val5, const T6 &Val6) + : format_object_base(Fmt), Val1(Val1), Val2(Val2), Val3(Val3), Val4(Val4), + Val5(Val5), Val6(Val6) { } + + int snprint(char *Buffer, unsigned BufferSize) const override { + return snprintf(Buffer, BufferSize, Fmt, Val1, Val2, Val3, Val4, Val5, Val6); + } +}; + +/// These are helper functions used to produce formatted output. They use +/// template type deduction to construct the appropriate instance of the +/// format_object class to simplify their construction. /// /// This is typically used like: /// \code /// OS << format("%0.4f", myfloat) << '\n'; /// \endcode + template inline format_object1 format(const char *Fmt, const T &Val) { return format_object1(Fmt, Val); } -/// This is a helper function that is used to produce formatted output. -/// -/// This is typically used like: -/// \code -/// OS << format("%0.4f", myfloat) << '\n'; -/// \endcode template inline format_object2 format(const char *Fmt, const T1 &Val1, const T2 &Val2) { return format_object2(Fmt, Val1, Val2); } -/// This is a helper function that is used to produce formatted output. 
-/// -/// This is typically used like: -/// \code -/// OS << format("%0.4f", myfloat) << '\n'; -/// \endcode template inline format_object3 format(const char *Fmt, const T1 &Val1, const T2 &Val2, const T3 &Val3) { return format_object3(Fmt, Val1, Val2, Val3); } -/// This is a helper function that is used to produce formatted output. -/// -/// This is typically used like: -/// \code -/// OS << format("%0.4f", myfloat) << '\n'; -/// \endcode template inline format_object4 format(const char *Fmt, const T1 &Val1, const T2 &Val2, const T3 &Val3, @@ -218,12 +209,6 @@ inline format_object4 format(const char *Fmt, const T1 &Val1, return format_object4(Fmt, Val1, Val2, Val3, Val4); } -/// This is a helper function that is used to produce formatted output. -/// -/// This is typically used like: -/// \code -/// OS << format("%0.4f", myfloat) << '\n'; -/// \endcode template inline format_object5 format(const char *Fmt,const T1 &Val1, const T2 &Val2, const T3 &Val3, @@ -231,6 +216,15 @@ inline format_object5 format(const char *Fmt,const T1 &Val1, return format_object5(Fmt, Val1, Val2, Val3, Val4, Val5); } +template +inline format_object6 +format(const char *Fmt, const T1 &Val1, const T2 &Val2, const T3 &Val3, + const T4 &Val4, const T5 &Val5, const T6 &Val6) { + return format_object6(Fmt, Val1, Val2, Val3, Val4, + Val5, Val6); +} + } // end namespace llvm #endif diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h index 660c519ed64c..2f02aa7a1c19 100644 --- a/include/llvm/Support/GraphWriter.h +++ b/include/llvm/Support/GraphWriter.h @@ -50,7 +50,7 @@ namespace GraphProgram { }; } -void DisplayGraph(StringRef Filename, bool wait = true, +bool DisplayGraph(StringRef Filename, bool wait = true, GraphProgram::Name program = GraphProgram::DOT); template @@ -325,7 +325,10 @@ template std::string WriteGraph(const GraphType &G, const Twine &Name, bool ShortNames = false, const Twine &Title = "") { int FD; - std::string Filename = createGraphFilename(Name, FD); + // Windows can't always handle long paths, so limit the length of the name. + std::string N = Name.str(); + N = N.substr(0, std::min(N.size(), 140)); + std::string Filename = createGraphFilename(N, FD); raw_fd_ostream O(FD, /*shouldClose=*/ true); if (FD == -1) { diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h index 523a781b87d6..61c65dabae3c 100644 --- a/include/llvm/Support/LockFileManager.h +++ b/include/llvm/Support/LockFileManager.h @@ -12,11 +12,10 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/system_error.h" +#include #include // for std::pair namespace llvm { - /// \brief Class that manages the creation of a lock file to aid /// implicit coordination between different processes. /// @@ -56,7 +55,7 @@ class LockFileManager { SmallString<128> UniqueLockFileName; Optional > Owner; - Optional Error; + Optional Error; LockFileManager(const LockFileManager &) LLVM_DELETED_FUNCTION; LockFileManager &operator=(const LockFileManager &) LLVM_DELETED_FUNCTION; diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h index 2a0fc7bfe06a..60f7918ae72e 100644 --- a/include/llvm/Support/MachO.h +++ b/include/llvm/Support/MachO.h @@ -360,11 +360,28 @@ namespace llvm { enum { // Constant masks for the "n_desc" field in llvm::MachO::nlist and // llvm::MachO::nlist_64 + // The low 3 bits are the for the REFERENCE_TYPE. 
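[editor's note] The six-argument format() overload added above rounds out the existing family; a short usage sketch with arbitrary values:

#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

static void printRow(llvm::raw_ostream &OS) {
  OS << llvm::format("%-8s %4u %4u %4u %4u %#010x\n",
                     "row", 1u, 2u, 3u, 4u, 0xdeadbeefu);
}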
+ REFERENCE_TYPE = 0x7, + REFERENCE_FLAG_UNDEFINED_NON_LAZY = 0, + REFERENCE_FLAG_UNDEFINED_LAZY = 1, + REFERENCE_FLAG_DEFINED = 2, + REFERENCE_FLAG_PRIVATE_DEFINED = 3, + REFERENCE_FLAG_PRIVATE_UNDEFINED_NON_LAZY = 4, + REFERENCE_FLAG_PRIVATE_UNDEFINED_LAZY = 5, + // Flag bits (some overlap with the library ordinal bits). N_ARM_THUMB_DEF = 0x0008u, + REFERENCED_DYNAMICALLY = 0x0010u, N_NO_DEAD_STRIP = 0x0020u, N_WEAK_REF = 0x0040u, N_WEAK_DEF = 0x0080u, - N_SYMBOL_RESOLVER = 0x0100u + N_SYMBOL_RESOLVER = 0x0100u, + N_ALT_ENTRY = 0x0200u, + // For undefined symbols coming from libraries, see GET_LIBRARY_ORDINAL() + // as these are in the top 8 bits. + SELF_LIBRARY_ORDINAL = 0x0, + MAX_LIBRARY_ORDINAL = 0xfd, + DYNAMIC_LOOKUP_ORDINAL = 0xfe, + EXECUTABLE_ORDINAL = 0xff }; enum StabType { diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index f1f7b4feb580..6965faf8df8b 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -258,6 +258,12 @@ inline uint32_t Lo_32(uint64_t Value) { return static_cast(Value); } +/// Make_64 - This functions makes a 64-bit integer from a high / low pair of +/// 32-bit integers. +inline uint64_t Make_64(uint32_t High, uint32_t Low) { + return ((uint64_t)High << 32) | (uint64_t)Low; +} + /// isInt - Checks if an integer fits into the given bit width. template inline bool isInt(int64_t x) { diff --git a/include/llvm/Support/Memory.h b/include/llvm/Support/Memory.h index 0996adb23f94..b4305cb697d0 100644 --- a/include/llvm/Support/Memory.h +++ b/include/llvm/Support/Memory.h @@ -15,8 +15,8 @@ #define LLVM_SUPPORT_MEMORY_H #include "llvm/Support/DataTypes.h" -#include "llvm/Support/system_error.h" #include +#include namespace llvm { namespace sys { @@ -77,7 +77,7 @@ namespace sys { static MemoryBlock allocateMappedMemory(size_t NumBytes, const MemoryBlock *const NearBlock, unsigned Flags, - error_code &EC); + std::error_code &EC); /// This method releases a block of memory that was allocated with the /// allocateMappedMemory method. It should not be used to release any @@ -88,7 +88,7 @@ namespace sys { /// describing the failure if an error occurred. /// /// @brief Release mapped memory. - static error_code releaseMappedMemory(MemoryBlock &Block); + static std::error_code releaseMappedMemory(MemoryBlock &Block); /// This method sets the protection flags for a block of memory to the /// state specified by /p Flags. The behavior is not specified if the @@ -105,8 +105,8 @@ namespace sys { /// describing the failure if an error occurred. /// /// @brief Set memory protection state. - static error_code protectMappedMemory(const MemoryBlock &Block, - unsigned Flags); + static std::error_code protectMappedMemory(const MemoryBlock &Block, + unsigned Flags); /// This method allocates a block of Read/Write/Execute memory that is /// suitable for executing dynamically generated code (e.g. JIT). An diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h index 5810c47bebcb..8c742c686b15 100644 --- a/include/llvm/Support/MemoryBuffer.h +++ b/include/llvm/Support/MemoryBuffer.h @@ -20,11 +20,9 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include +#include namespace llvm { - -class error_code; - /// MemoryBuffer - This interface provides simple read-only access to a block /// of memory, and provides simple methods for reading files and standard input /// into a memory buffer. 
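[editor's note] Make_64 above is the inverse of the Hi_32/Lo_32 helpers that live alongside it in MathExtras.h. A quick sanity sketch:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void checkRoundTrip(uint64_t V) {
  // Splitting into halves and rejoining must be lossless.
  assert(llvm::Make_64(llvm::Hi_32(V), llvm::Lo_32(V)) == V);
}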
In addition to basic access to the characters in the @@ -70,11 +68,11 @@ class MemoryBuffer { /// \param IsVolatileSize Set to true to indicate that the file size may be /// changing, e.g. when libclang tries to parse while the user is /// editing/updating the file. - static error_code getFile(Twine Filename, - std::unique_ptr &Result, - int64_t FileSize = -1, - bool RequiresNullTerminator = true, - bool IsVolatileSize = false); + static std::error_code getFile(Twine Filename, + std::unique_ptr &Result, + int64_t FileSize = -1, + bool RequiresNullTerminator = true, + bool IsVolatileSize = false); /// Given an already-open file descriptor, map some slice of it into a /// MemoryBuffer. The slice is specified by an \p Offset and \p MapSize. @@ -83,10 +81,10 @@ class MemoryBuffer { /// \param IsVolatileSize Set to true to indicate that the file size may be /// changing, e.g. when libclang tries to parse while the user is /// editing/updating the file. - static error_code getOpenFileSlice(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t MapSize, int64_t Offset, - bool IsVolatileSize = false); + static std::error_code getOpenFileSlice(int FD, const char *Filename, + std::unique_ptr &Result, + uint64_t MapSize, int64_t Offset, + bool IsVolatileSize = false); /// Given an already-open file descriptor, read the file and return a /// MemoryBuffer. @@ -94,11 +92,11 @@ class MemoryBuffer { /// \param IsVolatileSize Set to true to indicate that the file size may be /// changing, e.g. when libclang tries to parse while the user is /// editing/updating the file. - static error_code getOpenFile(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, - bool RequiresNullTerminator = true, - bool IsVolatileSize = false); + static std::error_code getOpenFile(int FD, const char *Filename, + std::unique_ptr &Result, + uint64_t FileSize, + bool RequiresNullTerminator = true, + bool IsVolatileSize = false); /// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note /// that InputData must be null terminated if RequiresNullTerminator is true. @@ -127,15 +125,14 @@ class MemoryBuffer { /// getSTDIN - Read all of stdin into a file buffer, and return it. /// If an error occurs, this returns null and sets ec. - static error_code getSTDIN(std::unique_ptr &Result); - + static std::error_code getSTDIN(std::unique_ptr &Result); /// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin /// if the Filename is "-". If an error occurs, this returns null and sets /// ec. - static error_code getFileOrSTDIN(StringRef Filename, - std::unique_ptr &Result, - int64_t FileSize = -1); + static std::error_code getFileOrSTDIN(StringRef Filename, + std::unique_ptr &Result, + int64_t FileSize = -1); //===--------------------------------------------------------------------===// // Provided for performance analysis. diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h index 7f6441ea206a..4f98e4de8125 100644 --- a/include/llvm/Support/Process.h +++ b/include/llvm/Support/Process.h @@ -31,7 +31,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/TimeValue.h" -#include "llvm/Support/system_error.h" +#include namespace llvm { class StringRef; @@ -174,7 +174,7 @@ class Process { /// This function returns a SmallVector containing the arguments passed from /// the operating system to the program. This function expects to be handed /// the vector passed in from main. 
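[editor's note] The MemoryBuffer factory functions above keep their out-parameter style and only swap the error type. A sketch of the common read path:

#include "llvm/Support/MemoryBuffer.h"
#include <memory>

static std::error_code bufferLength(llvm::StringRef Filename, size_t &Len) {
  std::unique_ptr<llvm::MemoryBuffer> Buf;
  if (std::error_code EC = llvm::MemoryBuffer::getFileOrSTDIN(Filename, Buf))
    return EC;
  Len = Buf->getBufferSize();
  return std::error_code();
}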
- static error_code + static std::error_code GetArgumentVector(SmallVectorImpl &Args, ArrayRef ArgsFromMain, SpecificBumpPtrAllocator &ArgAllocator); diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h index 9160b7dd92b7..51279a9b864a 100644 --- a/include/llvm/Support/Program.h +++ b/include/llvm/Support/Program.h @@ -16,10 +16,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Path.h" -#include "llvm/Support/system_error.h" +#include namespace llvm { -class error_code; namespace sys { /// This is the OS-specific separator for PATH like environment variables: @@ -67,8 +66,8 @@ struct ProcessInfo { // These functions change the specified standard stream (stdin or stdout) to // binary mode. They return errc::success if the specified stream // was changed. Otherwise a platform dependent error is returned. - error_code ChangeStdinToBinary(); - error_code ChangeStdoutToBinary(); + std::error_code ChangeStdinToBinary(); + std::error_code ChangeStdoutToBinary(); /// This function executes the program using the arguments provided. The /// invoked program will inherit the stdin, stdout, and stderr file diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h index 39f896d65db1..30eccd99acbc 100644 --- a/include/llvm/Support/SourceMgr.h +++ b/include/llvm/Support/SourceMgr.h @@ -30,7 +30,7 @@ namespace llvm { class Twine; class raw_ostream; -/// SourceMgr - This owns the files read by a parser, handles include stacks, +/// This owns the files read by a parser, handles include stacks, /// and handles diagnostic wrangling. class SourceMgr { public: @@ -40,29 +40,27 @@ class SourceMgr { DK_Note }; - /// DiagHandlerTy - Clients that want to handle their own diagnostics in a - /// custom way can register a function pointer+context as a diagnostic - /// handler. It gets called each time PrintMessage is invoked. + /// Clients that want to handle their own diagnostics in a custom way can + /// register a function pointer+context as a diagnostic handler. + /// It gets called each time PrintMessage is invoked. typedef void (*DiagHandlerTy)(const SMDiagnostic &, void *Context); private: struct SrcBuffer { - /// Buffer - The memory buffer for the file. + /// The memory buffer for the file. MemoryBuffer *Buffer; - /// IncludeLoc - This is the location of the parent include, or null if at - /// the top level. + /// This is the location of the parent include, or null if at the top level. SMLoc IncludeLoc; }; - /// Buffers - This is all of the buffers that we are reading from. + /// This is all of the buffers that we are reading from. std::vector Buffers; - // IncludeDirectories - This is the list of directories we should search for - // include files in. + // This is the list of directories we should search for include files in. std::vector IncludeDirectories; - /// LineNoCache - This is a cache for line number queries, its implementation - /// is really private to SourceMgr.cpp. + /// This is a cache for line number queries, its implementation is really + /// private to SourceMgr.cpp. mutable void *LineNoCache; DiagHandlerTy DiagHandler; @@ -79,8 +77,8 @@ class SourceMgr { IncludeDirectories = Dirs; } - /// setDiagHandler - Specify a diagnostic handler to be invoked every time - /// PrintMessage is called. Ctx is passed into the handler when it is invoked. + /// Specify a diagnostic handler to be invoked every time PrintMessage is + /// called. \p Ctx is passed into the handler when it is invoked. 
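[editor's note] The stream helpers in Program.h now return std::error_code as well; a trivial sketch of guarding binary output:

#include "llvm/Support/Program.h"

static bool switchStdoutToBinary() {
  // A default-constructed (false) error_code means the mode change succeeded.
  return !llvm::sys::ChangeStdoutToBinary();
}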
void setDiagHandler(DiagHandlerTy DH, void *Ctx = nullptr) { DiagHandler = DH; DiagContext = Ctx; @@ -108,8 +106,8 @@ class SourceMgr { return Buffers[i].IncludeLoc; } - /// AddNewSourceBuffer - Add a new source buffer to this source manager. This - /// takes ownership of the memory buffer. + /// Add a new source buffer to this source manager. This takes ownership of + /// the memory buffer. size_t AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc) { SrcBuffer NB; NB.Buffer = F; @@ -118,32 +116,34 @@ class SourceMgr { return Buffers.size() - 1; } - /// AddIncludeFile - Search for a file with the specified name in the current - /// directory or in one of the IncludeDirs. If no file is found, this returns - /// ~0, otherwise it returns the buffer ID of the stacked file. - /// The full path to the included file can be found in IncludedFile. + /// Search for a file with the specified name in the current directory or in + /// one of the IncludeDirs. + /// + /// If no file is found, this returns ~0, otherwise it returns the buffer ID + /// of the stacked file. The full path to the included file can be found in + /// \p IncludedFile. size_t AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc, std::string &IncludedFile); - /// FindBufferContainingLoc - Return the ID of the buffer containing the - /// specified location, returning -1 if not found. + /// Return the ID of the buffer containing the specified location. + /// + /// -1 is returned if the buffer is not found. int FindBufferContainingLoc(SMLoc Loc) const; - /// FindLineNumber - Find the line number for the specified location in the - /// specified file. This is not a fast method. + /// Find the line number for the specified location in the specified file. + /// This is not a fast method. unsigned FindLineNumber(SMLoc Loc, int BufferID = -1) const { return getLineAndColumn(Loc, BufferID).first; } - /// getLineAndColumn - Find the line and column number for the specified - /// location in the specified file. This is not a fast method. + /// Find the line and column number for the specified location in the + /// specified file. This is not a fast method. std::pair getLineAndColumn(SMLoc Loc, int BufferID = -1) const; - /// PrintMessage - Emit a message about the specified location with the - /// specified string. + /// Emit a message about the specified location with the specified string. /// - /// @param ShowColors - Display colored messages if output is a terminal and + /// \param ShowColors Display colored messages if output is a terminal and /// the default error handler is used. void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind, const Twine &Msg, @@ -157,21 +157,28 @@ class SourceMgr { ArrayRef FixIts = None, bool ShowColors = true) const; - /// GetMessage - Return an SMDiagnostic at the specified location with the - /// specified string. + /// Emits a manually-constructed diagnostic to the given output stream. + /// + /// \param ShowColors Display colored messages if output is a terminal and + /// the default error handler is used. + void PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic, + bool ShowColors = true) const; + + /// Return an SMDiagnostic at the specified location with the specified + /// string. /// - /// @param Msg If non-null, the kind of message (e.g., "error") which is + /// \param Msg If non-null, the kind of message (e.g., "error") which is /// prefixed to the message. 
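[editor's note] SourceMgr's handler registration above is unchanged in behaviour; the comments were only reworded. A hedged sketch of plugging in a custom handler (handleDiag and "mytool" are illustrative names):

#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

static void handleDiag(const llvm::SMDiagnostic &D, void *Context) {
  // Forward to the standard caret printer on stderr.
  D.print("mytool", llvm::errs());
}

static void installHandler(llvm::SourceMgr &SM) {
  SM.setDiagHandler(handleDiag, /*Ctx=*/nullptr);
}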
SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg, ArrayRef Ranges = None, ArrayRef FixIts = None) const; - /// PrintIncludeStack - Prints the names of included files and the line of the - /// file they were included from. A diagnostic handler can use this before - /// printing its custom formatted message. + /// Prints the names of included files and the line of the file they were + /// included from. A diagnostic handler can use this before printing its + /// custom formatted message. /// - /// @param IncludeLoc - The line of the include. - /// @param OS the raw_ostream to print on. + /// \param IncludeLoc The location of the include. + /// \param OS the raw_ostream to print on. void PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const; }; @@ -208,8 +215,8 @@ class SMFixIt { }; -/// SMDiagnostic - Instances of this class encapsulate one diagnostic report, -/// allowing printing to a raw_ostream as a caret diagnostic. +/// Instances of this class encapsulate one diagnostic report, allowing +/// printing to a raw_ostream as a caret diagnostic. class SMDiagnostic { const SourceMgr *SM; SMLoc Loc; diff --git a/include/llvm/Support/SwapByteOrder.h b/include/llvm/Support/SwapByteOrder.h index e65f9cc0729d..340954f4c893 100644 --- a/include/llvm/Support/SwapByteOrder.h +++ b/include/llvm/Support/SwapByteOrder.h @@ -68,33 +68,38 @@ inline uint64_t SwapByteOrder_64(uint64_t value) { #endif } -inline unsigned char SwapByteOrder(unsigned char C) { return C; } -inline signed char SwapByteOrder(signed char C) { return C; } -inline char SwapByteOrder(char C) { return C; } +inline unsigned char getSwappedBytes(unsigned char C) { return C; } +inline signed char getSwappedBytes(signed char C) { return C; } +inline char getSwappedBytes(char C) { return C; } -inline unsigned short SwapByteOrder(unsigned short C) { return SwapByteOrder_16(C); } -inline signed short SwapByteOrder( signed short C) { return SwapByteOrder_16(C); } +inline unsigned short getSwappedBytes(unsigned short C) { return SwapByteOrder_16(C); } +inline signed short getSwappedBytes( signed short C) { return SwapByteOrder_16(C); } -inline unsigned int SwapByteOrder(unsigned int C) { return SwapByteOrder_32(C); } -inline signed int SwapByteOrder( signed int C) { return SwapByteOrder_32(C); } +inline unsigned int getSwappedBytes(unsigned int C) { return SwapByteOrder_32(C); } +inline signed int getSwappedBytes( signed int C) { return SwapByteOrder_32(C); } #if __LONG_MAX__ == __INT_MAX__ -inline unsigned long SwapByteOrder(unsigned long C) { return SwapByteOrder_32(C); } -inline signed long SwapByteOrder( signed long C) { return SwapByteOrder_32(C); } +inline unsigned long getSwappedBytes(unsigned long C) { return SwapByteOrder_32(C); } +inline signed long getSwappedBytes( signed long C) { return SwapByteOrder_32(C); } #elif __LONG_MAX__ == __LONG_LONG_MAX__ -inline unsigned long SwapByteOrder(unsigned long C) { return SwapByteOrder_64(C); } -inline signed long SwapByteOrder( signed long C) { return SwapByteOrder_64(C); } +inline unsigned long getSwappedBytes(unsigned long C) { return SwapByteOrder_64(C); } +inline signed long getSwappedBytes( signed long C) { return SwapByteOrder_64(C); } #else #error "Unknown long size!" 
#endif -inline unsigned long long SwapByteOrder(unsigned long long C) { +inline unsigned long long getSwappedBytes(unsigned long long C) { return SwapByteOrder_64(C); } -inline signed long long SwapByteOrder(signed long long C) { +inline signed long long getSwappedBytes(signed long long C) { return SwapByteOrder_64(C); } +template +inline void swapByteOrder(T &Value) { + Value = getSwappedBytes(Value); +} + } // end namespace sys } // end namespace llvm diff --git a/include/llvm/Support/WindowsError.h b/include/llvm/Support/WindowsError.h new file mode 100644 index 000000000000..0e909a05bf4a --- /dev/null +++ b/include/llvm/Support/WindowsError.h @@ -0,0 +1,19 @@ +//===-- WindowsError.h - Support for mapping windows errors to posix-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_WINDOWS_ERROR_H +#define LLVM_SUPPORT_WINDOWS_ERROR_H + +#include + +namespace llvm { +std::error_code mapWindowsError(unsigned EV); +} + +#endif diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 7f4a92293549..a23faf65bb59 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -24,7 +24,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" +#include namespace llvm { namespace yaml { @@ -487,6 +487,19 @@ class IO { } } + template + void maskedBitSetCase(T &Val, const char *Str, T ConstVal, T Mask) { + if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal)) + Val = Val | ConstVal; + } + + template + void maskedBitSetCase(T &Val, const char *Str, uint32_t ConstVal, + uint32_t Mask) { + if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal)) + Val = Val | ConstVal; + } + void *getContext(); void setContext(void *); @@ -867,7 +880,7 @@ class Input : public IO { ~Input(); // Check if there was an syntax or semantic error during parsing. - llvm::error_code error(); + std::error_code error(); private: bool outputting() override; @@ -969,13 +982,13 @@ class Input : public IO { // These are only used by operator>>. They could be private // if those templated things could be made friends. bool setCurrentDocument(); - void nextDocument(); + bool nextDocument(); private: llvm::SourceMgr SrcMgr; // must be before Strm std::unique_ptr Strm; std::unique_ptr TopNode; - llvm::error_code EC; + std::error_code EC; llvm::BumpPtrAllocator StringAllocator; llvm::yaml::document_iterator DocIterator; std::vector BitValuesUsed; diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h deleted file mode 100644 index aa5e9f710ae4..000000000000 --- a/include/llvm/Support/system_error.h +++ /dev/null @@ -1,901 +0,0 @@ -//===---------------------------- system_error ------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This was lifted from libc++ and modified for C++03. This is called -// system_error even though it does not define that class because that's what -// it's called in C++0x. 
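[editor's note] The rename above splits the old SwapByteOrder entry point into getSwappedBytes (value-returning) and swapByteOrder (in-place). A small sketch:

#include "llvm/Support/SwapByteOrder.h"
#include <cstdint>

static void demoSwap() {
  uint32_t V = 0x12345678u;
  uint32_t W = llvm::sys::getSwappedBytes(V); // W == 0x78563412
  llvm::sys::swapByteOrder(V);                // V == 0x78563412 as well
  (void)W;
}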
We don't define system_error because it is only used -// for exception handling, which we don't use in LLVM. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_SUPPORT_SYSTEM_ERROR_H -#define LLVM_SUPPORT_SYSTEM_ERROR_H - -#include "llvm/Support/Compiler.h" - -/* - system_error synopsis - -namespace std -{ - -class error_category -{ -public: - virtual ~error_category(); - - error_category(const error_category&) = delete; - error_category& operator=(const error_category&) = delete; - - virtual const char* name() const = 0; - virtual error_condition default_error_condition(int ev) const; - virtual bool equivalent(int code, const error_condition& condition) const; - virtual bool equivalent(const error_code& code, int condition) const; - virtual std::string message(int ev) const = 0; - - bool operator==(const error_category& rhs) const; - bool operator!=(const error_category& rhs) const; - bool operator<(const error_category& rhs) const; -}; - -const error_category& generic_category(); -const error_category& system_category(); - -template struct is_error_code_enum - : public std::false_type {}; - -template struct is_error_condition_enum - : public std::false_type {}; - -class error_code -{ -public: - // constructors: - error_code(); - error_code(int val, const error_category& cat); - template - error_code(ErrorCodeEnum e); - - // modifiers: - void assign(int val, const error_category& cat); - template - error_code& operator=(ErrorCodeEnum e); - void clear(); - - // observers: - int value() const; - const error_category& category() const; - error_condition default_error_condition() const; - std::string message() const; - explicit operator bool() const; -}; - -// non-member functions: -bool operator<(const error_code& lhs, const error_code& rhs); -template - basic_ostream& - operator<<(basic_ostream& os, const error_code& ec); - -class error_condition -{ -public: - // constructors: - error_condition(); - error_condition(int val, const error_category& cat); - template - error_condition(ErrorConditionEnum e); - - // modifiers: - void assign(int val, const error_category& cat); - template - error_condition& operator=(ErrorConditionEnum e); - void clear(); - - // observers: - int value() const; - const error_category& category() const; - std::string message() const; - explicit operator bool() const; -}; - -bool operator<(const error_condition& lhs, const error_condition& rhs); - -class system_error - : public runtime_error -{ -public: - system_error(error_code ec, const std::string& what_arg); - system_error(error_code ec, const char* what_arg); - system_error(error_code ec); - system_error(int ev, const error_category& ecat, const std::string& what_arg); - system_error(int ev, const error_category& ecat, const char* what_arg); - system_error(int ev, const error_category& ecat); - - const error_code& code() const throw(); - const char* what() const throw(); -}; - -enum class errc -{ - address_family_not_supported, // EAFNOSUPPORT - address_in_use, // EADDRINUSE - address_not_available, // EADDRNOTAVAIL - already_connected, // EISCONN - argument_list_too_long, // E2BIG - argument_out_of_domain, // EDOM - bad_address, // EFAULT - bad_file_descriptor, // EBADF - bad_message, // EBADMSG - broken_pipe, // EPIPE - connection_aborted, // ECONNABORTED - connection_already_in_progress, // EALREADY - connection_refused, // ECONNREFUSED - connection_reset, // ECONNRESET - cross_device_link, // EXDEV - destination_address_required, // EDESTADDRREQ - 
device_or_resource_busy, // EBUSY - directory_not_empty, // ENOTEMPTY - executable_format_error, // ENOEXEC - file_exists, // EEXIST - file_too_large, // EFBIG - filename_too_long, // ENAMETOOLONG - function_not_supported, // ENOSYS - host_unreachable, // EHOSTUNREACH - identifier_removed, // EIDRM - illegal_byte_sequence, // EILSEQ - inappropriate_io_control_operation, // ENOTTY - interrupted, // EINTR - invalid_argument, // EINVAL - invalid_seek, // ESPIPE - io_error, // EIO - is_a_directory, // EISDIR - message_size, // EMSGSIZE - network_down, // ENETDOWN - network_reset, // ENETRESET - network_unreachable, // ENETUNREACH - no_buffer_space, // ENOBUFS - no_child_process, // ECHILD - no_link, // ENOLINK - no_lock_available, // ENOLCK - no_message_available, // ENODATA - no_message, // ENOMSG - no_protocol_option, // ENOPROTOOPT - no_space_on_device, // ENOSPC - no_stream_resources, // ENOSR - no_such_device_or_address, // ENXIO - no_such_device, // ENODEV - no_such_file_or_directory, // ENOENT - no_such_process, // ESRCH - not_a_directory, // ENOTDIR - not_a_socket, // ENOTSOCK - not_a_stream, // ENOSTR - not_connected, // ENOTCONN - not_enough_memory, // ENOMEM - not_supported, // ENOTSUP - operation_canceled, // ECANCELED - operation_in_progress, // EINPROGRESS - operation_not_permitted, // EPERM - operation_not_supported, // EOPNOTSUPP - operation_would_block, // EWOULDBLOCK - owner_dead, // EOWNERDEAD - permission_denied, // EACCES - protocol_error, // EPROTO - protocol_not_supported, // EPROTONOSUPPORT - read_only_file_system, // EROFS - resource_deadlock_would_occur, // EDEADLK - resource_unavailable_try_again, // EAGAIN - result_out_of_range, // ERANGE - state_not_recoverable, // ENOTRECOVERABLE - stream_timeout, // ETIME - text_file_busy, // ETXTBSY - timed_out, // ETIMEDOUT - too_many_files_open_in_system, // ENFILE - too_many_files_open, // EMFILE - too_many_links, // EMLINK - too_many_symbolic_link_levels, // ELOOP - value_too_large, // EOVERFLOW - wrong_protocol_type // EPROTOTYPE -}; - -template <> struct is_error_condition_enum : std::true_type { } - -error_code make_error_code(errc e); -error_condition make_error_condition(errc e); - -// Comparison operators: -bool operator==(const error_code& lhs, const error_code& rhs); -bool operator==(const error_code& lhs, const error_condition& rhs); -bool operator==(const error_condition& lhs, const error_code& rhs); -bool operator==(const error_condition& lhs, const error_condition& rhs); -bool operator!=(const error_code& lhs, const error_code& rhs); -bool operator!=(const error_code& lhs, const error_condition& rhs); -bool operator!=(const error_condition& lhs, const error_code& rhs); -bool operator!=(const error_condition& lhs, const error_condition& rhs); - -template <> struct hash; - -} // std - -*/ - -#include "llvm/Config/llvm-config.h" -#include -#include - -// This must be here instead of a .inc file because it is used in the definition -// of the enum values below. -#ifdef LLVM_ON_WIN32 - - // The following numbers were taken from VS2010. 
-# ifndef EAFNOSUPPORT -# define EAFNOSUPPORT 102 -# endif -# ifndef EADDRINUSE -# define EADDRINUSE 100 -# endif -# ifndef EADDRNOTAVAIL -# define EADDRNOTAVAIL 101 -# endif -# ifndef EISCONN -# define EISCONN 113 -# endif -# ifndef E2BIG -# define E2BIG 7 -# endif -# ifndef EDOM -# define EDOM 33 -# endif -# ifndef EFAULT -# define EFAULT 14 -# endif -# ifndef EBADF -# define EBADF 9 -# endif -# ifndef EBADMSG -# define EBADMSG 104 -# endif -# ifndef EPIPE -# define EPIPE 32 -# endif -# ifndef ECONNABORTED -# define ECONNABORTED 106 -# endif -# ifndef EALREADY -# define EALREADY 103 -# endif -# ifndef ECONNREFUSED -# define ECONNREFUSED 107 -# endif -# ifndef ECONNRESET -# define ECONNRESET 108 -# endif -# ifndef EXDEV -# define EXDEV 18 -# endif -# ifndef EDESTADDRREQ -# define EDESTADDRREQ 109 -# endif -# ifndef EBUSY -# define EBUSY 16 -# endif -# ifndef ENOTEMPTY -# define ENOTEMPTY 41 -# endif -# ifndef ENOEXEC -# define ENOEXEC 8 -# endif -# ifndef EEXIST -# define EEXIST 17 -# endif -# ifndef EFBIG -# define EFBIG 27 -# endif -# ifndef ENAMETOOLONG -# define ENAMETOOLONG 38 -# endif -# ifndef ENOSYS -# define ENOSYS 40 -# endif -# ifndef EHOSTUNREACH -# define EHOSTUNREACH 110 -# endif -# ifndef EIDRM -# define EIDRM 111 -# endif -# ifndef EILSEQ -# define EILSEQ 42 -# endif -# ifndef ENOTTY -# define ENOTTY 25 -# endif -# ifndef EINTR -# define EINTR 4 -# endif -# ifndef EINVAL -# define EINVAL 22 -# endif -# ifndef ESPIPE -# define ESPIPE 29 -# endif -# ifndef EIO -# define EIO 5 -# endif -# ifndef EISDIR -# define EISDIR 21 -# endif -# ifndef EMSGSIZE -# define EMSGSIZE 115 -# endif -# ifndef ENETDOWN -# define ENETDOWN 116 -# endif -# ifndef ENETRESET -# define ENETRESET 117 -# endif -# ifndef ENETUNREACH -# define ENETUNREACH 118 -# endif -# ifndef ENOBUFS -# define ENOBUFS 119 -# endif -# ifndef ECHILD -# define ECHILD 10 -# endif -# ifndef ENOLINK -# define ENOLINK 121 -# endif -# ifndef ENOLCK -# define ENOLCK 39 -# endif -# ifndef ENODATA -# define ENODATA 120 -# endif -# ifndef ENOMSG -# define ENOMSG 122 -# endif -# ifndef ENOPROTOOPT -# define ENOPROTOOPT 123 -# endif -# ifndef ENOSPC -# define ENOSPC 28 -# endif -# ifndef ENOSR -# define ENOSR 124 -# endif -# ifndef ENXIO -# define ENXIO 6 -# endif -# ifndef ENODEV -# define ENODEV 19 -# endif -# ifndef ENOENT -# define ENOENT 2 -# endif -# ifndef ESRCH -# define ESRCH 3 -# endif -# ifndef ENOTDIR -# define ENOTDIR 20 -# endif -# ifndef ENOTSOCK -# define ENOTSOCK 128 -# endif -# ifndef ENOSTR -# define ENOSTR 125 -# endif -# ifndef ENOTCONN -# define ENOTCONN 126 -# endif -# ifndef ENOMEM -# define ENOMEM 12 -# endif -# ifndef ENOTSUP -# define ENOTSUP 129 -# endif -# ifndef ECANCELED -# define ECANCELED 105 -# endif -# ifndef EINPROGRESS -# define EINPROGRESS 112 -# endif -# ifndef EPERM -# define EPERM 1 -# endif -# ifndef EOPNOTSUPP -# define EOPNOTSUPP 130 -# endif -# ifndef EWOULDBLOCK -# define EWOULDBLOCK 140 -# endif -# ifndef EOWNERDEAD -# define EOWNERDEAD 133 -# endif -# ifndef EACCES -# define EACCES 13 -# endif -# ifndef EPROTO -# define EPROTO 134 -# endif -# ifndef EPROTONOSUPPORT -# define EPROTONOSUPPORT 135 -# endif -# ifndef EROFS -# define EROFS 30 -# endif -# ifndef EDEADLK -# define EDEADLK 36 -# endif -# ifndef EAGAIN -# define EAGAIN 11 -# endif -# ifndef ERANGE -# define ERANGE 34 -# endif -# ifndef ENOTRECOVERABLE -# define ENOTRECOVERABLE 127 -# endif -# ifndef ETIME -# define ETIME 137 -# endif -# ifndef ETXTBSY -# define ETXTBSY 139 -# endif -# ifndef ETIMEDOUT -# define ETIMEDOUT 138 -# 
endif -# ifndef ENFILE -# define ENFILE 23 -# endif -# ifndef EMFILE -# define EMFILE 24 -# endif -# ifndef EMLINK -# define EMLINK 31 -# endif -# ifndef ELOOP -# define ELOOP 114 -# endif -# ifndef EOVERFLOW -# define EOVERFLOW 132 -# endif -# ifndef EPROTOTYPE -# define EPROTOTYPE 136 -# endif -#endif - -namespace llvm { - -// is_error_code_enum - -template struct is_error_code_enum : public std::false_type {}; - -// is_error_condition_enum - -template struct is_error_condition_enum : public std::false_type {}; - -// Some error codes are not present on all platforms, so we provide equivalents -// for them: - -//enum class errc -struct errc { -enum _ { - success = 0, - address_family_not_supported = EAFNOSUPPORT, - address_in_use = EADDRINUSE, - address_not_available = EADDRNOTAVAIL, - already_connected = EISCONN, - argument_list_too_long = E2BIG, - argument_out_of_domain = EDOM, - bad_address = EFAULT, - bad_file_descriptor = EBADF, -#ifdef EBADMSG - bad_message = EBADMSG, -#else - bad_message = EINVAL, -#endif - broken_pipe = EPIPE, - connection_aborted = ECONNABORTED, - connection_already_in_progress = EALREADY, - connection_refused = ECONNREFUSED, - connection_reset = ECONNRESET, - cross_device_link = EXDEV, - destination_address_required = EDESTADDRREQ, - device_or_resource_busy = EBUSY, - directory_not_empty = ENOTEMPTY, - executable_format_error = ENOEXEC, - file_exists = EEXIST, - file_too_large = EFBIG, - filename_too_long = ENAMETOOLONG, - function_not_supported = ENOSYS, - host_unreachable = EHOSTUNREACH, - identifier_removed = EIDRM, - illegal_byte_sequence = EILSEQ, - inappropriate_io_control_operation = ENOTTY, - interrupted = EINTR, - invalid_argument = EINVAL, - invalid_seek = ESPIPE, - io_error = EIO, - is_a_directory = EISDIR, - message_size = EMSGSIZE, - network_down = ENETDOWN, - network_reset = ENETRESET, - network_unreachable = ENETUNREACH, - no_buffer_space = ENOBUFS, - no_child_process = ECHILD, -#ifdef ENOLINK - no_link = ENOLINK, -#else - no_link = EINVAL, -#endif - no_lock_available = ENOLCK, -#ifdef ENODATA - no_message_available = ENODATA, -#else - no_message_available = ENOMSG, -#endif - no_message = ENOMSG, - no_protocol_option = ENOPROTOOPT, - no_space_on_device = ENOSPC, -#ifdef ENOSR - no_stream_resources = ENOSR, -#else - no_stream_resources = ENOMEM, -#endif - no_such_device_or_address = ENXIO, - no_such_device = ENODEV, - no_such_file_or_directory = ENOENT, - no_such_process = ESRCH, - not_a_directory = ENOTDIR, - not_a_socket = ENOTSOCK, -#ifdef ENOSTR - not_a_stream = ENOSTR, -#else - not_a_stream = EINVAL, -#endif - not_connected = ENOTCONN, - not_enough_memory = ENOMEM, - not_supported = ENOTSUP, -#ifdef ECANCELED - operation_canceled = ECANCELED, -#else - operation_canceled = EINVAL, -#endif - operation_in_progress = EINPROGRESS, - operation_not_permitted = EPERM, - operation_not_supported = EOPNOTSUPP, - operation_would_block = EWOULDBLOCK, -#ifdef EOWNERDEAD - owner_dead = EOWNERDEAD, -#else - owner_dead = EINVAL, -#endif - permission_denied = EACCES, -#ifdef EPROTO - protocol_error = EPROTO, -#else - protocol_error = EINVAL, -#endif - protocol_not_supported = EPROTONOSUPPORT, - read_only_file_system = EROFS, - resource_deadlock_would_occur = EDEADLK, - resource_unavailable_try_again = EAGAIN, - result_out_of_range = ERANGE, -#ifdef ENOTRECOVERABLE - state_not_recoverable = ENOTRECOVERABLE, -#else - state_not_recoverable = EINVAL, -#endif -#ifdef ETIME - stream_timeout = ETIME, -#else - stream_timeout = ETIMEDOUT, -#endif - text_file_busy = 
ETXTBSY, - timed_out = ETIMEDOUT, - too_many_files_open_in_system = ENFILE, - too_many_files_open = EMFILE, - too_many_links = EMLINK, - too_many_symbolic_link_levels = ELOOP, - value_too_large = EOVERFLOW, - wrong_protocol_type = EPROTOTYPE -}; - - _ v_; - - errc(_ v) : v_(v) {} - operator int() const {return v_;} -}; - -template <> struct is_error_condition_enum : std::true_type { }; - -template <> struct is_error_condition_enum : std::true_type { }; - -class error_condition; -class error_code; - -// class error_category - -class _do_message; - -class error_category -{ -public: - virtual ~error_category(); - - error_category(); -private: - error_category(const error_category&) LLVM_DELETED_FUNCTION; - error_category& operator=(const error_category&) LLVM_DELETED_FUNCTION; - -public: - virtual const char* name() const = 0; - virtual error_condition default_error_condition(int _ev) const; - virtual bool equivalent(int _code, const error_condition& _condition) const; - virtual bool equivalent(const error_code& _code, int _condition) const; - virtual std::string message(int _ev) const = 0; - - bool operator==(const error_category& _rhs) const {return this == &_rhs;} - - bool operator!=(const error_category& _rhs) const {return !(*this == _rhs);} - - bool operator< (const error_category& _rhs) const {return this < &_rhs;} - - friend class _do_message; -}; - -class _do_message : public error_category -{ -public: - std::string message(int ev) const override; -}; - -const error_category& generic_category(); -const error_category& system_category(); - -/// Get the error_category used for errno values from POSIX functions. This is -/// the same as the system_category on POSIX systems, but is the same as the -/// generic_category on Windows. -const error_category& posix_category(); - -class error_condition -{ - int _val_; - const error_category* _cat_; -public: - error_condition() : _val_(0), _cat_(&generic_category()) {} - - error_condition(int _val, const error_category& _cat) - : _val_(_val), _cat_(&_cat) {} - - template - error_condition(E _e, typename std::enable_if< - is_error_condition_enum::value - >::type* = 0) - {*this = make_error_condition(_e);} - - void assign(int _val, const error_category& _cat) { - _val_ = _val; - _cat_ = &_cat; - } - - template - typename std::enable_if::value, - error_condition &>::type - operator=(E _e) { - *this = make_error_condition(_e); - return *this; - } - - void clear() { - _val_ = 0; - _cat_ = &generic_category(); - } - - int value() const {return _val_;} - - const error_category& category() const {return *_cat_;} - std::string message() const; - - typedef void (*unspecified_bool_type)(); - static void unspecified_bool_true() {} - - operator unspecified_bool_type() const { // true if error - return _val_ == 0 ? 
nullptr : unspecified_bool_true; - } -}; - -inline error_condition make_error_condition(errc _e) { - return error_condition(static_cast(_e), generic_category()); -} - -inline bool operator<(const error_condition& _x, const error_condition& _y) { - return _x.category() < _y.category() - || (_x.category() == _y.category() && _x.value() < _y.value()); -} - -// error_code - -class error_code { - int _val_; - const error_category* _cat_; -public: - error_code() : _val_(0), _cat_(&system_category()) {} - - static error_code success() { - return error_code(); - } - - error_code(int _val, const error_category& _cat) - : _val_(_val), _cat_(&_cat) {} - - template - error_code(E _e, typename std::enable_if< - is_error_code_enum::value - >::type* = 0) { - *this = make_error_code(_e); - } - - void assign(int _val, const error_category& _cat) { - _val_ = _val; - _cat_ = &_cat; - } - - template - typename std::enable_if::value, error_code &>::type - operator=(E _e) { - *this = make_error_code(_e); - return *this; - } - - void clear() { - _val_ = 0; - _cat_ = &system_category(); - } - - int value() const {return _val_;} - - const error_category& category() const {return *_cat_;} - - error_condition default_error_condition() const - {return _cat_->default_error_condition(_val_);} - - std::string message() const; - - typedef void (*unspecified_bool_type)(); - static void unspecified_bool_true() {} - - operator unspecified_bool_type() const { // true if error - return _val_ == 0 ? nullptr : unspecified_bool_true; - } -}; - -inline error_code make_error_code(errc _e) { - return error_code(static_cast(_e), generic_category()); -} - -inline bool operator<(const error_code& _x, const error_code& _y) { - return _x.category() < _y.category() - || (_x.category() == _y.category() && _x.value() < _y.value()); -} - -inline bool operator==(const error_code& _x, const error_code& _y) { - return _x.category() == _y.category() && _x.value() == _y.value(); -} - -inline bool operator==(const error_code& _x, const error_condition& _y) { - return _x.category().equivalent(_x.value(), _y) - || _y.category().equivalent(_x, _y.value()); -} - -inline bool operator==(const error_condition& _x, const error_code& _y) { - return _y == _x; -} - -inline bool operator==(const error_condition& _x, const error_condition& _y) { - return _x.category() == _y.category() && _x.value() == _y.value(); -} - -inline bool operator!=(const error_code& _x, const error_code& _y) { - return !(_x == _y); -} - -inline bool operator!=(const error_code& _x, const error_condition& _y) { - return !(_x == _y); -} - -inline bool operator!=(const error_condition& _x, const error_code& _y) { - return !(_x == _y); -} - -inline bool operator!=(const error_condition& _x, const error_condition& _y) { - return !(_x == _y); -} - -// Windows errors. - -// To construct an error_code after an API error: -// -// error_code( ::GetLastError(), system_category() ) -struct windows_error { -enum _ { - success = 0, - // These names and values are based on Windows WinError.h - // This is not a complete list. Add to this list if you need to explicitly - // check for it. 
- invalid_function = 1, // ERROR_INVALID_FUNCTION, - file_not_found = 2, // ERROR_FILE_NOT_FOUND, - path_not_found = 3, // ERROR_PATH_NOT_FOUND, - too_many_open_files = 4, // ERROR_TOO_MANY_OPEN_FILES, - access_denied = 5, // ERROR_ACCESS_DENIED, - invalid_handle = 6, // ERROR_INVALID_HANDLE, - arena_trashed = 7, // ERROR_ARENA_TRASHED, - not_enough_memory = 8, // ERROR_NOT_ENOUGH_MEMORY, - invalid_block = 9, // ERROR_INVALID_BLOCK, - bad_environment = 10, // ERROR_BAD_ENVIRONMENT, - bad_format = 11, // ERROR_BAD_FORMAT, - invalid_access = 12, // ERROR_INVALID_ACCESS, - outofmemory = 14, // ERROR_OUTOFMEMORY, - invalid_drive = 15, // ERROR_INVALID_DRIVE, - current_directory = 16, // ERROR_CURRENT_DIRECTORY, - not_same_device = 17, // ERROR_NOT_SAME_DEVICE, - no_more_files = 18, // ERROR_NO_MORE_FILES, - write_protect = 19, // ERROR_WRITE_PROTECT, - bad_unit = 20, // ERROR_BAD_UNIT, - not_ready = 21, // ERROR_NOT_READY, - bad_command = 22, // ERROR_BAD_COMMAND, - crc = 23, // ERROR_CRC, - bad_length = 24, // ERROR_BAD_LENGTH, - seek = 25, // ERROR_SEEK, - not_dos_disk = 26, // ERROR_NOT_DOS_DISK, - sector_not_found = 27, // ERROR_SECTOR_NOT_FOUND, - out_of_paper = 28, // ERROR_OUT_OF_PAPER, - write_fault = 29, // ERROR_WRITE_FAULT, - read_fault = 30, // ERROR_READ_FAULT, - gen_failure = 31, // ERROR_GEN_FAILURE, - sharing_violation = 32, // ERROR_SHARING_VIOLATION, - lock_violation = 33, // ERROR_LOCK_VIOLATION, - wrong_disk = 34, // ERROR_WRONG_DISK, - sharing_buffer_exceeded = 36, // ERROR_SHARING_BUFFER_EXCEEDED, - handle_eof = 38, // ERROR_HANDLE_EOF, - handle_disk_full = 39, // ERROR_HANDLE_DISK_FULL, - rem_not_list = 51, // ERROR_REM_NOT_LIST, - dup_name = 52, // ERROR_DUP_NAME, - bad_net_path = 53, // ERROR_BAD_NETPATH, - network_busy = 54, // ERROR_NETWORK_BUSY, - file_exists = 80, // ERROR_FILE_EXISTS, - cannot_make = 82, // ERROR_CANNOT_MAKE, - broken_pipe = 109, // ERROR_BROKEN_PIPE, - open_failed = 110, // ERROR_OPEN_FAILED, - buffer_overflow = 111, // ERROR_BUFFER_OVERFLOW, - disk_full = 112, // ERROR_DISK_FULL, - insufficient_buffer = 122, // ERROR_INSUFFICIENT_BUFFER, - lock_failed = 167, // ERROR_LOCK_FAILED, - busy = 170, // ERROR_BUSY, - cancel_violation = 173, // ERROR_CANCEL_VIOLATION, - already_exists = 183 // ERROR_ALREADY_EXISTS -}; - _ v_; - - windows_error(_ v) : v_(v) {} - explicit windows_error(int v) : v_(_(v)) {} - operator int() const {return v_;} -}; - - -template <> struct is_error_code_enum : std::true_type { }; - -template <> struct is_error_code_enum : std::true_type { }; - -inline error_code make_error_code(windows_error e) { - return error_code(static_cast(e), system_category()); -} - -} // end namespace llvm - -#endif diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index ed1c4c90f82a..36464d75d5ab 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -198,16 +198,16 @@ class BitsRecTy : public RecTy { Init *convertValue(VarBitInit *VB) override { return nullptr; } Init *convertValue( DefInit *DI) override { return nullptr; } Init *convertValue( DagInit *DI) override { return nullptr; } - Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} + Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} - Init *convertValue( TernOpInit *UI) override {return RecTy::convertValue(UI);} + Init *convertValue(TernOpInit *UI) override { return RecTy::convertValue(UI);} Init 
*convertValue( TypedInit *TI) override; - Init *convertValue( VarInit *VI) override{ return RecTy::convertValue(VI);} - Init *convertValue( FieldInit *FI) override{ return RecTy::convertValue(FI);} + Init *convertValue( VarInit *VI) override { return RecTy::convertValue(VI);} + Init *convertValue( FieldInit *FI) override { return RecTy::convertValue(FI);} std::string getAsString() const override; - bool typeIsConvertibleTo(const RecTy *RHS) const override{ + bool typeIsConvertibleTo(const RecTy *RHS) const override { return RHS->baseClassOf(this); } bool baseClassOf(const RecTy*) const override; @@ -313,16 +313,16 @@ class ListRecTy : public RecTy { Init *convertValue(VarBitInit *VB) override { return nullptr; } Init *convertValue( DefInit *DI) override { return nullptr; } Init *convertValue( DagInit *DI) override { return nullptr; } - Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} + Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} - Init *convertValue( TernOpInit *UI) override{ return RecTy::convertValue(UI);} + Init *convertValue(TernOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( TypedInit *TI) override; Init *convertValue( VarInit *VI) override { return RecTy::convertValue(VI);} Init *convertValue( FieldInit *FI) override { return RecTy::convertValue(FI);} std::string getAsString() const override; - bool typeIsConvertibleTo(const RecTy *RHS) const override{ + bool typeIsConvertibleTo(const RecTy *RHS) const override { return RHS->baseClassOf(this); } @@ -360,7 +360,7 @@ class DagRecTy : public RecTy { std::string getAsString() const override { return "dag"; } - bool typeIsConvertibleTo(const RecTy *RHS) const override{ + bool typeIsConvertibleTo(const RecTy *RHS) const override { return RHS->baseClassOf(this); } }; diff --git a/utils/TableGen/SetTheory.h b/include/llvm/TableGen/SetTheory.h similarity index 100% rename from utils/TableGen/SetTheory.h rename to include/llvm/TableGen/SetTheory.h diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index 7d1f19c477cf..f77cc7a35eb8 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -587,6 +587,11 @@ class Operand : DAGOperand { string OperandType = "OPERAND_UNKNOWN"; dag MIOperandInfo = (ops); + // MCOperandPredicate - Optionally, a code fragment operating on + // const MCOperand &MCOp, and returning a bool, to indicate if + // the value of MCOp is valid for the specific subclass of Operand + code MCOperandPredicate; + // ParserMatchClass - The "match class" that operands of this type fit // in. Match classes are used to define the order in which instructions are // match, to ensure that which instructions gets matched is deterministic. 
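The MCOperandPredicate body added above is ordinary C++ operating on a const MCOperand &MCOp and returning bool. Purely as an illustration (not taken from this patch), an Operand subclass for an unsigned 5-bit immediate might supply a fragment along these lines; the name isAssumedUImm5 is hypothetical.

// Sketch only: the kind of fragment a target might store in MCOperandPredicate.
// "isAssumedUImm5" is a hypothetical name, not part of this patch.
#include "llvm/MC/MCInst.h"

static bool isAssumedUImm5(const llvm::MCOperand &MCOp) {
  // Accept only immediates that fit an unsigned 5-bit field.
  return MCOp.isImm() && MCOp.getImm() >= 0 && MCOp.getImm() < 32;
}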
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 165b35fee2b6..87e7c14ac1e8 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -29,6 +29,7 @@ class MachineRegisterInfo; class MDNode; class MCInst; class MCSchedModel; +class MCSymbolRefExpr; class SDNode; class ScheduleHazardRecognizer; class SelectionDAG; @@ -36,6 +37,7 @@ class ScheduleDAG; class TargetRegisterClass; class TargetRegisterInfo; class BranchProbability; +class TargetSubtargetInfo; template class SmallVectorImpl; @@ -321,6 +323,20 @@ class TargetInstrInfo : public MCInstrInfo { virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, MachineBasicBlock *NewDest) const; + /// getUnconditionalBranch - Get an instruction that performs an unconditional + /// branch to the given symbol. + virtual void + getUnconditionalBranch(MCInst &MI, + const MCSymbolRefExpr *BranchTarget) const { + llvm_unreachable("Target didn't implement " + "TargetInstrInfo::getUnconditionalBranch!"); + } + + /// getTrap - Get a machine trap instruction + virtual void getTrap(MCInst &MI) const { + llvm_unreachable("Target didn't implement TargetInstrInfo::getTrap!"); + } + /// isLegalToSplitMBBAt - Return true if it's legal to split the given basic /// block at the specified instruction (i.e. instruction would be the start /// of a new basic block). @@ -728,7 +744,7 @@ class TargetInstrInfo : public MCInstrInfo { /// use for this target when scheduling the machine instructions before /// register allocation. virtual ScheduleHazardRecognizer* - CreateTargetHazardRecognizer(const TargetMachine *TM, + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, const ScheduleDAG *DAG) const; /// CreateTargetMIHazardRecognizer - Allocate and return a hazard recognizer diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h index 374a163800b4..419eced0a0be 100644 --- a/include/llvm/Target/TargetLoweringObjectFile.h +++ b/include/llvm/Target/TargetLoweringObjectFile.h @@ -131,14 +131,12 @@ class TargetLoweringObjectFile : public MCObjectFileInfo { MCStreamer &Streamer) const; virtual const MCSection *getStaticCtorSection(unsigned Priority, - const MCSymbol *KeySym, - const MCSection *KeySec) const { + const MCSymbol *KeySym) const { return StaticCtorSection; } virtual const MCSection *getStaticDtorSection(unsigned Priority, - const MCSymbol *KeySym, - const MCSection *KeySec) const { + const MCSymbol *KeySym) const { return StaticDtorSection; } diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index bf6963b79659..b263c571d9e6 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -187,26 +187,26 @@ class TargetMachine { /// getAsmVerbosityDefault - Returns the default value of asm verbosity. /// - static bool getAsmVerbosityDefault(); + bool getAsmVerbosityDefault() const ; /// setAsmVerbosityDefault - Set the default value of asm verbosity. Default /// is false. - static void setAsmVerbosityDefault(bool); + void setAsmVerbosityDefault(bool); /// getDataSections - Return true if data objects should be emitted into their /// own section, corresponds to -fdata-sections. - static bool getDataSections(); + bool getDataSections() const; /// getFunctionSections - Return true if functions should be emitted into /// their own section, corresponding to -ffunction-sections. 
- static bool getFunctionSections(); + bool getFunctionSections() const; /// setDataSections - Set if the data are emit into separate sections. - static void setDataSections(bool); + void setDataSections(bool); /// setFunctionSections - Set if the functions are emit into separate /// sections. - static void setFunctionSections(bool); + void setFunctionSections(bool); /// \brief Register analysis passes for this target with a pass manager. virtual void addAnalysisPasses(PassManagerBase &) {} diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h index 188395968bb5..922fae54bb80 100644 --- a/include/llvm/Target/TargetOptions.h +++ b/include/llvm/Target/TargetOptions.h @@ -39,6 +39,17 @@ namespace llvm { }; } + namespace JumpTable { + enum JumpTableType { + Single, // Use a single table for all indirect jumptable calls. + Arity, // Use one table per number of function parameters. + Simplified, // Use one table per function type, with types projected + // into 4 types: pointer to non-function, struct, + // primitive, and function pointer. + Full // Use one table per unique function type + }; + } + class TargetOptions { public: TargetOptions() @@ -50,11 +61,11 @@ namespace llvm { JITEmitDebugInfoToDisk(false), GuaranteedTailCallOpt(false), DisableTailCalls(false), StackAlignmentOverride(0), EnableFastISel(false), PositionIndependentExecutable(false), - UseInitArray(false), - DisableIntegratedAS(false), CompressDebugSections(false), - TrapUnreachable(false), - TrapFuncName(""), FloatABIType(FloatABI::Default), - AllowFPOpFusion(FPOpFusion::Standard) {} + UseInitArray(false), DisableIntegratedAS(false), + CompressDebugSections(false), FunctionSections(false), + DataSections(false), TrapUnreachable(false), TrapFuncName(""), + FloatABIType(FloatABI::Default), + AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs /// option is specified on the command line, and should enable debugging @@ -164,6 +175,12 @@ namespace llvm { /// Compress DWARF debug sections. unsigned CompressDebugSections : 1; + /// Emit functions into separate sections. + unsigned FunctionSections : 1; + + /// Emit data into separate sections. + unsigned DataSections : 1; + /// Emit target-specific trap instruction for 'unreachable' IR instructions. unsigned TrapUnreachable : 1; @@ -199,6 +216,10 @@ namespace llvm { /// the value of this option. FPOpFusion::FPOpFusionMode AllowFPOpFusion; + /// JTType - This flag specifies the type of jump-instruction table to + /// create for functions that have the jumptable attribute. + JumpTable::JumpTableType JTType; + /// Machine level options. 
MCTargetOptions MCOptions; }; diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h index 98a5149d50f8..78a2db183fb1 100644 --- a/include/llvm/Target/TargetSelectionDAGInfo.h +++ b/include/llvm/Target/TargetSelectionDAGInfo.h @@ -37,7 +37,7 @@ class TargetSelectionDAGInfo { const DataLayout *getDataLayout() const { return DL; } public: - explicit TargetSelectionDAGInfo(const TargetMachine &TM); + explicit TargetSelectionDAGInfo(const DataLayout *DL); virtual ~TargetSelectionDAGInfo(); /// EmitTargetCodeForMemcpy - Emit target-specific code that performs a diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h index 1b2e06acc2b0..bb164288b013 100644 --- a/include/llvm/Target/TargetSubtargetInfo.h +++ b/include/llvm/Target/TargetSubtargetInfo.h @@ -66,6 +66,13 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// scheduler. It does not yet disable the postRA scheduler. virtual bool enableMachineScheduler() const; + /// \brief True if the subtarget should run PostMachineScheduler. + /// + /// This only takes effect if the target has configured the + /// PostMachineScheduler pass to run, or if the global cl::opt flag, + /// MISchedPostRA, is set. + virtual bool enablePostMachineScheduler() const; + /// \brief Override generic scheduling policy within a region. /// /// This is a convenient way for targets that don't provide any custom @@ -76,6 +83,11 @@ class TargetSubtargetInfo : public MCSubtargetInfo { MachineInstr *end, unsigned NumRegionInstrs) const {} + // \brief Perform target specific adjustments to the latency of a schedule + // dependency. + virtual void adjustSchedDependency(SUnit *def, SUnit *use, + SDep& dep) const { } + // enablePostRAScheduler - If the target can benefit from post-regalloc // scheduling and the specified optimization level meets the requirement // return true to enable post-register-allocation scheduling. In @@ -84,15 +96,14 @@ class TargetSubtargetInfo : public MCSubtargetInfo { virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const; - // adjustSchedDependency - Perform target specific adjustments to - // the latency of a schedule dependency. - virtual void adjustSchedDependency(SUnit *def, SUnit *use, - SDep& dep) const { } /// \brief Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). virtual bool useAA() const; + /// \brief Enable the use of the early if conversion pass. + virtual bool enableEarlyIfConversion() const { return false; } + /// \brief Reset the features for the subtarget. virtual void resetSubtargetFeatures(const MachineFunction *MF) { } }; diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h index 334fb1cc4d3a..ce1a7d6a5230 100644 --- a/include/llvm/Transforms/IPO.h +++ b/include/llvm/Transforms/IPO.h @@ -58,21 +58,18 @@ ModulePass *createStripDeadDebugInfoPass(); /// ModulePass *createConstantMergePass(); - //===----------------------------------------------------------------------===// /// createGlobalOptimizerPass - This function returns a new pass that optimizes /// non-address taken internal globals. 
/// ModulePass *createGlobalOptimizerPass(); - //===----------------------------------------------------------------------===// /// createGlobalDCEPass - This transform is designed to eliminate unreachable /// internal globals (functions or global variables) /// ModulePass *createGlobalDCEPass(); - //===----------------------------------------------------------------------===// /// createGVExtractionPass - If deleteFn is true, this pass deletes /// the specified global values. Otherwise, it deletes as much of the module as diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index bfd58f11756d..50877d013702 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -55,7 +55,6 @@ using legacy::FunctionPassManager; /// ... class PassManagerBuilder { public: - /// Extensions are passed the builder itself (so they can see how it is /// configured) as well as the pass manager to add stuff to. typedef void (*ExtensionFn)(const PassManagerBuilder &Builder, @@ -86,7 +85,12 @@ class PassManagerBuilder { /// EP_EnabledOnOptLevel0 - This extension point allows adding passes that /// should not be disabled by O0 optimization level. The passes will be /// inserted after the inlining pass. - EP_EnabledOnOptLevel0 + EP_EnabledOnOptLevel0, + + /// EP_Peephole - This extension point allows adding passes that perform + /// peephole optimizations similar to the instruction combiner. These passes + /// will be inserted after each instance of the instruction combiner pass. + EP_Peephole, }; /// The Optimization Level - Specify the basic optimization level. @@ -113,6 +117,7 @@ class PassManagerBuilder { bool SLPVectorize; bool LoopVectorize; bool RerollLoops; + bool LoadCombine; private: /// ExtensionList - This is list of all of the extensions that are registered. @@ -130,8 +135,8 @@ class PassManagerBuilder { private: void addExtensionsToPM(ExtensionPointTy ETy, PassManagerBase &PM) const; void addInitialAliasAnalysisPasses(PassManagerBase &PM) const; -public: +public: /// populateFunctionPassManager - This fills in the function pass manager, /// which is expected to be run on each function immediately as it is /// generated. The idea is to reduce the size of the IR in memory. 
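A minimal sketch of how a client might use the EP_Peephole extension point added above, assuming the legacy pass-manager headers of this era; createMyPeepholePass() is a hypothetical factory standing in for whatever cleanup pass the client wants rerun after each instruction-combiner instance.

#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

using namespace llvm;

FunctionPass *createMyPeepholePass(); // hypothetical, defined by the client

// Callback invoked by PassManagerBuilder whenever it reaches EP_Peephole,
// i.e. right after it schedules an instruction-combiner pass.
static void addMyPeephole(const PassManagerBuilder &Builder,
                          PassManagerBase &PM) {
  PM.add(createMyPeepholePass());
}

void configurePasses(PassManagerBuilder &PMB) {
  PMB.OptLevel = 2;
  PMB.addExtension(PassManagerBuilder::EP_Peephole, addMyPeephole);
}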
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 61d5c26c5008..e2139efe472f 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -64,18 +64,15 @@ ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()); // Insert AddressSanitizer (address sanity checking) instrumentation -FunctionPass *createAddressSanitizerFunctionPass( - bool CheckInitOrder = true, bool CheckUseAfterReturn = false, - bool CheckLifetime = false, StringRef BlacklistFile = StringRef()); -ModulePass *createAddressSanitizerModulePass( - bool CheckInitOrder = true, StringRef BlacklistFile = StringRef()); +FunctionPass *createAddressSanitizerFunctionPass(); +ModulePass * +createAddressSanitizerModulePass(StringRef BlacklistFile = StringRef()); // Insert MemorySanitizer instrumentation (detection of uninitialized reads) -FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0, - StringRef BlacklistFile = StringRef()); +FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0); // Insert ThreadSanitizer (race detection) instrumentation -FunctionPass *createThreadSanitizerPass(StringRef BlacklistFile = StringRef()); +FunctionPass *createThreadSanitizerPass(); // Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation ModulePass *createDataFlowSanitizerPass(StringRef ABIListFile = StringRef(), diff --git a/include/llvm/Transforms/ObjCARC.h b/include/llvm/Transforms/ObjCARC.h index b3c19c077eab..1897adc2ffbf 100644 --- a/include/llvm/Transforms/ObjCARC.h +++ b/include/llvm/Transforms/ObjCARC.h @@ -46,4 +46,3 @@ Pass *createObjCARCOptPass(); } // End llvm namespace #endif - diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 6aea643c42aa..8ecfd801d0d8 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -19,6 +19,7 @@ namespace llvm { +class BasicBlockPass; class FunctionPass; class Pass; class GetElementPtrInst; @@ -155,14 +156,14 @@ Pass *createLoopRerollPass(); // // LoopRotate - This pass is a simple loop rotating pass. // -Pass *createLoopRotatePass(); +Pass *createLoopRotatePass(int MaxHeaderSize = -1); //===----------------------------------------------------------------------===// // // LoopIdiom - This pass recognizes and replaces idioms in loops. // Pass *createLoopIdiomPass(); - + //===----------------------------------------------------------------------===// // // PromoteMemoryToRegister - This pass is used to promote memory references to @@ -201,7 +202,7 @@ FunctionPass *createReassociatePass(); // preds always go to some succ. // FunctionPass *createJumpThreadingPass(); - + //===----------------------------------------------------------------------===// // // CFGSimplification - Merge basic blocks, eliminate unreachable blocks, @@ -284,10 +285,10 @@ extern char &LCSSAID; // tree. // FunctionPass *createEarlyCSEPass(); - + //===----------------------------------------------------------------------===// // -// GVN - This pass performs global value numbering and redundant load +// GVN - This pass performs global value numbering and redundant load // elimination cotemporaneously. // FunctionPass *createGVNPass(bool NoLoads = false); @@ -305,7 +306,7 @@ FunctionPass *createMemCpyOptPass(); // can prove are dead. 
// Pass *createLoopDeletionPass(); - + //===----------------------------------------------------------------------===// // // ConstantHoisting - This pass prepares a function for expensive constants. @@ -318,7 +319,7 @@ FunctionPass *createConstantHoistingPass(); // FunctionPass *createInstructionNamerPass(); extern char &InstructionNamerID; - + //===----------------------------------------------------------------------===// // // Sink - Code Sinking @@ -344,14 +345,12 @@ Pass *createCorrelatedValuePropagationPass(); FunctionPass *createInstructionSimplifierPass(); extern char &InstructionSimplifierID; - //===----------------------------------------------------------------------===// // // LowerExpectIntrinsics - Removes llvm.expect intrinsics and creates // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); - //===----------------------------------------------------------------------===// // // PartiallyInlineLibCalls - Tries to inline the fast path of library @@ -383,6 +382,12 @@ FunctionPass *createAddDiscriminatorsPass(); // FunctionPass *createSeparateConstOffsetFromGEPPass(); +//===----------------------------------------------------------------------===// +// +// LoadCombine - Combine loads into bigger loads. +// +BasicBlockPass *createLoadCombinePass(); + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/VectorUtils.h b/include/llvm/Transforms/Utils/VectorUtils.h index e1d6c562923b..44a7149eee98 100644 --- a/include/llvm/Transforms/Utils/VectorUtils.h +++ b/include/llvm/Transforms/Utils/VectorUtils.h @@ -48,12 +48,27 @@ static inline bool isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::pow: case Intrinsic::fma: case Intrinsic::fmuladd: + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::powi: return true; default: return false; } } +static bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { + switch (ID) { + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::powi: + return (ScalarOpdIdx == 1); + default: + return false; + } +} + static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, Intrinsic::ID ValidIntrinsicID) { if (I.getNumArgOperands() != 1 || diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap new file mode 100644 index 000000000000..1790a7242e5b --- /dev/null +++ b/include/llvm/module.modulemap @@ -0,0 +1,177 @@ +module LLVM_Analysis { + requires cplusplus + umbrella "Analysis" + module * { export * } + exclude header "Analysis/BlockFrequencyInfoImpl.h" +} + +module LLVM_AsmParser { requires cplusplus umbrella "AsmParser" module * { export * } } + +// A module covering CodeGen/ and Target/. These are intertwined +// and codependent, and thus notionally form a single module. +module LLVM_Backend { + requires cplusplus + + module CodeGen { + umbrella "CodeGen" + module * { export * } + + // FIXME: Why is this excluded? + exclude header "CodeGen/MachineValueType.h" + + // Exclude these; they're intended to be included into only a single + // translation unit (or none) and aren't part of this module. + exclude header "CodeGen/CommandFlags.h" + exclude header "CodeGen/LinkAllAsmWriterComponents.h" + exclude header "CodeGen/LinkAllCodegenComponents.h" + } + + module Target { + umbrella "Target" + module * { export * } + } + + // FIXME: Where should this go? 
+ module Analysis_BlockFrequencyInfoImpl { + header "Analysis/BlockFrequencyInfoImpl.h" + export * + } +} + +module LLVM_Bitcode { requires cplusplus umbrella "Bitcode" module * { export * } } +module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } } +module LLVM_DebugInfo { requires cplusplus umbrella "DebugInfo" module * { export * } } +module LLVM_ExecutionEngine { + requires cplusplus + + umbrella "ExecutionEngine" + module * { export * } + + // Exclude this; it's an optional component of the ExecutionEngine. + exclude header "ExecutionEngine/OProfileWrapper.h" + + // Exclude these; they're intended to be included into only a single + // translation unit (or none) and aren't part of this module. + exclude header "ExecutionEngine/JIT.h" + exclude header "ExecutionEngine/MCJIT.h" + exclude header "ExecutionEngine/Interpreter.h" +} + +module LLVM_IR { + requires cplusplus + + // FIXME: Is this the right place for these? + module Pass { header "Pass.h" export * } + module PassSupport { header "PassSupport.h" export * } + module PassAnalysisSupport { header "PassAnalysisSupport.h" export * } + module PassRegistry { header "PassRegistry.h" export * } + module InitializePasses { header "InitializePasses.h" export * } + + umbrella "IR" + module * { export * } + + // We cannot have llvm/PassManager.h and llvm/IR/PassManager.h in the same TU, + // so we can't include llvm/IR/PassManager.h in the IR module. + exclude header "IR/PassManager.h" + exclude header "IR/LegacyPassManager.h" + + // Exclude this; it's intended for (repeated) textual inclusion. + exclude header "IR/Instruction.def" +} + +module LLVM_LegacyPassManager { + requires cplusplus + module CompatInterface { header "PassManager.h" export * } + module Implementation { header "IR/LegacyPassManager.h" export * } +} + +module LLVM_IR_PassManager { + requires cplusplus + // FIXME PR19358: This doesn't work! conflict LLVM_LegacyPassManager, "cannot use legacy pass manager and new pass manager in same file" + header "IR/PassManager.h" + export * +} + +module LLVM_IRReader { requires cplusplus umbrella "IRReader" module * { export * } } +module LLVM_LineEditor { requires cplusplus umbrella "LineEditor" module * { export * } } +module LLVM_LTO { requires cplusplus umbrella "LTO" module * { export * } } + +module LLVM_MC { + requires cplusplus + + // FIXME: Mislayered? + module Support_TargetRegistry { + header "Support/TargetRegistry.h" + export * + } + + umbrella "MC" + module * { export * } + + // Exclude this; it's fundamentally non-modular. + exclude header "MC/MCTargetOptionsCommandFlags.h" +} + +module LLVM_Object { requires cplusplus umbrella "Object" module * { export * } } +module LLVM_Option { requires cplusplus umbrella "Option" module * { export * } } +module LLVM_TableGen { requires cplusplus umbrella "TableGen" module * { export * } } + +module LLVM_Transforms { + requires cplusplus + umbrella "Transforms" + module * { export * } + + // FIXME: Excluded because it does bad things with the legacy pass manager. + exclude header "Transforms/IPO/PassManagerBuilder.h" +} + +// A module covering ADT/ and Support/. These are intertwined and +// codependent, and notionally form a single module. +module LLVM_Utils { + module ADT { + requires cplusplus + + umbrella "ADT" + module * { export * } + } + + module Support { + requires cplusplus + + umbrella "Support" + module * { export * } + + // Exclude this; it's only included on Solaris. 
+ exclude header "Support/Solaris.h" + + // Exclude this; it's only included on AIX and fundamentally non-modular. + exclude header "Support/AIXDataTypesFix.h" + + // Exclude this; it's fundamentally non-modular. + exclude header "Support/Debug.h" + + // Exclude this; it's fundamentally non-modular. + exclude header "Support/PluginLoader.h" + + // Exclude this; it's a weirdly-factored part of llvm-gcov and conflicts + // with the Analysis module (which also defines an llvm::GCOVOptions). + exclude header "Support/GCOV.h" + + // FIXME: Mislayered? + exclude header "Support/TargetRegistry.h" + } +} + +module LLVM_CodeGen_MachineValueType { + requires cplusplus + header "CodeGen/MachineValueType.h" + export * +} + +// This is used for a $src == $build compilation. Otherwise we use +// LLVM_Support_DataTypes_Build, defined in a module map that is +// copied into the build area. +module LLVM_Support_DataTypes_Src { + header "llvm/Support/DataTypes.h" + export * +} diff --git a/include/llvm/module.modulemap.build b/include/llvm/module.modulemap.build new file mode 100644 index 000000000000..7150fe93935f --- /dev/null +++ b/include/llvm/module.modulemap.build @@ -0,0 +1,5 @@ +// This is copied into the build area for a $src != $build compilation. +module LLVM_Support_DataTypes { + header "Support/DataTypes.h" + export * +} diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp index 01c1c7e572c2..ade940a7d300 100644 --- a/lib/Analysis/Analysis.cpp +++ b/lib/Analysis/Analysis.cpp @@ -48,6 +48,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeIVUsersPass(Registry); initializeInstCountPass(Registry); initializeIntervalPartitionPass(Registry); + initializeJumpInstrTableInfoPass(Registry); initializeLazyValueInfoPass(Registry); initializeLibCallAliasAnalysisPass(Registry); initializeLintPass(Registry); diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index b546789b9491..d1632fd26ace 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis InstructionSimplify.cpp Interval.cpp IntervalPartition.cpp + JumpInstrTableInfo.cpp LazyCallGraph.cpp LazyValueInfo.cpp LibCallAliasAnalysis.cpp diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 0ac1cb56aa2f..eb3e2c636909 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Config/config.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -31,11 +32,15 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FEnv.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetLibraryInfo.h" #include #include + +#ifdef HAVE_FENV_H +#include +#endif + using namespace llvm; //===----------------------------------------------------------------------===// @@ -706,7 +711,7 @@ static Constant *CastGEPIndices(ArrayRef Ops, static Constant* StripPtrCastKeepAS(Constant* Ptr) { assert(Ptr->getType()->isPointerTy() && "Not a pointer type"); PointerType *OldPtrTy = cast(Ptr->getType()); - Ptr = cast(Ptr->stripPointerCasts()); + Ptr = Ptr->stripPointerCasts(); PointerType *NewPtrTy = cast(Ptr->getType()); // Preserve the address space number of the pointer. 
@@ -1314,12 +1319,34 @@ static Constant *GetConstantFoldFPValue(double V, Type *Ty) { } +namespace { +/// llvm_fenv_clearexcept - Clear the floating-point exception state. +static inline void llvm_fenv_clearexcept() { +#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT + feclearexcept(FE_ALL_EXCEPT); +#endif + errno = 0; +} + +/// llvm_fenv_testexcept - Test if a floating-point exception was raised. +static inline bool llvm_fenv_testexcept() { + int errno_val = errno; + if (errno_val == ERANGE || errno_val == EDOM) + return true; +#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT && HAVE_DECL_FE_INEXACT + if (fetestexcept(FE_ALL_EXCEPT & ~FE_INEXACT)) + return true; +#endif + return false; +} +} // End namespace + static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, Type *Ty) { - sys::llvm_fenv_clearexcept(); + llvm_fenv_clearexcept(); V = NativeFP(V); - if (sys::llvm_fenv_testexcept()) { - sys::llvm_fenv_clearexcept(); + if (llvm_fenv_testexcept()) { + llvm_fenv_clearexcept(); return nullptr; } @@ -1328,10 +1355,10 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V, double W, Type *Ty) { - sys::llvm_fenv_clearexcept(); + llvm_fenv_clearexcept(); V = NativeFP(V, W); - if (sys::llvm_fenv_testexcept()) { - sys::llvm_fenv_clearexcept(); + if (llvm_fenv_testexcept()) { + llvm_fenv_clearexcept(); return nullptr; } diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp index 1a588211a23f..9334cebe1802 100644 --- a/lib/Analysis/Delinearization.cpp +++ b/lib/Analysis/Delinearization.cpp @@ -95,26 +95,34 @@ void Delinearization::print(raw_ostream &O, const Module *) const { // Do not analyze memory accesses outside loops. for (Loop *L = LI->getLoopFor(BB); L != nullptr; L = L->getParentLoop()) { const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(*Inst), L); + + const SCEVUnknown *BasePointer = + dyn_cast(SE->getPointerBase(AccessFn)); + // Do not delinearize if we cannot find the base pointer. + if (!BasePointer) + break; + AccessFn = SE->getMinusSCEV(AccessFn, BasePointer); const SCEVAddRecExpr *AR = dyn_cast(AccessFn); // Do not try to delinearize memory accesses that are not AddRecs. if (!AR) break; + O << "\n"; O << "Inst:" << *Inst << "\n"; O << "In Loop with Header: " << L->getHeader()->getName() << "\n"; - O << "AddRec: " << *AR << "\n"; SmallVector Subscripts, Sizes; - const SCEV *Res = AR->delinearize(*SE, Subscripts, Sizes); - if (Res == AR || Subscripts.size() == 0 || Sizes.size() == 0 || + AR->delinearize(*SE, Subscripts, Sizes, SE->getElementSize(Inst)); + if (Subscripts.size() == 0 || Sizes.size() == 0 || Subscripts.size() != Sizes.size()) { O << "failed to delinearize\n"; continue; } - O << "Base offset: " << *Res << "\n"; + + O << "Base offset: " << *BasePointer << "\n"; O << "ArrayDecl[UnknownSize]"; int Size = Subscripts.size(); for (int i = 0; i < Size - 1; i++) diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp index 57231b8325a3..d0784f1e678d 100644 --- a/lib/Analysis/DependenceAnalysis.cpp +++ b/lib/Analysis/DependenceAnalysis.cpp @@ -3180,9 +3180,21 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level, /// source and destination array references are recurrences on a nested loop, /// this function flattens the nested recurrences into separate recurrences /// for each loop level. 
-bool -DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV, - SmallVectorImpl &Pair) const { +bool DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, + const SCEV *DstSCEV, + SmallVectorImpl &Pair, + const SCEV *ElementSize) const { + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcSCEV)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstSCEV)); + + if (!SrcBase || !DstBase || SrcBase != DstBase) + return false; + + SrcSCEV = SE->getMinusSCEV(SrcSCEV, SrcBase); + DstSCEV = SE->getMinusSCEV(DstSCEV, DstBase); + const SCEVAddRecExpr *SrcAR = dyn_cast(SrcSCEV); const SCEVAddRecExpr *DstAR = dyn_cast(DstSCEV); if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine()) @@ -3195,24 +3207,18 @@ DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV, // Second step: find subscript sizes. SmallVector Sizes; - SE->findArrayDimensions(Terms, Sizes); + SE->findArrayDimensions(Terms, Sizes, ElementSize); // Third step: compute the access functions for each subscript. SmallVector SrcSubscripts, DstSubscripts; - const SCEV *RemainderS = SrcAR->computeAccessFunctions(*SE, SrcSubscripts, Sizes); - const SCEV *RemainderD = DstAR->computeAccessFunctions(*SE, DstSubscripts, Sizes); + SrcAR->computeAccessFunctions(*SE, SrcSubscripts, Sizes); + DstAR->computeAccessFunctions(*SE, DstSubscripts, Sizes); // Fail when there is only a subscript: that's a linearized access function. if (SrcSubscripts.size() < 2 || DstSubscripts.size() < 2 || SrcSubscripts.size() != DstSubscripts.size()) return false; - // When the difference in remainders is different than a constant it might be - // that the base address of the arrays is not the same. - const SCEV *DiffRemainders = SE->getMinusSCEV(RemainderS, RemainderD); - if (!isa(DiffRemainders)) - return false; - int size = SrcSubscripts.size(); DEBUG({ @@ -3353,7 +3359,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, } if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) { + tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { DEBUG(dbgs() << " delinerized GEP\n"); Pairs = Pair.size(); } @@ -3777,7 +3783,7 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep, } if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) { + tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { DEBUG(dbgs() << " delinerized GEP\n"); Pairs = Pair.size(); } diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index bfab744d479f..0d9d0ef842c6 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -602,8 +602,10 @@ namespace { bool runOnSCC(CallGraphSCC &SCC) override { Out << Banner; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + assert((*I)->getFunction() && "Expecting non-null Function"); (*I)->getFunction()->print(Out); + } return false; } }; diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index c819bd319c6a..0b94238e6631 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -287,6 +287,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const { OS << ")"; } OS << " in "; + assert(UI->getUser() != nullptr && "Expected non-null User"); UI->getUser()->print(OS); OS << '\n'; } diff --git 
a/lib/Analysis/JumpInstrTableInfo.cpp b/lib/Analysis/JumpInstrTableInfo.cpp new file mode 100644 index 000000000000..b5b426533ff3 --- /dev/null +++ b/lib/Analysis/JumpInstrTableInfo.cpp @@ -0,0 +1,40 @@ +//===-- JumpInstrTableInfo.cpp: Info for Jump-Instruction Tables ----------===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Information about jump-instruction tables that have been created by +/// JumpInstrTables pass. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jiti" + +#include "llvm/Analysis/JumpInstrTableInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" + +using namespace llvm; + +INITIALIZE_PASS(JumpInstrTableInfo, "jump-instr-table-info", + "Jump-Instruction Table Info", true, true) +char JumpInstrTableInfo::ID = 0; + +ImmutablePass *llvm::createJumpInstrTableInfoPass() { + return new JumpInstrTableInfo(); +} + +JumpInstrTableInfo::JumpInstrTableInfo() : ImmutablePass(ID), Tables() { + initializeJumpInstrTableInfoPass(*PassRegistry::getPassRegistry()); +} + +JumpInstrTableInfo::~JumpInstrTableInfo() {} + +void JumpInstrTableInfo::insertEntry(FunctionType *TableFunTy, Function *Target, + Function *Jump) { + Tables[TableFunTy].push_back(JumpPair(Target, Jump)); +} diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 8df18e75c64e..2c6e6e3ffff3 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -45,6 +45,7 @@ class PrintLoopPass : public LoopPass { for (Loop::block_iterator b = L->block_begin(), be = L->block_end(); b != be; ++b) { + assert((*b) != nullptr && "Expecting non-null block"); (*b)->print(Out); } return false; diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp index 3c7798f2f42b..d11b3323cac6 100644 --- a/lib/Analysis/RegionPass.cpp +++ b/lib/Analysis/RegionPass.cpp @@ -195,8 +195,10 @@ class PrintRegionPass : public RegionPass { bool runOnRegion(Region *R, RGPassManager &RGM) override { Out << Banner; - for (const auto &BB : R->blocks()) + for (const auto &BB : R->blocks()) { + assert(BB != nullptr && "Expecting non-null Block"); BB->print(Out); + } return false; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index cef9966f9e8c..06dbde58c108 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -1201,6 +1201,23 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, return getTruncateOrSignExtend(X, Ty); } + // sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2 + if (auto SA = dyn_cast(Op)) { + if (SA->getNumOperands() == 2) { + auto SC1 = dyn_cast(SA->getOperand(0)); + auto SMul = dyn_cast(SA->getOperand(1)); + if (SMul && SC1) { + if (auto SC2 = dyn_cast(SMul->getOperand(0))) { + const APInt &C1 = SC1->getValue()->getValue(); + const APInt &C2 = SC2->getValue()->getValue(); + if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && + C2.ugt(C1) && C2.isPowerOf2()) + return getAddExpr(getSignExtendExpr(SC1, Ty), + getSignExtendExpr(SMul, Ty)); + } + } + } + } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can sign extend all of the // operands (often constants). 
This allows analysis of something like @@ -1292,6 +1309,22 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, L, AR->getNoWrapFlags()); } } + // If Start and Step are constants, check if we can apply this + // transformation: + // sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2 + auto SC1 = dyn_cast(Start); + auto SC2 = dyn_cast(Step); + if (SC1 && SC2) { + const APInt &C1 = SC1->getValue()->getValue(); + const APInt &C2 = SC2->getValue()->getValue(); + if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && + C2.isPowerOf2()) { + Start = getSignExtendExpr(Start, Ty); + const SCEV *NewAR = getAddRecExpr(getConstant(AR->getType(), 0), Step, + L, AR->getNoWrapFlags()); + return getAddExpr(Start, getSignExtendExpr(NewAR, Ty)); + } + } } // The cast wasn't folded; create an explicit cast node. @@ -4409,38 +4442,63 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); - // Examine all exits and pick the most conservative values. - const SCEV *MaxBECount = getCouldNotCompute(); + SmallVector, 4> ExitCounts; bool CouldComputeBECount = true; BasicBlock *Latch = L->getLoopLatch(); // may be NULL. - const SCEV *LatchMaxCount = nullptr; - SmallVector, 4> ExitCounts; + const SCEV *MustExitMaxBECount = nullptr; + const SCEV *MayExitMaxBECount = nullptr; + + // Compute the ExitLimit for each loop exit. Use this to populate ExitCounts + // and compute maxBECount. for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { - ExitLimit EL = ComputeExitLimit(L, ExitingBlocks[i]); + BasicBlock *ExitBB = ExitingBlocks[i]; + ExitLimit EL = ComputeExitLimit(L, ExitBB); + + // 1. For each exit that can be computed, add an entry to ExitCounts. + // CouldComputeBECount is true only if all exits can be computed. if (EL.Exact == getCouldNotCompute()) // We couldn't compute an exact value for this exit, so // we won't be able to compute an exact value for the loop. CouldComputeBECount = false; else - ExitCounts.push_back(std::make_pair(ExitingBlocks[i], EL.Exact)); - - if (MaxBECount == getCouldNotCompute()) - MaxBECount = EL.Max; - else if (EL.Max != getCouldNotCompute()) { - // We cannot take the "min" MaxBECount, because non-unit stride loops may - // skip some loop tests. Taking the max over the exits is sufficiently - // conservative. TODO: We could do better taking into consideration - // non-latch exits that dominate the latch. - if (EL.MustExit && ExitingBlocks[i] == Latch) - LatchMaxCount = EL.Max; - else - MaxBECount = getUMaxFromMismatchedTypes(MaxBECount, EL.Max); + ExitCounts.push_back(std::make_pair(ExitBB, EL.Exact)); + + // 2. Derive the loop's MaxBECount from each exit's max number of + // non-exiting iterations. Partition the loop exits into two kinds: + // LoopMustExits and LoopMayExits. + // + // A LoopMustExit meets two requirements: + // + // (a) Its ExitLimit.MustExit flag must be set which indicates that the exit + // test condition cannot be skipped (the tested variable has unit stride or + // the test is less-than or greater-than, rather than a strict inequality). + // + // (b) It must dominate the loop latch, hence must be tested on every loop + // iteration. + // + // If any computable LoopMustExit is found, then MaxBECount is the minimum + // EL.Max of computable LoopMustExits. Otherwise, MaxBECount is + // conservatively the maximum EL.Max, where CouldNotCompute is considered + // greater than any computable EL.Max. 
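The comment block that closes just above describes the new aggregation rule: take the minimum EL.Max over computable must-exits that dominate the latch, and only fall back to a conservative maximum over the remaining exits, with CouldNotCompute acting as "greater than anything computable". A minimal sketch of that rule on plain integers, using std::optional as a stand-in for CouldNotCompute; the struct and function names are illustrative only, not LLVM API:

  #include <algorithm>
  #include <cstdint>
  #include <iostream>
  #include <optional>
  #include <vector>

  struct ExitLimit {
    std::optional<uint64_t> Max; // nullopt plays the role of CouldNotCompute
    bool MustExit;               // the exit test cannot be skipped
    bool DominatesLatch;         // the exit is tested on every iteration
  };

  // Minimum over computable latch-dominating must-exits; otherwise a
  // conservative maximum over the rest, where any unknown max poisons the result.
  static std::optional<uint64_t> maxBackedgeTakenCount(const std::vector<ExitLimit> &Exits) {
    std::optional<uint64_t> MustExitMax, MayExitMax;
    bool MayExitUnknown = false;
    for (const ExitLimit &EL : Exits) {
      if (EL.MustExit && EL.Max && EL.DominatesLatch)
        MustExitMax = MustExitMax ? std::min(*MustExitMax, *EL.Max) : *EL.Max;
      else if (!EL.Max)
        MayExitUnknown = true;
      else if (!MayExitUnknown)
        MayExitMax = MayExitMax ? std::max(*MayExitMax, *EL.Max) : *EL.Max;
    }
    if (MustExitMax)
      return MustExitMax;
    return MayExitUnknown ? std::nullopt : MayExitMax;
  }

  int main() {
    // A latch-dominating must-exit of 10 beats a non-dominating may-exit of 100.
    std::vector<ExitLimit> Exits = {{10, true, true}, {100, false, false}};
    std::cout << *maxBackedgeTakenCount(Exits) << "\n"; // prints 10
  }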
+ if (EL.MustExit && EL.Max != getCouldNotCompute() && Latch && + DT->dominates(ExitBB, Latch)) { + if (!MustExitMaxBECount) + MustExitMaxBECount = EL.Max; + else { + MustExitMaxBECount = + getUMinFromMismatchedTypes(MustExitMaxBECount, EL.Max); + } + } else if (MayExitMaxBECount != getCouldNotCompute()) { + if (!MayExitMaxBECount || EL.Max == getCouldNotCompute()) + MayExitMaxBECount = EL.Max; + else { + MayExitMaxBECount = + getUMaxFromMismatchedTypes(MayExitMaxBECount, EL.Max); + } } } - // Be more precise in the easy case of a loop latch that must exit. - if (LatchMaxCount) { - MaxBECount = getUMinFromMismatchedTypes(MaxBECount, LatchMaxCount); - } + const SCEV *MaxBECount = MustExitMaxBECount ? MustExitMaxBECount : + (MayExitMaxBECount ? MayExitMaxBECount : getCouldNotCompute()); return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount); } @@ -6137,18 +6195,30 @@ bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred, // If LHS or RHS is an addrec, check to see if the condition is true in // every iteration of the loop. - if (const SCEVAddRecExpr *AR = dyn_cast(LHS)) - if (isLoopEntryGuardedByCond( - AR->getLoop(), Pred, AR->getStart(), RHS) && - isLoopBackedgeGuardedByCond( - AR->getLoop(), Pred, AR->getPostIncExpr(*this), RHS)) - return true; - if (const SCEVAddRecExpr *AR = dyn_cast(RHS)) - if (isLoopEntryGuardedByCond( - AR->getLoop(), Pred, LHS, AR->getStart()) && - isLoopBackedgeGuardedByCond( - AR->getLoop(), Pred, LHS, AR->getPostIncExpr(*this))) - return true; + // If LHS and RHS are both addrec, both conditions must be true in + // every iteration of the loop. + const SCEVAddRecExpr *LAR = dyn_cast(LHS); + const SCEVAddRecExpr *RAR = dyn_cast(RHS); + bool LeftGuarded = false; + bool RightGuarded = false; + if (LAR) { + const Loop *L = LAR->getLoop(); + if (isLoopEntryGuardedByCond(L, Pred, LAR->getStart(), RHS) && + isLoopBackedgeGuardedByCond(L, Pred, LAR->getPostIncExpr(*this), RHS)) { + if (!RAR) return true; + LeftGuarded = true; + } + } + if (RAR) { + const Loop *L = RAR->getLoop(); + if (isLoopEntryGuardedByCond(L, Pred, LHS, RAR->getStart()) && + isLoopBackedgeGuardedByCond(L, Pred, LHS, RAR->getPostIncExpr(*this))) { + if (!LAR) return true; + RightGuarded = true; + } + } + if (LeftGuarded && RightGuarded) + return true; // Otherwise see what can be done with known constant ranges. return isKnownPredicateWithRanges(Pred, LHS, RHS); @@ -6874,7 +6944,7 @@ struct SCEVCollectTerms { : Terms(T) {} bool follow(const SCEV *S) { - if (isa(S) || isa(S) || isa(S)) { + if (isa(S) || isa(S)) { if (!containsUndefs(S)) Terms.push_back(S); @@ -7061,9 +7131,19 @@ struct SCEVDivision : public SCEVVisitor { void visitAddExpr(const SCEVAddExpr *Numerator) { SmallVector Qs, Rs; + Type *Ty = Denominator->getType(); + for (const SCEV *Op : Numerator->operands()) { const SCEV *Q, *R; divide(SE, Op, Denominator, &Q, &R); + + // Bail out if types do not match. + if (Ty != Q->getType() || Ty != R->getType()) { + Quotient = Zero; + Remainder = Numerator; + return; + } + Qs.push_back(Q); Rs.push_back(R); } @@ -7080,9 +7160,17 @@ struct SCEVDivision : public SCEVVisitor { void visitMulExpr(const SCEVMulExpr *Numerator) { SmallVector Qs; + Type *Ty = Denominator->getType(); bool FoundDenominatorTerm = false; for (const SCEV *Op : Numerator->operands()) { + // Bail out if types do not match. 
+ if (Ty != Op->getType()) { + Quotient = Zero; + Remainder = Numerator; + return; + } + if (FoundDenominatorTerm) { Qs.push_back(Op); continue; @@ -7095,6 +7183,14 @@ struct SCEVDivision : public SCEVVisitor { Qs.push_back(Op); continue; } + + // Bail out if types do not match. + if (Ty != Q->getType()) { + Quotient = Zero; + Remainder = Numerator; + return; + } + FoundDenominatorTerm = true; Qs.push_back(Q); } @@ -7120,6 +7216,15 @@ struct SCEVDivision : public SCEVVisitor { cast(Zero)->getValue(); Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true); + if (Remainder->isZero()) { + // The Quotient is obtained by replacing Denominator by 1 in Numerator. + RewriteMap[cast(Denominator)->getValue()] = + cast(One)->getValue(); + Quotient = + SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true); + return; + } + // Quotient is (Numerator - Remainder) divided by Denominator. const SCEV *Q, *R; const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder); @@ -7141,82 +7246,31 @@ struct SCEVDivision : public SCEVVisitor { }; } -// Find the Greatest Common Divisor of A and B. -static const SCEV * -findGCD(ScalarEvolution &SE, const SCEV *A, const SCEV *B) { - - if (const SCEVConstant *CA = dyn_cast(A)) - if (const SCEVConstant *CB = dyn_cast(B)) - return SE.getConstant(gcd(CA, CB)); - - const SCEV *One = SE.getConstant(A->getType(), 1); - if (isa(A) && isa(B)) - return One; - if (isa(A) && isa(B)) - return One; - - const SCEV *Q, *R; - if (const SCEVMulExpr *M = dyn_cast(A)) { - SmallVector Qs; - for (const SCEV *Op : M->operands()) - Qs.push_back(findGCD(SE, Op, B)); - return SE.getMulExpr(Qs); - } - if (const SCEVMulExpr *M = dyn_cast(B)) { - SmallVector Qs; - for (const SCEV *Op : M->operands()) - Qs.push_back(findGCD(SE, A, Op)); - return SE.getMulExpr(Qs); - } - - SCEVDivision::divide(SE, A, B, &Q, &R); - if (R->isZero()) - return B; - - SCEVDivision::divide(SE, B, A, &Q, &R); - if (R->isZero()) - return A; - - return One; -} - -// Find the Greatest Common Divisor of all the SCEVs in Terms. -static const SCEV * -findGCD(ScalarEvolution &SE, SmallVectorImpl &Terms) { - assert(Terms.size() > 0 && "Terms vector is empty"); - - const SCEV *GCD = Terms[0]; - for (const SCEV *T : Terms) - GCD = findGCD(SE, GCD, T); - - return GCD; -} - static bool findArrayDimensionsRec(ScalarEvolution &SE, SmallVectorImpl &Terms, SmallVectorImpl &Sizes) { - // The GCD of all Terms is the dimension of the innermost dimension. - const SCEV *GCD = findGCD(SE, Terms); + int Last = Terms.size() - 1; + const SCEV *Step = Terms[Last]; // End of recursion. - if (Terms.size() == 1) { - if (const SCEVMulExpr *M = dyn_cast(GCD)) { + if (Last == 0) { + if (const SCEVMulExpr *M = dyn_cast(Step)) { SmallVector Qs; for (const SCEV *Op : M->operands()) if (!isa(Op)) Qs.push_back(Op); - GCD = SE.getMulExpr(Qs); + Step = SE.getMulExpr(Qs); } - Sizes.push_back(GCD); + Sizes.push_back(Step); return true; } for (const SCEV *&Term : Terms) { // Normalize the terms before the next call to findArrayDimensionsRec. const SCEV *Q, *R; - SCEVDivision::divide(SE, Term, GCD, &Q, &R); + SCEVDivision::divide(SE, Term, Step, &Q, &R); // Bail out when GCD does not evenly divide one of the terms. 
if (!R->isZero()) @@ -7235,7 +7289,7 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE, if (!findArrayDimensionsRec(SE, Terms, Sizes)) return false; - Sizes.push_back(GCD); + Sizes.push_back(Step); return true; } @@ -7286,13 +7340,46 @@ static inline int numberOfTerms(const SCEV *S) { return 1; } +static const SCEV *removeConstantFactors(ScalarEvolution &SE, const SCEV *T) { + if (isa(T)) + return nullptr; + + if (isa(T)) + return T; + + if (const SCEVMulExpr *M = dyn_cast(T)) { + SmallVector Factors; + for (const SCEV *Op : M->operands()) + if (!isa(Op)) + Factors.push_back(Op); + + return SE.getMulExpr(Factors); + } + + return T; +} + +/// Return the size of an element read or written by Inst. +const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) { + Type *Ty; + if (StoreInst *Store = dyn_cast(Inst)) + Ty = Store->getValueOperand()->getType(); + else if (LoadInst *Load = dyn_cast(Inst)) + Ty = Load->getType(); + else + return nullptr; + + Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Ty)); + return getSizeOfExpr(ETy, Ty); +} + /// Second step of delinearization: compute the array dimensions Sizes from the /// set of Terms extracted from the memory access function of this SCEVAddRec. -void ScalarEvolution::findArrayDimensions( - SmallVectorImpl &Terms, - SmallVectorImpl &Sizes) const { +void ScalarEvolution::findArrayDimensions(SmallVectorImpl &Terms, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const { - if (Terms.size() < 2) + if (Terms.size() < 1 || !ElementSize) return; // Early return when Terms do not contain parameters: we do not delinearize @@ -7315,20 +7402,37 @@ void ScalarEvolution::findArrayDimensions( return numberOfTerms(LHS) > numberOfTerms(RHS); }); + ScalarEvolution &SE = *const_cast(this); + + // Divide all terms by the element size. + for (const SCEV *&Term : Terms) { + const SCEV *Q, *R; + SCEVDivision::divide(SE, Term, ElementSize, &Q, &R); + Term = Q; + } + + SmallVector NewTerms; + + // Remove constant factors. + for (const SCEV *T : Terms) + if (const SCEV *NewT = removeConstantFactors(SE, T)) + NewTerms.push_back(NewT); + DEBUG({ dbgs() << "Terms after sorting:\n"; - for (const SCEV *T : Terms) + for (const SCEV *T : NewTerms) dbgs() << *T << "\n"; }); - ScalarEvolution &SE = *const_cast(this); - bool Res = findArrayDimensionsRec(SE, Terms, Sizes); - - if (!Res) { + if (NewTerms.empty() || + !findArrayDimensionsRec(SE, NewTerms, Sizes)) { Sizes.clear(); return; } + // The last element to be pushed into Sizes is the size of an element. + Sizes.push_back(ElementSize); + DEBUG({ dbgs() << "Sizes:\n"; for (const SCEV *S : Sizes) @@ -7338,16 +7442,15 @@ void ScalarEvolution::findArrayDimensions( /// Third step of delinearization: compute the access functions for the /// Subscripts based on the dimensions in Sizes. -const SCEV *SCEVAddRecExpr::computeAccessFunctions( +void SCEVAddRecExpr::computeAccessFunctions( ScalarEvolution &SE, SmallVectorImpl &Subscripts, SmallVectorImpl &Sizes) const { // Early exit in case this SCEV is not an affine multivariate function. if (Sizes.empty() || !this->isAffine()) - return nullptr; + return; - const SCEV *Zero = SE.getConstant(this->getType(), 0); - const SCEV *Res = this, *Remainder = Zero; + const SCEV *Res = this; int Last = Sizes.size() - 1; for (int i = Last; i >= 0; i--) { const SCEV *Q, *R; @@ -7363,10 +7466,17 @@ const SCEV *SCEVAddRecExpr::computeAccessFunctions( Res = Q; + // Do not record the last subscript corresponding to the size of elements in + // the array. 
if (i == Last) { - // Do not record the last subscript corresponding to the size of elements - // in the array. - Remainder = R; + + // Bail out if the remainder is too complex. + if (isa(R)) { + Subscripts.clear(); + Sizes.clear(); + return; + } + continue; } @@ -7385,7 +7495,6 @@ const SCEV *SCEVAddRecExpr::computeAccessFunctions( for (const SCEV *S : Subscripts) dbgs() << *S << "\n"; }); - return Remainder; } /// Splits the SCEV into two vectors of SCEVs representing the subscripts and @@ -7437,28 +7546,28 @@ const SCEV *SCEVAddRecExpr::computeAccessFunctions( /// asking for the SCEV of the memory access with respect to all enclosing /// loops, calling SCEV->delinearize on that and printing the results. -const SCEV * -SCEVAddRecExpr::delinearize(ScalarEvolution &SE, - SmallVectorImpl &Subscripts, - SmallVectorImpl &Sizes) const { +void SCEVAddRecExpr::delinearize(ScalarEvolution &SE, + SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const { // First step: collect parametric terms. SmallVector Terms; collectParametricTerms(SE, Terms); if (Terms.empty()) - return nullptr; + return; // Second step: find subscript sizes. - SE.findArrayDimensions(Terms, Sizes); + SE.findArrayDimensions(Terms, Sizes, ElementSize); if (Sizes.empty()) - return nullptr; + return; // Third step: compute the access functions for each subscript. - const SCEV *Remainder = computeAccessFunctions(SE, Subscripts, Sizes); + computeAccessFunctions(SE, Subscripts, Sizes); - if (!Remainder || Subscripts.empty()) - return nullptr; + if (Subscripts.empty()) + return; DEBUG({ dbgs() << "succeeded to delinearize " << *this << "\n"; @@ -7471,8 +7580,6 @@ SCEVAddRecExpr::delinearize(ScalarEvolution &SE, dbgs() << "[" << *S << "]"; dbgs() << "\n"; }); - - return Remainder; } //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp index e9db295a4acf..3ccefb01101d 100644 --- a/lib/Analysis/ScalarEvolutionNormalization.cpp +++ b/lib/Analysis/ScalarEvolutionNormalization.cpp @@ -241,7 +241,7 @@ TransformSubExpr(const SCEV *S, Instruction *User, Value *OperandValToReplace) { } /// Top level driver for transforming an expression DAG into its requested -/// post-inc form (either "Normalized" or "Denormalized". +/// post-inc form (either "Normalized" or "Denormalized"). const SCEV *llvm::TransformForPostIncUse(TransformKind Kind, const SCEV *S, Instruction *User, diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 44a34126e487..1334825a7d56 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -490,7 +490,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(available_externally); KEYWORD(linkonce); KEYWORD(linkonce_odr); - KEYWORD(weak); + KEYWORD(weak); // Use as a linkage, and a modifier for "cmpxchg". 
KEYWORD(weak_odr); KEYWORD(appending); KEYWORD(dllimport); @@ -583,6 +583,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(cold); KEYWORD(inlinehint); KEYWORD(inreg); + KEYWORD(jumptable); KEYWORD(minsize); KEYWORD(naked); KEYWORD(nest); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 3282e8a23ba7..0c188f983ff5 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -257,33 +257,31 @@ bool LLParser::ParseTopLevelEntities() { case lltok::kw_appending: // OptionalLinkage case lltok::kw_common: // OptionalLinkage case lltok::kw_extern_weak: // OptionalLinkage - case lltok::kw_external: { // OptionalLinkage + case lltok::kw_external: // OptionalLinkage + case lltok::kw_default: // OptionalVisibility + case lltok::kw_hidden: // OptionalVisibility + case lltok::kw_protected: // OptionalVisibility + case lltok::kw_dllimport: // OptionalDLLStorageClass + case lltok::kw_dllexport: // OptionalDLLStorageClass + case lltok::kw_thread_local: // OptionalThreadLocal + case lltok::kw_addrspace: // OptionalAddrSpace + case lltok::kw_constant: // GlobalType + case lltok::kw_global: { // GlobalType unsigned Linkage, Visibility, DLLStorageClass; - if (ParseOptionalLinkage(Linkage) || + bool UnnamedAddr; + GlobalVariable::ThreadLocalMode TLM; + bool HasLinkage; + if (ParseOptionalLinkage(Linkage, HasLinkage) || ParseOptionalVisibility(Visibility) || ParseOptionalDLLStorageClass(DLLStorageClass) || - ParseGlobal("", SMLoc(), Linkage, true, Visibility, DLLStorageClass)) - return true; - break; - } - case lltok::kw_default: // OptionalVisibility - case lltok::kw_hidden: // OptionalVisibility - case lltok::kw_protected: { // OptionalVisibility - unsigned Visibility, DLLStorageClass; - if (ParseOptionalVisibility(Visibility) || - ParseOptionalDLLStorageClass(DLLStorageClass) || - ParseGlobal("", SMLoc(), 0, false, Visibility, DLLStorageClass)) + ParseOptionalThreadLocal(TLM) || + parseOptionalUnnamedAddr(UnnamedAddr) || + ParseGlobal("", SMLoc(), Linkage, HasLinkage, Visibility, + DLLStorageClass, TLM, UnnamedAddr)) return true; break; } - case lltok::kw_thread_local: // OptionalThreadLocal - case lltok::kw_addrspace: // OptionalAddrSpace - case lltok::kw_constant: // GlobalType - case lltok::kw_global: // GlobalType - if (ParseGlobal("", SMLoc(), 0, false, 0, 0)) return true; - break; - case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break; } } @@ -470,15 +468,20 @@ bool LLParser::ParseUnnamedGlobal() { bool HasLinkage; unsigned Linkage, Visibility, DLLStorageClass; + GlobalVariable::ThreadLocalMode TLM; + bool UnnamedAddr; if (ParseOptionalLinkage(Linkage, HasLinkage) || ParseOptionalVisibility(Visibility) || - ParseOptionalDLLStorageClass(DLLStorageClass)) + ParseOptionalDLLStorageClass(DLLStorageClass) || + ParseOptionalThreadLocal(TLM) || + parseOptionalUnnamedAddr(UnnamedAddr)) return true; if (HasLinkage || Lex.getKind() != lltok::kw_alias) return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility, - DLLStorageClass); - return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass); + DLLStorageClass, TLM, UnnamedAddr); + return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass, TLM, + UnnamedAddr); } /// ParseNamedGlobal: @@ -493,16 +496,21 @@ bool LLParser::ParseNamedGlobal() { bool HasLinkage; unsigned Linkage, Visibility, DLLStorageClass; + GlobalVariable::ThreadLocalMode TLM; + bool UnnamedAddr; if (ParseToken(lltok::equal, "expected '=' in global variable") || ParseOptionalLinkage(Linkage, HasLinkage) || 
ParseOptionalVisibility(Visibility) || - ParseOptionalDLLStorageClass(DLLStorageClass)) + ParseOptionalDLLStorageClass(DLLStorageClass) || + ParseOptionalThreadLocal(TLM) || + parseOptionalUnnamedAddr(UnnamedAddr)) return true; if (HasLinkage || Lex.getKind() != lltok::kw_alias) return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility, - DLLStorageClass); - return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass); + DLLStorageClass, TLM, UnnamedAddr); + return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass, TLM, + UnnamedAddr); } // MDString: @@ -628,18 +636,19 @@ static bool isValidVisibilityForLinkage(unsigned V, unsigned L) { } /// ParseAlias: -/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass 'alias' +/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass +/// OptionalThreadLocal OptionalUnNammedAddr 'alias' /// OptionalLinkage Aliasee -/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass 'alias' -/// OptionalLinkage OptionalAddrSpace Type, Aliasee /// /// Aliasee /// ::= TypeAndValue /// -/// Everything through DLL storage class has already been parsed. +/// Everything through OptionalUnNammedAddr has already been parsed. /// bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, - unsigned Visibility, unsigned DLLStorageClass) { + unsigned Visibility, unsigned DLLStorageClass, + GlobalVariable::ThreadLocalMode TLM, + bool UnnamedAddr) { assert(Lex.getKind() == lltok::kw_alias); Lex.Lex(); LocTy LinkageLoc = Lex.getLoc(); @@ -656,51 +665,39 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, return Error(LinkageLoc, "symbol with local linkage must have default visibility"); - bool HasAddrSpace = Lex.getKind() == lltok::kw_addrspace; - unsigned AddrSpace; - LocTy AddrSpaceLoc = Lex.getLoc(); - if (ParseOptionalAddrSpace(AddrSpace)) - return true; - - LocTy TyLoc = Lex.getLoc(); - Type *Ty = nullptr; - if (ParseType(Ty)) - return true; - - bool DifferentType = EatIfPresent(lltok::comma); - if (HasAddrSpace && !DifferentType) - return Error(AddrSpaceLoc, "A type is required if addrspace is given"); - - Type *AliaseeType = nullptr; - if (DifferentType) { - if (ParseType(AliaseeType)) + Constant *Aliasee; + LocTy AliaseeLoc = Lex.getLoc(); + if (Lex.getKind() != lltok::kw_bitcast && + Lex.getKind() != lltok::kw_getelementptr && + Lex.getKind() != lltok::kw_addrspacecast && + Lex.getKind() != lltok::kw_inttoptr) { + if (ParseGlobalTypeAndValue(Aliasee)) return true; } else { - AliaseeType = Ty; - auto *PTy = dyn_cast(Ty); - if (!PTy) - return Error(TyLoc, "An alias must have pointer type"); - Ty = PTy->getElementType(); - AddrSpace = PTy->getAddressSpace(); + // The bitcast dest type is not present, it is implied by the dest type. + ValID ID; + if (ParseValID(ID)) + return true; + if (ID.Kind != ValID::t_Constant) + return Error(AliaseeLoc, "invalid aliasee"); + Aliasee = ID.ConstantVal; } - LocTy AliaseeLoc = Lex.getLoc(); - Constant *C; - if (ParseGlobalValue(AliaseeType, C)) - return true; - - auto *Aliasee = dyn_cast(C); - if (!Aliasee) - return Error(AliaseeLoc, "Alias must point to function or variable"); - - assert(Aliasee->getType()->isPointerTy()); + Type *AliaseeType = Aliasee->getType(); + auto *PTy = dyn_cast(AliaseeType); + if (!PTy) + return Error(AliaseeLoc, "An alias must have pointer type"); + Type *Ty = PTy->getElementType(); + unsigned AddrSpace = PTy->getAddressSpace(); // Okay, create the alias but do not insert it into the module yet. 
std::unique_ptr GA( GlobalAlias::create(Ty, AddrSpace, (GlobalValue::LinkageTypes)Linkage, Name, Aliasee, /*Parent*/ nullptr)); + GA->setThreadLocalMode(TLM); GA->setVisibility((GlobalValue::VisibilityTypes)Visibility); GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass); + GA->setUnnamedAddr(UnnamedAddr); // See if this value already exists in the symbol table. If so, it is either // a redefinition or a definition of a forward reference. @@ -720,11 +717,6 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, // If they agree, just RAUW the old value with the alias and remove the // forward ref info. - for (auto *User : Val->users()) { - if (auto *GA = dyn_cast(User)) - return Error(NameLoc, "Alias is pointed by alias " + GA->getName()); - } - Val->replaceAllUsesWith(GA.get()); Val->eraseFromParent(); ForwardRefVals.erase(I); @@ -742,34 +734,31 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, /// ParseGlobal /// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass -/// OptionalThreadLocal OptionalAddrSpace OptionalUnNammedAddr +/// OptionalThreadLocal OptionalUnNammedAddr OptionalAddrSpace /// OptionalExternallyInitialized GlobalType Type Const /// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass -/// OptionalThreadLocal OptionalAddrSpace OptionalUnNammedAddr +/// OptionalThreadLocal OptionalUnNammedAddr OptionalAddrSpace /// OptionalExternallyInitialized GlobalType Type Const /// -/// Everything up to and including OptionalDLLStorageClass has been parsed +/// Everything up to and including OptionalUnNammedAddr has been parsed /// already. /// bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, bool HasLinkage, - unsigned Visibility, unsigned DLLStorageClass) { + unsigned Visibility, unsigned DLLStorageClass, + GlobalVariable::ThreadLocalMode TLM, + bool UnnamedAddr) { if (!isValidVisibilityForLinkage(Visibility, Linkage)) return Error(NameLoc, "symbol with local linkage must have default visibility"); unsigned AddrSpace; - bool IsConstant, UnnamedAddr, IsExternallyInitialized; - GlobalVariable::ThreadLocalMode TLM; - LocTy UnnamedAddrLoc; + bool IsConstant, IsExternallyInitialized; LocTy IsExternallyInitializedLoc; LocTy TyLoc; Type *Ty = nullptr; - if (ParseOptionalThreadLocal(TLM) || - ParseOptionalAddrSpace(AddrSpace) || - ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, - &UnnamedAddrLoc) || + if (ParseOptionalAddrSpace(AddrSpace) || ParseOptionalToken(lltok::kw_externally_initialized, IsExternallyInitialized, &IsExternallyInitializedLoc) || @@ -967,6 +956,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break; case lltok::kw_cold: B.addAttribute(Attribute::Cold); break; case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break; + case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break; case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break; case lltok::kw_naked: B.addAttribute(Attribute::Naked); break; case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break; @@ -1230,6 +1220,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { case lltok::kw_alwaysinline: case lltok::kw_builtin: case lltok::kw_inlinehint: + case lltok::kw_jumptable: case lltok::kw_minsize: case lltok::kw_naked: case lltok::kw_nobuiltin: @@ -1291,6 +1282,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_builtin: case 
lltok::kw_cold: case lltok::kw_inlinehint: + case lltok::kw_jumptable: case lltok::kw_minsize: case lltok::kw_naked: case lltok::kw_nobuiltin: @@ -4011,7 +4003,8 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { else return TokError("expected 'catch' or 'filter' clause type"); - Value *V; LocTy VLoc; + Value *V; + LocTy VLoc; if (ParseTypeAndValue(V, VLoc, PFS)) { delete LP; return true; @@ -4027,7 +4020,7 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { Error(VLoc, "'filter' clause has an invalid type"); } - LP->addClause(V); + LP->addClause(cast(V)); } Inst = LP; @@ -4263,8 +4256,8 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) { } /// ParseCmpXchg -/// ::= 'cmpxchg' 'volatile'? TypeAndValue ',' TypeAndValue ',' TypeAndValue -/// 'singlethread'? AtomicOrdering AtomicOrdering +/// ::= 'cmpxchg' 'weak'? 'volatile'? TypeAndValue ',' TypeAndValue ',' +/// TypeAndValue 'singlethread'? AtomicOrdering AtomicOrdering int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc; bool AteExtraComma = false; @@ -4272,6 +4265,10 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { AtomicOrdering FailureOrdering = NotAtomic; SynchronizationScope Scope = CrossThread; bool isVolatile = false; + bool isWeak = false; + + if (EatIfPresent(lltok::kw_weak)) + isWeak = true; if (EatIfPresent(lltok::kw_volatile)) isVolatile = true; @@ -4304,9 +4301,10 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) { return Error(NewLoc, "cmpxchg operand must be power-of-two byte-sized" " integer"); - AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, - FailureOrdering, Scope); + AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst( + Ptr, Cmp, New, SuccessOrdering, FailureOrdering, Scope); CXI->setVolatile(isVolatile); + CXI->setWeak(isWeak); Inst = CXI; return AteExtraComma ? 
InstExtraComma : InstNormal; } diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index e2bf46290b38..f7d69d267d5d 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -197,6 +197,9 @@ namespace llvm { bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); + bool parseOptionalUnnamedAddr(bool &UnnamedAddr) { + return ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr); + } bool ParseOptionalAddrSpace(unsigned &AddrSpace); bool ParseOptionalParamAttrs(AttrBuilder &B); bool ParseOptionalReturnAttrs(AttrBuilder &B); @@ -239,9 +242,11 @@ namespace llvm { bool ParseNamedGlobal(); bool ParseGlobal(const std::string &Name, LocTy Loc, unsigned Linkage, bool HasLinkage, unsigned Visibility, - unsigned DLLStorageClass); + unsigned DLLStorageClass, + GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr); bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Visibility, - unsigned DLLStorageClass); + unsigned DLLStorageClass, + GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr); bool ParseStandaloneMetadata(); bool ParseNamedMetadata(); bool ParseMDString(MDString *&Result); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index b6b7d824b5f5..af8b0da78bf8 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -42,7 +42,8 @@ namespace lltok { kw_linker_private, // NOTE: deprecated, for parser compatibility kw_linker_private_weak, // NOTE: deprecated, for parser compatibility kw_linkonce, kw_linkonce_odr, - kw_weak, kw_weak_odr, kw_appending, + kw_weak, // Used as a linkage, and a modifier for "cmpxchg". + kw_weak_odr, kw_appending, kw_dllimport, kw_dllexport, kw_common, kw_available_externally, kw_default, kw_hidden, kw_protected, kw_unnamed_addr, @@ -107,6 +108,7 @@ namespace lltok { kw_cold, kw_inlinehint, kw_inreg, + kw_jumptable, kw_minsize, kw_naked, kw_nest, diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp index 2606bc2d0832..067cac8578cd 100644 --- a/lib/AsmParser/Parser.cpp +++ b/lib/AsmParser/Parser.cpp @@ -17,8 +17,8 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include +#include using namespace llvm; Module *llvm::ParseAssembly(MemoryBuffer *F, @@ -42,7 +42,7 @@ Module *llvm::ParseAssembly(MemoryBuffer *F, Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err, LLVMContext &Context) { std::unique_ptr File; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { + if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + ec.message()); return nullptr; diff --git a/lib/AsmParser/module.modulemap b/lib/AsmParser/module.modulemap new file mode 100644 index 000000000000..cc300060b3f5 --- /dev/null +++ b/lib/AsmParser/module.modulemap @@ -0,0 +1 @@ +module AsmParser { requires cplusplus umbrella "." 
module * { export * } } diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp index 716299fc68e3..b5886c16fa44 100644 --- a/lib/Bitcode/Reader/BitReader.cpp +++ b/lib/Bitcode/Reader/BitReader.cpp @@ -32,7 +32,7 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, char **OutMessage) { ErrorOr ModuleOrErr = parseBitcodeFile(unwrap(MemBuf), *unwrap(ContextRef)); - if (error_code EC = ModuleOrErr.getError()) { + if (std::error_code EC = ModuleOrErr.getError()) { if (OutMessage) *OutMessage = strdup(EC.message().c_str()); *OutModule = wrap((Module*)nullptr); @@ -54,7 +54,7 @@ LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef, ErrorOr ModuleOrErr = getLazyBitcodeModule(unwrap(MemBuf), *unwrap(ContextRef)); - if (error_code EC = ModuleOrErr.getError()) { + if (std::error_code EC = ModuleOrErr.getError()) { *OutM = wrap((Module *)nullptr); if (OutMessage) *OutMessage = strdup(EC.message().c_str()); diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 4170f98567c2..9c398277d42d 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -470,7 +470,7 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B, (EncodedAttrs & 0xffff)); } -error_code BitcodeReader::ParseAttributeBlock() { +std::error_code BitcodeReader::ParseAttributeBlock() { if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID)) return Error(InvalidRecord); @@ -490,7 +490,7 @@ error_code BitcodeReader::ParseAttributeBlock() { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -549,6 +549,8 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) { return Attribute::InlineHint; case bitc::ATTR_KIND_IN_REG: return Attribute::InReg; + case bitc::ATTR_KIND_JUMP_TABLE: + return Attribute::JumpTable; case bitc::ATTR_KIND_MIN_SIZE: return Attribute::MinSize; case bitc::ATTR_KIND_NAKED: @@ -614,15 +616,15 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) { } } -error_code BitcodeReader::ParseAttrKind(uint64_t Code, - Attribute::AttrKind *Kind) { +std::error_code BitcodeReader::ParseAttrKind(uint64_t Code, + Attribute::AttrKind *Kind) { *Kind = GetAttrFromCode(Code); if (*Kind == Attribute::None) return Error(InvalidValue); - return error_code::success(); + return std::error_code(); } -error_code BitcodeReader::ParseAttributeGroupBlock() { +std::error_code BitcodeReader::ParseAttributeGroupBlock() { if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID)) return Error(InvalidRecord); @@ -640,7 +642,7 @@ error_code BitcodeReader::ParseAttributeGroupBlock() { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. 
break; @@ -662,13 +664,13 @@ error_code BitcodeReader::ParseAttributeGroupBlock() { for (unsigned i = 2, e = Record.size(); i != e; ++i) { if (Record[i] == 0) { // Enum attribute Attribute::AttrKind Kind; - if (error_code EC = ParseAttrKind(Record[++i], &Kind)) + if (std::error_code EC = ParseAttrKind(Record[++i], &Kind)) return EC; B.addAttribute(Kind); } else if (Record[i] == 1) { // Align attribute Attribute::AttrKind Kind; - if (error_code EC = ParseAttrKind(Record[++i], &Kind)) + if (std::error_code EC = ParseAttrKind(Record[++i], &Kind)) return EC; if (Kind == Attribute::Alignment) B.addAlignmentAttr(Record[++i]); @@ -704,14 +706,14 @@ error_code BitcodeReader::ParseAttributeGroupBlock() { } } -error_code BitcodeReader::ParseTypeTable() { +std::error_code BitcodeReader::ParseTypeTable() { if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW)) return Error(InvalidRecord); return ParseTypeTableBody(); } -error_code BitcodeReader::ParseTypeTableBody() { +std::error_code BitcodeReader::ParseTypeTableBody() { if (!TypeList.empty()) return Error(InvalidMultipleBlocks); @@ -731,7 +733,7 @@ error_code BitcodeReader::ParseTypeTableBody() { case BitstreamEntry::EndBlock: if (NumRecords != TypeList.size()) return Error(MalformedBlock); - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -931,7 +933,7 @@ error_code BitcodeReader::ParseTypeTableBody() { } } -error_code BitcodeReader::ParseValueSymbolTable() { +std::error_code BitcodeReader::ParseValueSymbolTable() { if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID)) return Error(InvalidRecord); @@ -947,7 +949,7 @@ error_code BitcodeReader::ParseValueSymbolTable() { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -985,7 +987,7 @@ error_code BitcodeReader::ParseValueSymbolTable() { } } -error_code BitcodeReader::ParseMetadata() { +std::error_code BitcodeReader::ParseMetadata() { unsigned NextMDValueNo = MDValueList.size(); if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) @@ -1002,7 +1004,7 @@ error_code BitcodeReader::ParseMetadata() { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -1094,31 +1096,9 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) { return 1ULL << 63; } -// FIXME: Delete this in LLVM 4.0 and just assert that the aliasee is a -// GlobalObject. -static GlobalObject & -getGlobalObjectInExpr(const DenseMap &Map, - Constant &C) { - auto *GO = dyn_cast(&C); - if (GO) - return *GO; - - auto *GA = dyn_cast(&C); - if (GA) - return getGlobalObjectInExpr(Map, *Map.find(GA)->second); - - auto &CE = cast(C); - assert(CE.getOpcode() == Instruction::BitCast || - CE.getOpcode() == Instruction::GetElementPtr || - CE.getOpcode() == Instruction::AddrSpaceCast); - if (CE.getOpcode() == Instruction::GetElementPtr) - assert(cast(CE).hasAllZeroIndices()); - return getGlobalObjectInExpr(Map, *CE.getOperand(0)); -} - /// ResolveGlobalAndAliasInits - Resolve all of the initializers for global /// values and aliases that we can. 
-error_code BitcodeReader::ResolveGlobalAndAliasInits() { +std::error_code BitcodeReader::ResolveGlobalAndAliasInits() { std::vector > GlobalInitWorklist; std::vector > AliasInitWorklist; std::vector > FunctionPrefixWorklist; @@ -1141,30 +1121,19 @@ error_code BitcodeReader::ResolveGlobalAndAliasInits() { GlobalInitWorklist.pop_back(); } - // FIXME: Delete this in LLVM 4.0 - // Older versions of llvm could write an alias pointing to another. We cannot - // construct those aliases, so we first collect an alias to aliasee expression - // and then compute the actual aliasee. - DenseMap AliasInit; - while (!AliasInitWorklist.empty()) { unsigned ValID = AliasInitWorklist.back().second; if (ValID >= ValueList.size()) { AliasInits.push_back(AliasInitWorklist.back()); } else { if (Constant *C = dyn_cast_or_null(ValueList[ValID])) - AliasInit.insert(std::make_pair(AliasInitWorklist.back().first, C)); + AliasInitWorklist.back().first->setAliasee(C); else return Error(ExpectedConstant); } AliasInitWorklist.pop_back(); } - for (auto &Pair : AliasInit) { - auto &GO = getGlobalObjectInExpr(AliasInit, *Pair.second); - Pair.first->setAliasee(&GO); - } - while (!FunctionPrefixWorklist.empty()) { unsigned ValID = FunctionPrefixWorklist.back().second; if (ValID >= ValueList.size()) { @@ -1178,7 +1147,7 @@ error_code BitcodeReader::ResolveGlobalAndAliasInits() { FunctionPrefixWorklist.pop_back(); } - return error_code::success(); + return std::error_code(); } static APInt ReadWideAPInt(ArrayRef Vals, unsigned TypeBits) { @@ -1189,7 +1158,7 @@ static APInt ReadWideAPInt(ArrayRef Vals, unsigned TypeBits) { return APInt(TypeBits, Words); } -error_code BitcodeReader::ParseConstants() { +std::error_code BitcodeReader::ParseConstants() { if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID)) return Error(InvalidRecord); @@ -1212,7 +1181,7 @@ error_code BitcodeReader::ParseConstants() { // Once all the constants have been read, go through and resolve forward // references. ValueList.ResolveConstantForwardRefs(); - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -1627,7 +1596,7 @@ error_code BitcodeReader::ParseConstants() { } } -error_code BitcodeReader::ParseUseLists() { +std::error_code BitcodeReader::ParseUseLists() { if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID)) return Error(InvalidRecord); @@ -1642,7 +1611,7 @@ error_code BitcodeReader::ParseUseLists() { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -1667,7 +1636,7 @@ error_code BitcodeReader::ParseUseLists() { /// RememberAndSkipFunctionBody - When we see the block for a function body, /// remember where it is and then skip it. This lets us lazily deserialize the /// functions. -error_code BitcodeReader::RememberAndSkipFunctionBody() { +std::error_code BitcodeReader::RememberAndSkipFunctionBody() { // Get the function we are talking about. if (FunctionsWithBodies.empty()) return Error(InsufficientFunctionProtos); @@ -1682,10 +1651,10 @@ error_code BitcodeReader::RememberAndSkipFunctionBody() { // Skip over the function block for now. if (Stream.SkipBlock()) return Error(InvalidRecord); - return error_code::success(); + return std::error_code(); } -error_code BitcodeReader::GlobalCleanup() { +std::error_code BitcodeReader::GlobalCleanup() { // Patch the initializers for globals and aliases up. 
ResolveGlobalAndAliasInits(); if (!GlobalInits.empty() || !AliasInits.empty()) @@ -1711,10 +1680,10 @@ error_code BitcodeReader::GlobalCleanup() { // want lazy deserialization. std::vector >().swap(GlobalInits); std::vector >().swap(AliasInits); - return error_code::success(); + return std::error_code(); } -error_code BitcodeReader::ParseModule(bool Resume) { +std::error_code BitcodeReader::ParseModule(bool Resume) { if (Resume) Stream.JumpToBit(NextUnreadBit); else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) @@ -1745,30 +1714,30 @@ error_code BitcodeReader::ParseModule(bool Resume) { return Error(MalformedBlock); break; case bitc::PARAMATTR_BLOCK_ID: - if (error_code EC = ParseAttributeBlock()) + if (std::error_code EC = ParseAttributeBlock()) return EC; break; case bitc::PARAMATTR_GROUP_BLOCK_ID: - if (error_code EC = ParseAttributeGroupBlock()) + if (std::error_code EC = ParseAttributeGroupBlock()) return EC; break; case bitc::TYPE_BLOCK_ID_NEW: - if (error_code EC = ParseTypeTable()) + if (std::error_code EC = ParseTypeTable()) return EC; break; case bitc::VALUE_SYMTAB_BLOCK_ID: - if (error_code EC = ParseValueSymbolTable()) + if (std::error_code EC = ParseValueSymbolTable()) return EC; SeenValueSymbolTable = true; break; case bitc::CONSTANTS_BLOCK_ID: - if (error_code EC = ParseConstants()) + if (std::error_code EC = ParseConstants()) return EC; - if (error_code EC = ResolveGlobalAndAliasInits()) + if (std::error_code EC = ResolveGlobalAndAliasInits()) return EC; break; case bitc::METADATA_BLOCK_ID: - if (error_code EC = ParseMetadata()) + if (std::error_code EC = ParseMetadata()) return EC; break; case bitc::FUNCTION_BLOCK_ID: @@ -1776,12 +1745,12 @@ error_code BitcodeReader::ParseModule(bool Resume) { // FunctionsWithBodies list. if (!SeenFirstFunctionBody) { std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end()); - if (error_code EC = GlobalCleanup()) + if (std::error_code EC = GlobalCleanup()) return EC; SeenFirstFunctionBody = true; } - if (error_code EC = RememberAndSkipFunctionBody()) + if (std::error_code EC = RememberAndSkipFunctionBody()) return EC; // For streaming bitcode, suspend parsing when we reach the function // bodies. Subsequent materialization calls will resume it when @@ -1791,11 +1760,11 @@ error_code BitcodeReader::ParseModule(bool Resume) { // just finish the parse now. if (LazyStreamer && SeenValueSymbolTable) { NextUnreadBit = Stream.GetCurrentBitNo(); - return error_code::success(); + return std::error_code(); } break; case bitc::USELIST_BLOCK_ID: - if (error_code EC = ParseUseLists()) + if (std::error_code EC = ParseUseLists()) return EC; break; } @@ -2017,6 +1986,10 @@ error_code BitcodeReader::ParseModule(bool Resume) { NewGA->setDLLStorageClass(GetDecodedDLLStorageClass(Record[4])); else UpgradeDLLImportExportLinkage(NewGA, Record[2]); + if (Record.size() > 5) + NewGA->setThreadLocalMode(GetDecodedThreadLocalMode(Record[5])); + if (Record.size() > 6) + NewGA->setUnnamedAddr(Record[6]); ValueList.push_back(NewGA); AliasInits.push_back(std::make_pair(NewGA, Record[1])); break; @@ -2033,10 +2006,10 @@ error_code BitcodeReader::ParseModule(bool Resume) { } } -error_code BitcodeReader::ParseBitcodeInto(Module *M) { +std::error_code BitcodeReader::ParseBitcodeInto(Module *M) { TheModule = nullptr; - if (error_code EC = InitStream()) + if (std::error_code EC = InitStream()) return EC; // Sniff for the signature. @@ -2052,7 +2025,7 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) { // need to understand them all. 
while (1) { if (Stream.AtEndOfStream()) - return error_code::success(); + return std::error_code(); BitstreamEntry Entry = Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); @@ -2061,7 +2034,7 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::SubBlock: switch (Entry.ID) { @@ -2074,10 +2047,10 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) { if (TheModule) return Error(InvalidMultipleBlocks); TheModule = M; - if (error_code EC = ParseModule(false)) + if (std::error_code EC = ParseModule(false)) return EC; if (LazyStreamer) - return error_code::success(); + return std::error_code(); break; default: if (Stream.SkipBlock()) @@ -2094,14 +2067,14 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) { if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 && Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a && Stream.AtEndOfStream()) - return error_code::success(); + return std::error_code(); return Error(InvalidRecord); } } } -error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { +std::error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) return Error(InvalidRecord); @@ -2116,7 +2089,7 @@ error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -2137,8 +2110,8 @@ error_code BitcodeReader::ParseModuleTriple(std::string &Triple) { } } -error_code BitcodeReader::ParseTriple(std::string &Triple) { - if (error_code EC = InitStream()) +std::error_code BitcodeReader::ParseTriple(std::string &Triple) { + if (std::error_code EC = InitStream()) return EC; // Sniff for the signature. @@ -2159,7 +2132,7 @@ error_code BitcodeReader::ParseTriple(std::string &Triple) { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::SubBlock: if (Entry.ID == bitc::MODULE_BLOCK_ID) @@ -2178,7 +2151,7 @@ error_code BitcodeReader::ParseTriple(std::string &Triple) { } /// ParseMetadataAttachment - Parse metadata attachments. -error_code BitcodeReader::ParseMetadataAttachment() { +std::error_code BitcodeReader::ParseMetadataAttachment() { if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID)) return Error(InvalidRecord); @@ -2191,7 +2164,7 @@ error_code BitcodeReader::ParseMetadataAttachment() { case BitstreamEntry::Error: return Error(MalformedBlock); case BitstreamEntry::EndBlock: - return error_code::success(); + return std::error_code(); case BitstreamEntry::Record: // The interesting case. break; @@ -2225,7 +2198,7 @@ error_code BitcodeReader::ParseMetadataAttachment() { } /// ParseFunctionBody - Lazily parse the specified function body block. 
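Most of the BitcodeReader churn in the surrounding hunks is mechanical: llvm::error_code becomes std::error_code, success is a default-constructed std::error_code(), and failures are built from an error enum plus the reader's own category (the BitcodeErrorCategoryType subclass further down follows this shape). A minimal, self-contained sketch of the same pattern, independent of LLVM and with a made-up enum and category name:

  #include <iostream>
  #include <string>
  #include <system_error>

  enum class ReaderError { InvalidRecord = 1, MalformedBlock };

  // A custom category, analogous in shape to BitcodeErrorCategoryType: it gives
  // the raw error values a domain name and human-readable messages.
  class ReaderErrorCategory : public std::error_category {
    const char *name() const noexcept override { return "demo.reader"; }
    std::string message(int IE) const override {
      switch (static_cast<ReaderError>(IE)) {
      case ReaderError::InvalidRecord: return "invalid record";
      case ReaderError::MalformedBlock: return "malformed block";
      }
      return "unknown error";
    }
  };

  static const std::error_category &readerCategory() {
    static ReaderErrorCategory C;
    return C;
  }

  // Success is the default-constructed std::error_code(), just as in the
  // converted ParseXXX routines; failure carries an enum value plus the category.
  static std::error_code parseBlock(bool Bad) {
    if (Bad)
      return std::error_code(static_cast<int>(ReaderError::MalformedBlock), readerCategory());
    return std::error_code();
  }

  int main() {
    if (std::error_code EC = parseBlock(true))
      std::cout << EC.category().name() << ": " << EC.message() << "\n";
  }

The "if (std::error_code EC = ...)" shape relies on std::error_code converting to false only in the success case, which is why the patch can replace error_code::success() with a plain std::error_code() everywhere.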
-error_code BitcodeReader::ParseFunctionBody(Function *F) { +std::error_code BitcodeReader::ParseFunctionBody(Function *F) { if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID)) return Error(InvalidRecord); @@ -2261,20 +2234,20 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { return Error(InvalidRecord); break; case bitc::CONSTANTS_BLOCK_ID: - if (error_code EC = ParseConstants()) + if (std::error_code EC = ParseConstants()) return EC; NextValueNo = ValueList.size(); break; case bitc::VALUE_SYMTAB_BLOCK_ID: - if (error_code EC = ParseValueSymbolTable()) + if (std::error_code EC = ParseValueSymbolTable()) return EC; break; case bitc::METADATA_ATTACHMENT_ID: - if (error_code EC = ParseMetadataAttachment()) + if (std::error_code EC = ParseMetadataAttachment()) return EC; break; case bitc::METADATA_BLOCK_ID: - if (error_code EC = ParseMetadata()) + if (std::error_code EC = ParseMetadata()) return EC; break; } @@ -2857,7 +2830,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { assert((CT != LandingPadInst::Filter || isa(Val->getType())) && "Filter clause has invalid type!"); - LP->addClause(Val); + LP->addClause(cast(Val)); } I = LP; @@ -2950,7 +2923,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_CMPXCHG: { // CMPXCHG:[ptrty, ptr, cmp, new, vol, successordering, synchscope, - // failureordering] + // failureordering?, isweak?] unsigned OpNum = 0; Value *Ptr, *Cmp, *New; if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || @@ -2958,7 +2931,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { cast(Ptr->getType())->getElementType(), Cmp) || popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getElementType(), New) || - (OpNum + 3 != Record.size() && OpNum + 4 != Record.size())) + (Record.size() < OpNum + 3 || Record.size() > OpNum + 5)) return Error(InvalidRecord); AtomicOrdering SuccessOrdering = GetDecodedOrdering(Record[OpNum+1]); if (SuccessOrdering == NotAtomic || SuccessOrdering == Unordered) @@ -2975,6 +2948,17 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { I = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SynchScope); cast(I)->setVolatile(Record[OpNum]); + + if (Record.size() < 8) { + // Before weak cmpxchgs existed, the instruction simply returned the + // value loaded from memory, so bitcode files from that era will be + // expecting the first component of a modern cmpxchg. + CurBB->getInstList().push_back(I); + I = ExtractValueInst::Create(I, 0); + } else { + cast(I)->setWeak(Record[OpNum+4]); + } + InstructionList.push_back(I); break; } @@ -3144,21 +3128,22 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { ValueList.shrinkTo(ModuleValueListSize); MDValueList.shrinkTo(ModuleMDValueListSize); std::vector().swap(FunctionBBs); - return error_code::success(); + return std::error_code(); } /// Find the function body in the bitcode stream -error_code BitcodeReader::FindFunctionInStream(Function *F, - DenseMap::iterator DeferredFunctionInfoIterator) { +std::error_code BitcodeReader::FindFunctionInStream( + Function *F, + DenseMap::iterator DeferredFunctionInfoIterator) { while (DeferredFunctionInfoIterator->second == 0) { if (Stream.AtEndOfStream()) return Error(CouldNotFindFunctionInStream); // ParseModule will parse the next body in the stream and set its // position in the DeferredFunctionInfo map. 
- if (error_code EC = ParseModule(true)) + if (std::error_code EC = ParseModule(true)) return EC; } - return error_code::success(); + return std::error_code(); } //===----------------------------------------------------------------------===// @@ -3174,24 +3159,24 @@ bool BitcodeReader::isMaterializable(const GlobalValue *GV) const { return false; } -error_code BitcodeReader::Materialize(GlobalValue *GV) { +std::error_code BitcodeReader::Materialize(GlobalValue *GV) { Function *F = dyn_cast(GV); // If it's not a function or is already material, ignore the request. if (!F || !F->isMaterializable()) - return error_code::success(); + return std::error_code(); DenseMap::iterator DFII = DeferredFunctionInfo.find(F); assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!"); // If its position is recorded as 0, its body is somewhere in the stream // but we haven't seen it yet. if (DFII->second == 0 && LazyStreamer) - if (error_code EC = FindFunctionInStream(F, DFII)) + if (std::error_code EC = FindFunctionInStream(F, DFII)) return EC; // Move the bit stream to the saved position of the deferred function body. Stream.JumpToBit(DFII->second); - if (error_code EC = ParseFunctionBody(F)) + if (std::error_code EC = ParseFunctionBody(F)) return EC; // Upgrade any old intrinsic calls in the function. @@ -3206,7 +3191,7 @@ error_code BitcodeReader::Materialize(GlobalValue *GV) { } } - return error_code::success(); + return std::error_code(); } bool BitcodeReader::isDematerializable(const GlobalValue *GV) const { @@ -3228,8 +3213,7 @@ void BitcodeReader::Dematerialize(GlobalValue *GV) { F->deleteBody(); } - -error_code BitcodeReader::MaterializeModule(Module *M) { +std::error_code BitcodeReader::MaterializeModule(Module *M) { assert(M == TheModule && "Can only Materialize the Module this BitcodeReader is attached to."); // Iterate over the module, deserializing any functions that are still on @@ -3237,7 +3221,7 @@ error_code BitcodeReader::MaterializeModule(Module *M) { for (Module::iterator F = TheModule->begin(), E = TheModule->end(); F != E; ++F) { if (F->isMaterializable()) { - if (error_code EC = Materialize(F)) + if (std::error_code EC = Materialize(F)) return EC; } } @@ -3270,16 +3254,16 @@ error_code BitcodeReader::MaterializeModule(Module *M) { UpgradeInstWithTBAATag(InstsWithTBAATag[I]); UpgradeDebugInfo(*M); - return error_code::success(); + return std::error_code(); } -error_code BitcodeReader::InitStream() { +std::error_code BitcodeReader::InitStream() { if (LazyStreamer) return InitLazyStream(); return InitStreamFromBuffer(); } -error_code BitcodeReader::InitStreamFromBuffer() { +std::error_code BitcodeReader::InitStreamFromBuffer() { const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart(); const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize(); @@ -3299,10 +3283,10 @@ error_code BitcodeReader::InitStreamFromBuffer() { StreamFile.reset(new BitstreamReader(BufPtr, BufEnd)); Stream.init(*StreamFile); - return error_code::success(); + return std::error_code(); } -error_code BitcodeReader::InitLazyStream() { +std::error_code BitcodeReader::InitLazyStream() { // Check and strip off the bitcode wrapper; BitstreamReader expects never to // see it. 
StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer); @@ -3323,12 +3307,12 @@ error_code BitcodeReader::InitLazyStream() { Bytes->dropLeadingBytes(bitcodeStart - buf); Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart); } - return error_code::success(); + return std::error_code(); } namespace { -class BitcodeErrorCategoryType : public error_category { - const char *name() const override { +class BitcodeErrorCategoryType : public std::error_category { + const char *name() const LLVM_NOEXCEPT override { return "llvm.bitcode"; } std::string message(int IE) const override { @@ -3378,7 +3362,7 @@ class BitcodeErrorCategoryType : public error_category { }; } -const error_category &BitcodeReader::BitcodeErrorCategory() { +const std::error_category &BitcodeReader::BitcodeErrorCategory() { static BitcodeErrorCategoryType O; return O; } @@ -3390,16 +3374,16 @@ const error_category &BitcodeReader::BitcodeErrorCategory() { /// getLazyBitcodeModule - lazy function-at-a-time loading from a file. /// ErrorOr llvm::getLazyBitcodeModule(MemoryBuffer *Buffer, - LLVMContext &Context) { + LLVMContext &Context, + bool BufferOwned) { Module *M = new Module(Buffer->getBufferIdentifier(), Context); - BitcodeReader *R = new BitcodeReader(Buffer, Context); + BitcodeReader *R = new BitcodeReader(Buffer, Context, BufferOwned); M->setMaterializer(R); - if (error_code EC = R->ParseBitcodeInto(M)) { + if (std::error_code EC = R->ParseBitcodeInto(M)) { + R->releaseBuffer(); // Never take ownership on error. delete M; // Also deletes R. return EC; } - // Have the BitcodeReader dtor delete 'Buffer'. - R->setBufferOwned(true); R->materializeForwardReferencedFunctions(); @@ -3414,29 +3398,24 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name, Module *M = new Module(name, Context); BitcodeReader *R = new BitcodeReader(streamer, Context); M->setMaterializer(R); - if (error_code EC = R->ParseBitcodeInto(M)) { + if (std::error_code EC = R->ParseBitcodeInto(M)) { if (ErrMsg) *ErrMsg = EC.message(); delete M; // Also deletes R. return nullptr; } - R->setBufferOwned(false); // no buffer to delete return M; } ErrorOr llvm::parseBitcodeFile(MemoryBuffer *Buffer, LLVMContext &Context) { - ErrorOr ModuleOrErr = getLazyBitcodeModule(Buffer, Context); + ErrorOr ModuleOrErr = getLazyBitcodeModule(Buffer, Context, false); if (!ModuleOrErr) return ModuleOrErr; Module *M = ModuleOrErr.get(); - // Don't let the BitcodeReader dtor delete 'Buffer', regardless of whether - // there was an error. - static_cast(M->getMaterializer())->setBufferOwned(false); - // Read in the entire module, and destroy the BitcodeReader. - if (error_code EC = M->materializeAllPermanently()) { + if (std::error_code EC = M->materializeAllPermanently()) { delete M; return EC; } @@ -3450,12 +3429,10 @@ ErrorOr llvm::parseBitcodeFile(MemoryBuffer *Buffer, std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer, LLVMContext& Context, std::string *ErrMsg) { - BitcodeReader *R = new BitcodeReader(Buffer, Context); - // Don't let the BitcodeReader dtor delete 'Buffer'. 
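Aside: the hunks above finish the migration from llvm::error_code to the standard <system_error> facility. The pattern is an application-specific std::error_category singleton plus plain std::error_code values, with a default-constructed std::error_code meaning success (the LLVM_NOEXCEPT macro in the patch expands to noexcept where the compiler supports it). A stand-alone sketch of that idiom with illustrative names, not LLVM's:

#include <string>
#include <system_error>

enum class DemoBitcodeError { InvalidRecord = 1, CouldNotFindFunction };

namespace {
class DemoErrorCategory : public std::error_category {
  const char *name() const noexcept override { return "demo.bitcode"; }
  std::string message(int IE) const override {
    switch (static_cast<DemoBitcodeError>(IE)) {
    case DemoBitcodeError::InvalidRecord:        return "Invalid record";
    case DemoBitcodeError::CouldNotFindFunction: return "Could not find function";
    }
    return "Unknown error";
  }
};
}

static const std::error_category &demoCategory() {
  static DemoErrorCategory O; // singleton, like BitcodeErrorCategory()
  return O;
}

static std::error_code demoError(DemoBitcodeError E) {
  return std::error_code(static_cast<int>(E), demoCategory());
}

// Callers test and propagate exactly as in the patch:
//   if (std::error_code EC = parseSomething()) return EC;
// and success is spelled: return std::error_code();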
- R->setBufferOwned(false); + BitcodeReader *R = new BitcodeReader(Buffer, Context, /*BufferOwned*/ false); std::string Triple(""); - if (error_code EC = R->ParseTriple(Triple)) + if (std::error_code EC = R->ParseTriple(Triple)) if (ErrMsg) *ErrMsg = EC.message(); diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h index 593d8f93ccf3..6aa3e0e5adf7 100644 --- a/lib/Bitcode/Reader/BitcodeReader.h +++ b/lib/Bitcode/Reader/BitcodeReader.h @@ -22,7 +22,7 @@ #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Support/system_error.h" +#include #include namespace llvm { @@ -193,7 +193,7 @@ class BitcodeReader : public GVMaterializer { /// not need this flag. bool UseRelativeIDs; - static const error_category &BitcodeErrorCategory(); + static const std::error_category &BitcodeErrorCategory(); public: enum ErrorType { @@ -219,47 +219,43 @@ class BitcodeReader : public GVMaterializer { InvalidValue // Invalid version, inst number, attr number, etc }; - error_code Error(ErrorType E) { - return error_code(E, BitcodeErrorCategory()); + std::error_code Error(ErrorType E) { + return std::error_code(E, BitcodeErrorCategory()); } - explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C) - : Context(C), TheModule(nullptr), Buffer(buffer), BufferOwned(false), - LazyStreamer(nullptr), NextUnreadBit(0), SeenValueSymbolTable(false), - ValueList(C), MDValueList(C), - SeenFirstFunctionBody(false), UseRelativeIDs(false) { - } + explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C, bool BufferOwned) + : Context(C), TheModule(nullptr), Buffer(buffer), + BufferOwned(BufferOwned), LazyStreamer(nullptr), NextUnreadBit(0), + SeenValueSymbolTable(false), ValueList(C), MDValueList(C), + SeenFirstFunctionBody(false), UseRelativeIDs(false) {} explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C) - : Context(C), TheModule(nullptr), Buffer(nullptr), BufferOwned(false), - LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false), - ValueList(C), MDValueList(C), - SeenFirstFunctionBody(false), UseRelativeIDs(false) { - } - ~BitcodeReader() { - FreeState(); - } + : Context(C), TheModule(nullptr), Buffer(nullptr), BufferOwned(false), + LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false), + ValueList(C), MDValueList(C), SeenFirstFunctionBody(false), + UseRelativeIDs(false) {} + ~BitcodeReader() { FreeState(); } void materializeForwardReferencedFunctions(); void FreeState(); - /// setBufferOwned - If this is true, the reader will destroy the MemoryBuffer - /// when the reader is destroyed. - void setBufferOwned(bool Owned) { BufferOwned = Owned; } + void releaseBuffer() { + Buffer = nullptr; + } bool isMaterializable(const GlobalValue *GV) const override; bool isDematerializable(const GlobalValue *GV) const override; - error_code Materialize(GlobalValue *GV) override; - error_code MaterializeModule(Module *M) override; + std::error_code Materialize(GlobalValue *GV) override; + std::error_code MaterializeModule(Module *M) override; void Dematerialize(GlobalValue *GV) override; /// @brief Main interface to parsing a bitcode buffer. /// @returns true if an error occurred. - error_code ParseBitcodeInto(Module *M); + std::error_code ParseBitcodeInto(Module *M); /// @brief Cheap mechanism to just extract module triple /// @returns true if an error occurred. 
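Aside: the constructor and releaseBuffer() changes above replace the old setBufferOwned() flag-flipping. Ownership of the MemoryBuffer is now fixed at construction, and on a parse error the caller calls releaseBuffer() so the reader's destructor cannot free a buffer the caller still owns. A simplified sketch of that protocol with stand-in types (not LLVM's):

struct DemoBuffer {}; // stands in for MemoryBuffer

class DemoReader {
  DemoBuffer *Buffer;
  bool BufferOwned;

public:
  DemoReader(DemoBuffer *B, bool Owned) : Buffer(B), BufferOwned(Owned) {}
  ~DemoReader() {
    if (BufferOwned)
      delete Buffer; // no-op if releaseBuffer() was called
  }
  // Caller reclaims ownership; after this the destructor frees nothing.
  void releaseBuffer() { Buffer = nullptr; }
  bool parse() { return false; } // pretend the stream is malformed
};

static bool loadWithOwningReader(DemoBuffer *Buf) {
  DemoReader *R = new DemoReader(Buf, /*BufferOwned=*/true);
  if (!R->parse()) {
    R->releaseBuffer(); // never take ownership on error
    delete R;           // safe: the reader no longer points at Buf
    delete Buf;         // the buffer goes back to being the caller's problem
    return false;
  }
  delete R; // success path: the owning reader frees Buf
  return true;
}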
- error_code ParseTriple(std::string &Triple); + std::error_code ParseTriple(std::string &Triple); static uint64_t decodeSignRotatedValue(uint64_t V); @@ -346,28 +342,29 @@ class BitcodeReader : public GVMaterializer { return getFnValueByID(ValNo, Ty); } - error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind); - error_code ParseModule(bool Resume); - error_code ParseAttributeBlock(); - error_code ParseAttributeGroupBlock(); - error_code ParseTypeTable(); - error_code ParseTypeTableBody(); - - error_code ParseValueSymbolTable(); - error_code ParseConstants(); - error_code RememberAndSkipFunctionBody(); - error_code ParseFunctionBody(Function *F); - error_code GlobalCleanup(); - error_code ResolveGlobalAndAliasInits(); - error_code ParseMetadata(); - error_code ParseMetadataAttachment(); - error_code ParseModuleTriple(std::string &Triple); - error_code ParseUseLists(); - error_code InitStream(); - error_code InitStreamFromBuffer(); - error_code InitLazyStream(); - error_code FindFunctionInStream(Function *F, - DenseMap::iterator DeferredFunctionInfoIterator); + std::error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind); + std::error_code ParseModule(bool Resume); + std::error_code ParseAttributeBlock(); + std::error_code ParseAttributeGroupBlock(); + std::error_code ParseTypeTable(); + std::error_code ParseTypeTableBody(); + + std::error_code ParseValueSymbolTable(); + std::error_code ParseConstants(); + std::error_code RememberAndSkipFunctionBody(); + std::error_code ParseFunctionBody(Function *F); + std::error_code GlobalCleanup(); + std::error_code ResolveGlobalAndAliasInits(); + std::error_code ParseMetadata(); + std::error_code ParseMetadataAttachment(); + std::error_code ParseModuleTriple(std::string &Triple); + std::error_code ParseUseLists(); + std::error_code InitStream(); + std::error_code InitStreamFromBuffer(); + std::error_code InitLazyStream(); + std::error_code FindFunctionInStream( + Function *F, + DenseMap::iterator DeferredFunctionInfoIterator); }; } // End llvm namespace diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp index f31e1fa9b173..72451ec9500c 100644 --- a/lib/Bitcode/Reader/BitstreamReader.cpp +++ b/lib/Bitcode/Reader/BitstreamReader.cpp @@ -97,7 +97,7 @@ void BitstreamCursor::readAbbreviatedField(const BitCodeAbbrevOp &Op, switch (Op.getEncoding()) { case BitCodeAbbrevOp::Array: case BitCodeAbbrevOp::Blob: - assert(0 && "Should not reach here"); + llvm_unreachable("Should not reach here"); case BitCodeAbbrevOp::Fixed: Vals.push_back(Read((unsigned)Op.getEncodingData())); break; @@ -117,7 +117,7 @@ void BitstreamCursor::skipAbbreviatedField(const BitCodeAbbrevOp &Op) { switch (Op.getEncoding()) { case BitCodeAbbrevOp::Array: case BitCodeAbbrevOp::Blob: - assert(0 && "Should not reach here"); + llvm_unreachable("Should not reach here"); case BitCodeAbbrevOp::Fixed: (void)Read((unsigned)Op.getEncodingData()); break; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index db254e6a9413..3ba7358ae5b9 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -177,6 +177,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_INLINE_HINT; case Attribute::InReg: return bitc::ATTR_KIND_IN_REG; + case Attribute::JumpTable: + return bitc::ATTR_KIND_JUMP_TABLE; case Attribute::MinSize: return bitc::ATTR_KIND_MIN_SIZE; case Attribute::Naked: @@ -476,8 +478,8 @@ static void WriteTypeTable(const 
ValueEnumerator &VE, BitstreamWriter &Stream) { Stream.ExitBlock(); } -static unsigned getEncodedLinkage(const GlobalValue *GV) { - switch (GV->getLinkage()) { +static unsigned getEncodedLinkage(const GlobalValue &GV) { + switch (GV.getLinkage()) { case GlobalValue::ExternalLinkage: return 0; case GlobalValue::WeakAnyLinkage: return 1; case GlobalValue::AppendingLinkage: return 2; @@ -493,8 +495,8 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) { llvm_unreachable("Invalid linkage"); } -static unsigned getEncodedVisibility(const GlobalValue *GV) { - switch (GV->getVisibility()) { +static unsigned getEncodedVisibility(const GlobalValue &GV) { + switch (GV.getVisibility()) { case GlobalValue::DefaultVisibility: return 0; case GlobalValue::HiddenVisibility: return 1; case GlobalValue::ProtectedVisibility: return 2; @@ -502,8 +504,8 @@ static unsigned getEncodedVisibility(const GlobalValue *GV) { llvm_unreachable("Invalid visibility"); } -static unsigned getEncodedDLLStorageClass(const GlobalValue *GV) { - switch (GV->getDLLStorageClass()) { +static unsigned getEncodedDLLStorageClass(const GlobalValue &GV) { + switch (GV.getDLLStorageClass()) { case GlobalValue::DefaultStorageClass: return 0; case GlobalValue::DLLImportStorageClass: return 1; case GlobalValue::DLLExportStorageClass: return 2; @@ -511,8 +513,8 @@ static unsigned getEncodedDLLStorageClass(const GlobalValue *GV) { llvm_unreachable("Invalid DLL storage class"); } -static unsigned getEncodedThreadLocalMode(const GlobalVariable *GV) { - switch (GV->getThreadLocalMode()) { +static unsigned getEncodedThreadLocalMode(const GlobalValue &GV) { + switch (GV.getThreadLocalMode()) { case GlobalVariable::NotThreadLocal: return 0; case GlobalVariable::GeneralDynamicTLSModel: return 1; case GlobalVariable::LocalDynamicTLSModel: return 2; @@ -543,36 +545,35 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, std::map GCMap; unsigned MaxAlignment = 0; unsigned MaxGlobalType = 0; - for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end(); - GV != E; ++GV) { - MaxAlignment = std::max(MaxAlignment, GV->getAlignment()); - MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV->getType())); - if (GV->hasSection()) { + for (const GlobalValue &GV : M->globals()) { + MaxAlignment = std::max(MaxAlignment, GV.getAlignment()); + MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getType())); + if (GV.hasSection()) { // Give section names unique ID's. - unsigned &Entry = SectionMap[GV->getSection()]; + unsigned &Entry = SectionMap[GV.getSection()]; if (!Entry) { - WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV->getSection(), + WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV.getSection(), 0/*TODO*/, Stream); Entry = SectionMap.size(); } } } - for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) { - MaxAlignment = std::max(MaxAlignment, F->getAlignment()); - if (F->hasSection()) { + for (const Function &F : *M) { + MaxAlignment = std::max(MaxAlignment, F.getAlignment()); + if (F.hasSection()) { // Give section names unique ID's. - unsigned &Entry = SectionMap[F->getSection()]; + unsigned &Entry = SectionMap[F.getSection()]; if (!Entry) { - WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F->getSection(), + WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F.getSection(), 0/*TODO*/, Stream); Entry = SectionMap.size(); } } - if (F->hasGC()) { + if (F.hasGC()) { // Same for GC names. 
- unsigned &Entry = GCMap[F->getGC()]; + unsigned &Entry = GCMap[F.getGC()]; if (!Entry) { - WriteStringRecord(bitc::MODULE_CODE_GCNAME, F->getGC(), + WriteStringRecord(bitc::MODULE_CODE_GCNAME, F.getGC(), 0/*TODO*/, Stream); Entry = GCMap.size(); } @@ -608,28 +609,27 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, // Emit the global variable information. SmallVector Vals; - for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end(); - GV != E; ++GV) { + for (const GlobalVariable &GV : M->globals()) { unsigned AbbrevToUse = 0; // GLOBALVAR: [type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, externally_initialized, dllstorageclass] - Vals.push_back(VE.getTypeID(GV->getType())); - Vals.push_back(GV->isConstant()); - Vals.push_back(GV->isDeclaration() ? 0 : - (VE.getValueID(GV->getInitializer()) + 1)); + Vals.push_back(VE.getTypeID(GV.getType())); + Vals.push_back(GV.isConstant()); + Vals.push_back(GV.isDeclaration() ? 0 : + (VE.getValueID(GV.getInitializer()) + 1)); Vals.push_back(getEncodedLinkage(GV)); - Vals.push_back(Log2_32(GV->getAlignment())+1); - Vals.push_back(GV->hasSection() ? SectionMap[GV->getSection()] : 0); - if (GV->isThreadLocal() || - GV->getVisibility() != GlobalValue::DefaultVisibility || - GV->hasUnnamedAddr() || GV->isExternallyInitialized() || - GV->getDLLStorageClass() != GlobalValue::DefaultStorageClass) { + Vals.push_back(Log2_32(GV.getAlignment())+1); + Vals.push_back(GV.hasSection() ? SectionMap[GV.getSection()] : 0); + if (GV.isThreadLocal() || + GV.getVisibility() != GlobalValue::DefaultVisibility || + GV.hasUnnamedAddr() || GV.isExternallyInitialized() || + GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); - Vals.push_back(GV->hasUnnamedAddr()); - Vals.push_back(GV->isExternallyInitialized()); + Vals.push_back(GV.hasUnnamedAddr()); + Vals.push_back(GV.isExternallyInitialized()); Vals.push_back(getEncodedDLLStorageClass(GV)); } else { AbbrevToUse = SimpleGVarAbbrev; @@ -640,20 +640,20 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, } // Emit the function proto information. - for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) { + for (const Function &F : *M) { // FUNCTION: [type, callingconv, isproto, linkage, paramattrs, alignment, // section, visibility, gc, unnamed_addr, prefix] - Vals.push_back(VE.getTypeID(F->getType())); - Vals.push_back(F->getCallingConv()); - Vals.push_back(F->isDeclaration()); + Vals.push_back(VE.getTypeID(F.getType())); + Vals.push_back(F.getCallingConv()); + Vals.push_back(F.isDeclaration()); Vals.push_back(getEncodedLinkage(F)); - Vals.push_back(VE.getAttributeID(F->getAttributes())); - Vals.push_back(Log2_32(F->getAlignment())+1); - Vals.push_back(F->hasSection() ? SectionMap[F->getSection()] : 0); + Vals.push_back(VE.getAttributeID(F.getAttributes())); + Vals.push_back(Log2_32(F.getAlignment())+1); + Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0); Vals.push_back(getEncodedVisibility(F)); - Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0); - Vals.push_back(F->hasUnnamedAddr()); - Vals.push_back(F->hasPrefixData() ? (VE.getValueID(F->getPrefixData()) + 1) + Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0); + Vals.push_back(F.hasUnnamedAddr()); + Vals.push_back(F.hasPrefixData() ? 
(VE.getValueID(F.getPrefixData()) + 1) : 0); Vals.push_back(getEncodedDLLStorageClass(F)); @@ -663,14 +663,15 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, } // Emit the alias information. - for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end(); - AI != E; ++AI) { + for (const GlobalAlias &A : M->aliases()) { // ALIAS: [alias type, aliasee val#, linkage, visibility] - Vals.push_back(VE.getTypeID(AI->getType())); - Vals.push_back(VE.getValueID(AI->getAliasee())); - Vals.push_back(getEncodedLinkage(AI)); - Vals.push_back(getEncodedVisibility(AI)); - Vals.push_back(getEncodedDLLStorageClass(AI)); + Vals.push_back(VE.getTypeID(A.getType())); + Vals.push_back(VE.getValueID(A.getAliasee())); + Vals.push_back(getEncodedLinkage(A)); + Vals.push_back(getEncodedVisibility(A)); + Vals.push_back(getEncodedDLLStorageClass(A)); + Vals.push_back(getEncodedThreadLocalMode(A)); + Vals.push_back(A.hasUnnamedAddr()); unsigned AbbrevToUse = 0; Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse); Vals.clear(); @@ -1448,6 +1449,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, cast(I).getSynchScope())); Vals.push_back(GetEncodedOrdering( cast(I).getFailureOrdering())); + Vals.push_back(cast(I).isWeak()); break; case Instruction::AtomicRMW: Code = bitc::FUNC_CODE_INST_ATOMICRMW; diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 8531e76be1fe..befe15bb4587 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -73,37 +73,34 @@ ValueEnumerator::ValueEnumerator(const Module *M) { SmallVector, 8> MDs; // Enumerate types used by function bodies and argument lists. - for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) { - - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - EnumerateType(I->getType()); - - for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;++I){ - for (User::const_op_iterator OI = I->op_begin(), E = I->op_end(); - OI != E; ++OI) { - if (MDNode *MD = dyn_cast(*OI)) + for (const Function &F : *M) { + for (const Argument &A : F.args()) + EnumerateType(A.getType()); + + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) { + for (const Use &Op : I.operands()) { + if (MDNode *MD = dyn_cast(&Op)) if (MD->isFunctionLocal() && MD->getFunction()) // These will get enumerated during function-incorporation. continue; - EnumerateOperandType(*OI); + EnumerateOperandType(Op); } - EnumerateType(I->getType()); - if (const CallInst *CI = dyn_cast(I)) + EnumerateType(I.getType()); + if (const CallInst *CI = dyn_cast(&I)) EnumerateAttributes(CI->getAttributes()); - else if (const InvokeInst *II = dyn_cast(I)) + else if (const InvokeInst *II = dyn_cast(&I)) EnumerateAttributes(II->getAttributes()); // Enumerate metadata attached with this instruction. 
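Aside: a recurring mechanical change in the BitcodeWriter and ValueEnumerator hunks above is swapping explicit const_iterator loops for range-based for over Module::globals(), the Module's functions, aliases(), blocks, instructions and operands, with helpers such as getEncodedLinkage taking const references instead of pointers. A small sketch of that iteration style against the IR API (the counting logic is illustrative only):

#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Mirrors the shape of the WriteModuleInfo loops without emitting anything:
// visit globals and functions by reference and inspect them directly.
static unsigned countSectionedValues(const Module &M) {
  unsigned N = 0;
  for (const GlobalVariable &GV : M.globals())
    if (GV.hasSection())
      ++N;
  for (const Function &F : M)
    if (F.hasSection())
      ++N;
  return N;
}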
MDs.clear(); - I->getAllMetadataOtherThanDebugLoc(MDs); + I.getAllMetadataOtherThanDebugLoc(MDs); for (unsigned i = 0, e = MDs.size(); i != e; ++i) EnumerateMetadata(MDs[i].second); - if (!I->getDebugLoc().isUnknown()) { + if (!I.getDebugLoc().isUnknown()) { MDNode *Scope, *IA; - I->getDebugLoc().getScopeAndInlinedAt(Scope, IA, I->getContext()); + I.getDebugLoc().getScopeAndInlinedAt(Scope, IA, I.getContext()); if (Scope) EnumerateMetadata(Scope); if (IA) EnumerateMetadata(IA); } diff --git a/lib/Bitcode/module.modulemap b/lib/Bitcode/module.modulemap new file mode 100644 index 000000000000..7df1a0a3c721 --- /dev/null +++ b/lib/Bitcode/module.modulemap @@ -0,0 +1 @@ +module Bitcode { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 6fc83a26a0d2..a3e65284e803 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -7,13 +7,14 @@ // //===----------------------------------------------------------------------===// // -// This file defines several CodeGen-specific LLVM IR analysis utilties. +// This file defines several CodeGen-specific LLVM IR analysis utilities. // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Analysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -474,8 +475,7 @@ static bool nextRealType(SmallVectorImpl &SubTypes, /// between it and the return. /// /// This function only tests target-independent requirements. -bool llvm::isInTailCallPosition(ImmutableCallSite CS, - const TargetLowering &TLI) { +bool llvm::isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG) { const Instruction *I = CS.getInstruction(); const BasicBlock *ExitBB = I->getParent(); const TerminatorInst *Term = ExitBB->getTerminator(); @@ -490,7 +490,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, // longjmp on x86), it can end up causing miscompilation that has not // been fully understood. 
if (!Ret && - (!TLI.getTargetMachine().Options.GuaranteedTailCallOpt || + (!DAG.getTarget().Options.GuaranteedTailCallOpt || !isa(Term))) return false; @@ -509,7 +509,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, return false; } - return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, TLI); + return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, + *DAG.getTarget().getTargetLowering()); } bool llvm::returnTypeIsEligibleForTailCall(const Function *F, diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 1cb0159d10f7..251f5effd6b4 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -37,8 +37,7 @@ using namespace llvm; ARMException::ARMException(AsmPrinter *A) - : DwarfException(A), - shouldEmitCFI(false) {} + : EHStreamer(A), shouldEmitCFI(false) {} ARMException::~ARMException() {} @@ -100,7 +99,7 @@ void ARMException::endFunction(const MachineFunction *) { ATS.emitHandlerData(); // Emit actual exception table - EmitExceptionTable(); + emitExceptionTable(); } } @@ -108,7 +107,7 @@ void ARMException::endFunction(const MachineFunction *) { ATS.emitFnEnd(); } -void ARMException::EmitTypeInfos(unsigned TTypeEncoding) { +void ARMException::emitTypeInfos(unsigned TTypeEncoding) { const std::vector &TypeInfos = MMI->getTypeInfos(); const std::vector &FilterIds = MMI->getFilterIds(); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7de9c6d616c8..996dc2122f49 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/JumpInstrTableInfo.h" #include "llvm/CodeGen/GCMetadataPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -232,23 +233,23 @@ bool AsmPrinter::doInitialization(Module &M) { } } - DwarfException *DE = nullptr; + EHStreamer *ES = nullptr; switch (MAI->getExceptionHandlingType()) { case ExceptionHandling::None: break; case ExceptionHandling::SjLj: case ExceptionHandling::DwarfCFI: - DE = new DwarfCFIException(this); + ES = new DwarfCFIException(this); break; case ExceptionHandling::ARM: - DE = new ARMException(this); + ES = new ARMException(this); break; case ExceptionHandling::Win64: - DE = new Win64Exception(this); + ES = new Win64Exception(this); break; } - if (DE) - Handlers.push_back(HandlerInfo(DE, EHTimerName, DWARFGroupName)); + if (ES) + Handlers.push_back(HandlerInfo(ES, EHTimerName, DWARFGroupName)); return false; } @@ -870,6 +871,8 @@ void AsmPrinter::EmitFunctionBody() { OutStreamer.AddBlankLine(); } +static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP); + bool AsmPrinter::doFinalization(Module &M) { // Emit global variables. for (const auto &G : M.globals()) @@ -887,6 +890,54 @@ bool AsmPrinter::doFinalization(Module &M) { EmitVisibility(Name, V, false); } + // Get information about jump-instruction tables to print. 
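Aside on the jump-instruction table emission added in the next hunk: each table entry becomes a small, globally visible stub that branches to the real function, and every table is padded out to a power-of-two number of entries with trap instructions. The padding arithmetic is just NextPowerOf2(Count) - Count; a tiny sketch (the helper name is hypothetical):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// llvm::NextPowerOf2 returns the next power of two *strictly greater* than its
// argument, so a table whose entry count is already a power of two is doubled.
static uint64_t jumpTablePaddingEntries(uint64_t Count) {
  return llvm::NextPowerOf2(Count) - Count; // e.g. 5 -> 3, 4 -> 4
}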
+ JumpInstrTableInfo *JITI = getAnalysisIfAvailable(); + + if (JITI && !JITI->getTables().empty()) { + unsigned Arch = Triple(getTargetTriple()).getArch(); + bool IsThumb = (Arch == Triple::thumb || Arch == Triple::thumbeb); + MCInst TrapInst; + TM.getInstrInfo()->getTrap(TrapInst); + for (const auto &KV : JITI->getTables()) { + uint64_t Count = 0; + for (const auto &FunPair : KV.second) { + // Emit the function labels to make this be a function entry point. + MCSymbol *FunSym = + OutContext.GetOrCreateSymbol(FunPair.second->getName()); + OutStreamer.EmitSymbolAttribute(FunSym, MCSA_Global); + // FIXME: JumpTableInstrInfo should store information about the required + // alignment of table entries and the size of the padding instruction. + EmitAlignment(3); + if (IsThumb) + OutStreamer.EmitThumbFunc(FunSym); + if (MAI->hasDotTypeDotSizeDirective()) + OutStreamer.EmitSymbolAttribute(FunSym, MCSA_ELF_TypeFunction); + OutStreamer.EmitLabel(FunSym); + + // Emit the jump instruction to transfer control to the original + // function. + MCInst JumpToFun; + MCSymbol *TargetSymbol = + OutContext.GetOrCreateSymbol(FunPair.first->getName()); + const MCSymbolRefExpr *TargetSymRef = + MCSymbolRefExpr::Create(TargetSymbol, MCSymbolRefExpr::VK_PLT, + OutContext); + TM.getInstrInfo()->getUnconditionalBranch(JumpToFun, TargetSymRef); + OutStreamer.EmitInstruction(JumpToFun, getSubtargetInfo()); + ++Count; + } + + // Emit enough padding instructions to fill up to the next power of two. + // This assumes that the trap instruction takes 8 bytes or fewer. + uint64_t Remaining = NextPowerOf2(Count) - Count; + for (uint64_t C = 0; C < Remaining; ++C) { + EmitAlignment(3); + OutStreamer.EmitInstruction(TrapInst, getSubtargetInfo()); + } + + } + } + // Emit module flags. SmallVector ModuleFlags; M.getModuleFlagsMetadata(ModuleFlags); @@ -932,10 +983,6 @@ bool AsmPrinter::doFinalization(Module &M) { for (const auto &Alias : M.aliases()) { MCSymbol *Name = getSymbol(&Alias); - const GlobalValue *GV = Alias.getAliasee(); - assert(!GV->isDeclaration()); - MCSymbol *Target = getSymbol(GV); - if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective()) OutStreamer.EmitSymbolAttribute(Name, MCSA_Global); else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage()) @@ -947,7 +994,7 @@ bool AsmPrinter::doFinalization(Module &M) { // Emit the directives as assignments aka .set: OutStreamer.EmitAssignment(Name, - MCSymbolRefExpr::Create(Target, OutContext)); + lowerConstant(Alias.getAliasee(), *this)); } } @@ -1248,7 +1295,7 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) { } // Ignore debug and non-emitted data. This handles llvm.compiler.used. - if (GV->getSection() == "llvm.metadata" || + if (StringRef(GV->getSection()) == "llvm.metadata" || GV->hasAvailableExternallyLinkage()) return true; @@ -1350,14 +1397,17 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { for (Structor &S : Structors) { const TargetLoweringObjectFile &Obj = getObjFileLowering(); const MCSymbol *KeySym = nullptr; - const MCSection *KeySec = nullptr; - if (S.ComdatKey) { - KeySym = getSymbol(S.ComdatKey); - KeySec = getObjFileLowering().SectionForGlobal(S.ComdatKey, *Mang, TM); + if (GlobalValue *GV = S.ComdatKey) { + if (GV->hasAvailableExternallyLinkage()) + // If the associated variable is available_externally, some other TU + // will provide its dynamic initializer. + continue; + + KeySym = getSymbol(GV); } const MCSection *OutputSection = - (isCtor ? 
Obj.getStaticCtorSection(S.Priority, KeySym, KeySec) - : Obj.getStaticDtorSection(S.Priority, KeySym, KeySec)); + (isCtor ? Obj.getStaticCtorSection(S.Priority, KeySym) + : Obj.getStaticDtorSection(S.Priority, KeySym)); OutStreamer.SwitchSection(OutputSection); if (OutStreamer.getCurrentSection() != OutStreamer.getPreviousSection()) EmitAlignment(Align); @@ -1817,6 +1867,7 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { SmallString<8> StrVal; CFP->getValueAPF().toString(StrVal); + assert(CFP->getType() != nullptr && "Expecting non-null Type"); CFP->getType()->print(AP.OutStreamer.GetCommentOS()); AP.OutStreamer.GetCommentOS() << ' ' << StrVal << '\n'; } diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index b4ef185b0a5d..f555f212a978 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -10,10 +10,10 @@ add_llvm_library(LLVMAsmPrinter DwarfAccelTable.cpp DwarfCFIException.cpp DwarfDebug.cpp - DwarfException.cpp DwarfFile.cpp DwarfStringPool.cpp DwarfUnit.cpp + EHStreamer.cpp ErlangGCPrinter.cpp OcamlGCPrinter.cpp Win64Exception.cpp diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp index 874b8615b94f..a66d08e501ac 100644 --- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp +++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp @@ -8,140 +8,197 @@ //===----------------------------------------------------------------------===// #include "DbgValueHistoryCalculator.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetRegisterInfo.h" +#include +#include +#include #define DEBUG_TYPE "dwarfdebug" namespace llvm { -// Return true if debug value, encoded by DBG_VALUE instruction, is in a -// defined reg. -static bool isDbgValueInDefinedReg(const MachineInstr *MI) { - assert(MI->isDebugValue() && "Invalid DBG_VALUE machine instruction!"); - return MI->getNumOperands() == 3 && MI->getOperand(0).isReg() && - MI->getOperand(0).getReg() && - (MI->getOperand(1).isImm() || - (MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == 0U)); +// \brief If @MI is a DBG_VALUE with debug value described by a +// defined register, returns the number of this register. +// In the other case, returns 0. +static unsigned isDescribedByReg(const MachineInstr &MI) { + assert(MI.isDebugValue()); + assert(MI.getNumOperands() == 3); + // If location of variable is described using a register (directly or + // indirecltly), this register is always a first operand. + return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0; +} + +void DbgValueHistoryMap::startInstrRange(const MDNode *Var, + const MachineInstr &MI) { + // Instruction range should start with a DBG_VALUE instruction for the + // variable. + assert(MI.isDebugValue() && MI.getDebugVariable() == Var); + auto &Ranges = VarInstrRanges[Var]; + if (!Ranges.empty() && Ranges.back().second == nullptr && + Ranges.back().first->isIdenticalTo(&MI)) { + DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n" + << "\t" << Ranges.back().first << "\t" << MI << "\n"); + return; + } + Ranges.push_back(std::make_pair(&MI, nullptr)); +} + +void DbgValueHistoryMap::endInstrRange(const MDNode *Var, + const MachineInstr &MI) { + auto &Ranges = VarInstrRanges[Var]; + // Verify that the current instruction range is not yet closed. 
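Aside: the rewritten DbgValueHistoryCalculator above replaces flat DBG_VALUE/clobber lists with per-variable instruction ranges. Each range opens at a DBG_VALUE and is closed by the instruction that ends the location's validity; a still-open final range means the location holds to the end of the function. A stand-alone sketch of that bookkeeping with a stand-in instruction type:

#include <cassert>
#include <utility>
#include <vector>

struct DemoInstr {}; // stands in for MachineInstr

// A range is [DBG_VALUE, ending instruction); a null second element marks an
// open range, valid until the next range starts or the function ends.
typedef std::pair<const DemoInstr *, const DemoInstr *> InstrRange;

static void startRange(std::vector<InstrRange> &Ranges,
                       const DemoInstr &DbgValue) {
  Ranges.push_back(InstrRange(&DbgValue, nullptr));
}

static void endRange(std::vector<InstrRange> &Ranges,
                     const DemoInstr &Clobber) {
  assert(!Ranges.empty() && Ranges.back().second == nullptr &&
         "no open range to close");
  Ranges.back().second = &Clobber;
}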
+ assert(!Ranges.empty() && Ranges.back().second == nullptr); + // For now, instruction ranges are not allowed to cross basic block + // boundaries. + assert(Ranges.back().first->getParent() == MI.getParent()); + Ranges.back().second = &MI; +} + +unsigned DbgValueHistoryMap::getRegisterForVar(const MDNode *Var) const { + const auto &I = VarInstrRanges.find(Var); + if (I == VarInstrRanges.end()) + return 0; + const auto &Ranges = I->second; + if (Ranges.empty() || Ranges.back().second != nullptr) + return 0; + return isDescribedByReg(*Ranges.back().first); +} + +namespace { +// Maps physreg numbers to the variables they describe. +typedef std::map> RegDescribedVarsMap; +} + +// \brief Claim that @Var is not described by @RegNo anymore. +static void dropRegDescribedVar(RegDescribedVarsMap &RegVars, + unsigned RegNo, const MDNode *Var) { + const auto &I = RegVars.find(RegNo); + assert(RegNo != 0U && I != RegVars.end()); + auto &VarSet = I->second; + const auto &VarPos = std::find(VarSet.begin(), VarSet.end(), Var); + assert(VarPos != VarSet.end()); + VarSet.erase(VarPos); + // Don't keep empty sets in a map to keep it as small as possible. + if (VarSet.empty()) + RegVars.erase(I); +} + +// \brief Claim that @Var is now described by @RegNo. +static void addRegDescribedVar(RegDescribedVarsMap &RegVars, + unsigned RegNo, const MDNode *Var) { + assert(RegNo != 0U); + auto &VarSet = RegVars[RegNo]; + assert(std::find(VarSet.begin(), VarSet.end(), Var) == VarSet.end()); + VarSet.push_back(Var); +} + +// \brief Terminate the location range for variables described by register +// @RegNo by inserting @ClobberingInstr to their history. +static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo, + DbgValueHistoryMap &HistMap, + const MachineInstr &ClobberingInstr) { + const auto &I = RegVars.find(RegNo); + if (I == RegVars.end()) + return; + // Iterate over all variables described by this register and add this + // instruction to their history, clobbering it. + for (const auto &Var : I->second) + HistMap.endInstrRange(Var, ClobberingInstr); + RegVars.erase(I); +} + +// \brief Collect all registers clobbered by @MI and insert them to @Regs. +static void collectClobberedRegisters(const MachineInstr &MI, + const TargetRegisterInfo *TRI, + std::set &Regs) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.getReg()) + continue; + for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) + Regs.insert(*AI); + } +} + +// \brief Returns the first instruction in @MBB which corresponds to +// the function epilogue, or nullptr if @MBB doesn't contain an epilogue. +static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) { + auto LastMI = MBB.getLastNonDebugInstr(); + if (LastMI == MBB.end() || !LastMI->isReturn()) + return nullptr; + // Assume that epilogue starts with instruction having the same debug location + // as the return instruction. + DebugLoc LastLoc = LastMI->getDebugLoc(); + auto Res = LastMI; + for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI)); I != MBB.rend(); + ++I) { + if (I->getDebugLoc() != LastLoc) + return Res; + Res = std::prev(I.base()); + } + // If all instructions have the same debug location, assume whole MBB is + // an epilogue. + return MBB.begin(); +} + +// \brief Collect registers that are modified in the function body (their +// contents is changed only in the prologue and epilogue). 
+static void collectChangingRegs(const MachineFunction *MF, + const TargetRegisterInfo *TRI, + std::set &Regs) { + for (const auto &MBB : *MF) { + auto FirstEpilogueInst = getFirstEpilogueInst(MBB); + bool IsInEpilogue = false; + for (const auto &MI : MBB) { + IsInEpilogue |= &MI == FirstEpilogueInst; + if (!MI.getFlag(MachineInstr::FrameSetup) && !IsInEpilogue) + collectClobberedRegisters(MI, TRI, Regs); + } + } } void calculateDbgValueHistory(const MachineFunction *MF, const TargetRegisterInfo *TRI, DbgValueHistoryMap &Result) { - // LiveUserVar - Map physreg numbers to the MDNode they contain. - std::vector LiveUserVar(TRI->getNumRegs()); + std::set ChangingRegs; + collectChangingRegs(MF, TRI, ChangingRegs); - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; - ++I) { - bool AtBlockEntry = true; - for (const auto &MI : *I) { - if (MI.isDebugValue()) { - assert(MI.getNumOperands() > 1 && "Invalid machine instruction!"); - - // Keep track of user variables. - const MDNode *Var = MI.getDebugVariable(); - - // Variable is in a register, we need to check for clobbers. - if (isDbgValueInDefinedReg(&MI)) - LiveUserVar[MI.getOperand(0).getReg()] = Var; - - // Check the history of this variable. - SmallVectorImpl &History = Result[Var]; - if (!History.empty()) { - // We have seen this variable before. Try to coalesce DBG_VALUEs. - const MachineInstr *Prev = History.back(); - if (Prev->isDebugValue()) { - // Coalesce identical entries at the end of History. - if (History.size() >= 2 && - Prev->isIdenticalTo(History[History.size() - 2])) { - DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n" - << "\t" << *Prev << "\t" - << *History[History.size() - 2] << "\n"); - History.pop_back(); - } - - // Terminate old register assignments that don't reach MI; - MachineFunction::const_iterator PrevMBB = Prev->getParent(); - if (PrevMBB != I && (!AtBlockEntry || std::next(PrevMBB) != I) && - isDbgValueInDefinedReg(Prev)) { - // Previous register assignment needs to terminate at the end of - // its basic block. - MachineBasicBlock::const_iterator LastMI = - PrevMBB->getLastNonDebugInstr(); - if (LastMI == PrevMBB->end()) { - // Drop DBG_VALUE for empty range. - DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n" - << "\t" << *Prev << "\n"); - History.pop_back(); - } else if (std::next(PrevMBB) != PrevMBB->getParent()->end()) - // Terminate after LastMI. - History.push_back(LastMI); - } - } - } - History.push_back(&MI); - } else { - // Not a DBG_VALUE instruction. - if (!MI.isPosition()) - AtBlockEntry = false; - - // Check if the instruction clobbers any registers with debug vars. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef() || !MO.getReg()) - continue; - for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); - ++AI) { - unsigned Reg = *AI; - const MDNode *Var = LiveUserVar[Reg]; - if (!Var) - continue; - // Reg is now clobbered. - LiveUserVar[Reg] = nullptr; - - // Was MD last defined by a DBG_VALUE referring to Reg? - auto HistI = Result.find(Var); - if (HistI == Result.end()) - continue; - SmallVectorImpl &History = HistI->second; - if (History.empty()) - continue; - const MachineInstr *Prev = History.back(); - // Sanity-check: Register assignments are terminated at the end of - // their block. - if (!Prev->isDebugValue() || Prev->getParent() != MI.getParent()) - continue; - // Is the variable still in Reg? - if (!isDbgValueInDefinedReg(Prev) || - Prev->getOperand(0).getReg() != Reg) - continue; - // Var is clobbered. 
Make sure the next instruction gets a label. - History.push_back(&MI); - } + RegDescribedVarsMap RegVars; + for (const auto &MBB : *MF) { + for (const auto &MI : MBB) { + if (!MI.isDebugValue()) { + // Not a DBG_VALUE instruction. It may clobber registers which describe + // some variables. + std::set MIClobberedRegs; + collectClobberedRegisters(MI, TRI, MIClobberedRegs); + for (unsigned RegNo : MIClobberedRegs) { + if (ChangingRegs.count(RegNo)) + clobberRegisterUses(RegVars, RegNo, Result, MI); } + continue; } - } - } - // Make sure the final register assignments are terminated. - for (auto &I : Result) { - SmallVectorImpl &History = I.second; - if (History.empty()) - continue; + assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!"); + const MDNode *Var = MI.getDebugVariable(); - const MachineInstr *Prev = History.back(); - if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) { - const MachineBasicBlock *PrevMBB = Prev->getParent(); - MachineBasicBlock::const_iterator LastMI = - PrevMBB->getLastNonDebugInstr(); - if (LastMI == PrevMBB->end()) - // Drop DBG_VALUE for empty range. - History.pop_back(); - else if (PrevMBB != &PrevMBB->getParent()->back()) { - // Terminate after LastMI. - History.push_back(LastMI); - } + if (unsigned PrevReg = Result.getRegisterForVar(Var)) + dropRegDescribedVar(RegVars, PrevReg, Var); + + Result.startInstrRange(Var, MI); + + if (unsigned NewReg = isDescribedByReg(MI)) + addRegDescribedVar(RegVars, NewReg, Var); + } + + // Make sure locations for register-described variables are valid only + // until the end of the basic block (unless it's the last basic block, in + // which case let their liveness run off to the end of the function). + if (!MBB.empty() && &MBB != &MF->back()) { + for (unsigned RegNo : ChangingRegs) + clobberRegisterUses(RegVars, RegNo, Result, MBB.back()); } } } diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h index 2945c1522de3..b9177f05950e 100644 --- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h +++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h @@ -20,11 +20,31 @@ class MachineInstr; class MDNode; class TargetRegisterInfo; -// For each user variable, keep a list of DBG_VALUE instructions in order. -// The list can also contain normal instructions that clobber the previous -// DBG_VALUE. The variables are listed in order of appearance. -typedef MapVector> -DbgValueHistoryMap; +// For each user variable, keep a list of instruction ranges where this variable +// is accessible. The variables are listed in order of appearance. +class DbgValueHistoryMap { + // Each instruction range starts with a DBG_VALUE instruction, specifying the + // location of a variable, which is assumed to be valid until the end of the + // range. If end is not specified, location is valid until the start + // instruction of the next instruction range, or until the end of the + // function. + typedef std::pair InstrRange; + typedef SmallVector InstrRanges; + typedef MapVector InstrRangesMap; + InstrRangesMap VarInstrRanges; + +public: + void startInstrRange(const MDNode *Var, const MachineInstr &MI); + void endInstrRange(const MDNode *Var, const MachineInstr &MI); + // Returns register currently describing @Var. If @Var is currently + // unaccessible or is not described by a register, returns 0. 
+ unsigned getRegisterForVar(const MDNode *Var) const; + + bool empty() const { return VarInstrRanges.empty(); } + void clear() { VarInstrRanges.clear(); } + InstrRangesMap::const_iterator begin() const { return VarInstrRanges.begin(); } + InstrRangesMap::const_iterator end() const { return VarInstrRanges.end(); } +}; void calculateDbgValueHistory(const MachineFunction *MF, const TargetRegisterInfo *TRI, diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 30312ac2e281..e2d95272c2c5 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -40,9 +40,8 @@ using namespace llvm; DwarfCFIException::DwarfCFIException(AsmPrinter *A) - : DwarfException(A), - shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false), - moveTypeModule(AsmPrinter::CFI_M_None) {} + : EHStreamer(A), shouldEmitPersonality(false), shouldEmitLSDA(false), + shouldEmitMoves(false), moveTypeModule(AsmPrinter::CFI_M_None) {} DwarfCFIException::~DwarfCFIException() {} @@ -59,7 +58,7 @@ void DwarfCFIException::endModule() { unsigned PerEncoding = TLOF.getPersonalityEncoding(); - if ((PerEncoding & 0x70) != dwarf::DW_EH_PE_pcrel) + if ((PerEncoding & 0x80) != dwarf::DW_EH_PE_indirect) return; // Emit references to all used personality functions @@ -153,5 +152,5 @@ void DwarfCFIException::endFunction(const MachineFunction *) { // Map all labels and get rid of any dead landing pads. MMI->TidyLandingPads(); - EmitExceptionTable(); + emitExceptionTable(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 5688d55fafdc..a88aebd2d22f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -314,19 +314,7 @@ bool DwarfDebug::isSubprogramContext(const MDNode *Context) { // scope then create and insert DIEs for these variables. DIE &DwarfDebug::updateSubprogramScopeDIE(DwarfCompileUnit &SPCU, DISubprogram SP) { - DIE *SPDie = SPCU.getDIE(SP); - - assert(SPDie && "Unable to find subprogram DIE!"); - - // If we're updating an abstract DIE, then we will be adding the children and - // object pointer later on. But what we don't want to do is process the - // concrete DIE twice. - if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) { - assert(SPDie == AbsSPDIE); - // Pick up abstract subprogram DIE. - SPDie = &SPCU.createAndAddDIE(dwarf::DW_TAG_subprogram, SPCU.getUnitDie()); - SPCU.addDIEEntry(*SPDie, dwarf::DW_AT_abstract_origin, *AbsSPDIE); - } + DIE *SPDie = SPCU.getOrCreateSubprogramDIE(SP); attachLowHighPC(SPCU, *SPDie, FunctionBeginSym, FunctionEndSym); @@ -431,14 +419,10 @@ DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit &TheCU, assert(Scope->getScopeNode()); DIScope DS(Scope->getScopeNode()); DISubprogram InlinedSP = getDISubprogram(DS); - DIE *OriginDIE = TheCU.getDIE(InlinedSP); - // FIXME: This should be an assert (or possibly a - // getOrCreateSubprogram(InlinedSP)) otherwise we're just failing to emit - // inlining information. - if (!OriginDIE) { - DEBUG(dbgs() << "Unable to find original DIE for an inlined subprogram."); - return nullptr; - } + // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram + // was inlined from another compile unit. 
+ DIE *OriginDIE = AbstractSPDies[InlinedSP]; + assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); auto ScopeDIE = make_unique(dwarf::DW_TAG_inlined_subroutine); TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE); @@ -464,12 +448,7 @@ static std::unique_ptr constructVariableDIE(DwarfCompileUnit &TheCU, DbgVariable &DV, const LexicalScope &Scope, DIE *&ObjectPointer) { - AbstractOrInlined AOI = AOI_None; - if (Scope.isAbstractScope()) - AOI = AOI_Abstract; - else if (Scope.getInlinedAt()) - AOI = AOI_Inlined; - auto Var = TheCU.constructVariableDIE(DV, AOI); + auto Var = TheCU.constructVariableDIE(DV, Scope.isAbstractScope()); if (DV.isObjectPointer()) ObjectPointer = Var.get(); return Var; @@ -525,16 +504,37 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU, assert(Scope->isAbstractScope()); assert(!Scope->getInlinedAt()); - DISubprogram Sub(Scope->getScopeNode()); + DISubprogram SP(Scope->getScopeNode()); + + ProcessedSPNodes.insert(SP); - if (!ProcessedSPNodes.insert(Sub)) + DIE *&AbsDef = AbstractSPDies[SP]; + if (AbsDef) return; - DIE *ScopeDIE = TheCU.getDIE(Sub); - assert(ScopeDIE); - AbstractSPDies.insert(std::make_pair(Sub, ScopeDIE)); - TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); - createAndAddScopeChildren(TheCU, Scope, *ScopeDIE); + // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram + // was inlined from another compile unit. + DwarfCompileUnit &SPCU = *SPMap[SP]; + DIE *ContextDIE; + + // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with + // the important distinction that the DIDescriptor is not associated with the + // DIE (since the DIDescriptor will be associated with the concrete DIE, if + // any). It could be refactored to some common utility function. + if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { + ContextDIE = &SPCU.getUnitDie(); + SPCU.getOrCreateSubprogramDIE(SPDecl); + } else + ContextDIE = SPCU.getOrCreateContextDIE(resolve(SP.getContext())); + + // Passing null as the associated DIDescriptor because the abstract definition + // shouldn't be found by lookup. + AbsDef = &SPCU.createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, + DIDescriptor()); + SPCU.applySubprogramAttributesToDefinition(SP, *AbsDef); + + SPCU.addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); + createAndAddScopeChildren(SPCU, Scope, *AbsDef); } DIE &DwarfDebug::constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, @@ -682,28 +682,6 @@ DwarfCompileUnit &DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { return NewCU; } -// Construct subprogram DIE. -void DwarfDebug::constructSubprogramDIE(DwarfCompileUnit &TheCU, - const MDNode *N) { - // FIXME: We should only call this routine once, however, during LTO if a - // program is defined in multiple CUs we could end up calling it out of - // beginModule as we walk the CUs. - - DwarfCompileUnit *&CURef = SPMap[N]; - if (CURef) - return; - CURef = &TheCU; - - DISubprogram SP(N); - assert(SP.isSubprogram()); - assert(SP.isDefinition()); - - DIE &SubprogramDie = *TheCU.getOrCreateSubprogramDIE(SP); - - // Expose as a global name. 
- TheCU.addGlobalName(SP.getName(), SubprogramDie, resolve(SP.getContext())); -} - void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N) { DIImportedEntity Module(N); @@ -780,7 +758,7 @@ void DwarfDebug::beginModule() { CU.createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i))); DIArray SPs = CUNode.getSubprograms(); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) - constructSubprogramDIE(CU, SPs.getElement(i)); + SPMap.insert(std::make_pair(SPs.getElement(i), &CU)); DIArray EnumTypes = CUNode.getEnumTypes(); for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i) CU.getOrCreateTypeDIE(EnumTypes.getElement(i)); @@ -805,6 +783,60 @@ void DwarfDebug::beginModule() { SectionMap[Asm->getObjFileLowering().getTextSection()]; } +void DwarfDebug::finishVariableDefinitions() { + for (const auto &Var : ConcreteVariables) { + DIE *VariableDie = Var->getDIE(); + // FIXME: There shouldn't be any variables without DIEs. + if (!VariableDie) + continue; + // FIXME: Consider the time-space tradeoff of just storing the unit pointer + // in the ConcreteVariables list, rather than looking it up again here. + // DIE::getUnit isn't simple - it walks parent pointers, etc. + DwarfCompileUnit *Unit = lookupUnit(VariableDie->getUnit()); + assert(Unit); + DbgVariable *AbsVar = getExistingAbstractVariable(Var->getVariable()); + if (AbsVar && AbsVar->getDIE()) { + Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, + *AbsVar->getDIE()); + } else + Unit->applyVariableAttributes(*Var, *VariableDie); + } +} + +void DwarfDebug::finishSubprogramDefinitions() { + const Module *M = MMI->getModule(); + + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + for (MDNode *N : CU_Nodes->operands()) { + DICompileUnit TheCU(N); + // Construct subprogram DIE and add variables DIEs. + DwarfCompileUnit *SPCU = + static_cast(CUMap.lookup(TheCU)); + DIArray Subprograms = TheCU.getSubprograms(); + for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) { + DISubprogram SP(Subprograms.getElement(i)); + // Perhaps the subprogram is in another CU (such as due to comdat + // folding, etc), in which case ignore it here. + if (SPMap[SP] != SPCU) + continue; + DIE *D = SPCU->getDIE(SP); + if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) { + if (D) + // If this subprogram has an abstract definition, reference that + SPCU->addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); + } else { + if (!D) + // Lazily construct the subprogram if we didn't see either concrete or + // inlined versions during codegen. + D = SPCU->getOrCreateSubprogramDIE(SP); + // And attach the attributes + SPCU->applySubprogramAttributesToDefinition(SP, *D); + } + } + } +} + + // Collect info for variables that were optimized out. void DwarfDebug::collectDeadVariables() { const Module *M = MMI->getModule(); @@ -829,15 +861,17 @@ void DwarfDebug::collectDeadVariables() { if (Variables.getNumElements() == 0) continue; - // FIXME: See the comment in constructSubprogramDIE about duplicate - // subprogram DIEs. 
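Aside: finishSubprogramDefinitions() above settles, per subprogram, whether a DIE should reference an abstract definition or carry the definition attributes itself. A hedged restatement of that decision with stand-in types (ownership and the real DwarfUnit API are elided):

// If an abstract definition exists, the concrete DIE only points at it via
// DW_AT_abstract_origin; otherwise the DIE is created lazily if needed and
// gets the full definition attributes applied directly.
struct DemoDIE {
  const DemoDIE *AbstractOrigin;
  bool HasDefinitionAttrs;
  DemoDIE() : AbstractOrigin(nullptr), HasDefinitionAttrs(false) {}
};

static void finishSubprogram(DemoDIE *&Concrete, const DemoDIE *Abstract) {
  if (Abstract) {
    if (Concrete)
      Concrete->AbstractOrigin = Abstract; // reference, don't re-describe
    return;
  }
  if (!Concrete)
    Concrete = new DemoDIE(); // leaked here for brevity; real DIEs are owned
                              // by their units
  Concrete->HasDefinitionAttrs = true;
}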
- constructSubprogramDIE(*SPCU, SP); - DIE *SPDIE = SPCU->getDIE(SP); + DIE *SPDIE = AbstractSPDies.lookup(SP); + if (!SPDIE) + SPDIE = SPCU->getDIE(SP); + assert(SPDIE); for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) { DIVariable DV(Variables.getElement(vi)); assert(DV.isVariable()); - DbgVariable NewVar(DV, nullptr, this); - SPDIE->addChild(SPCU->constructVariableDIE(NewVar)); + DbgVariable NewVar(DV, this); + auto VariableDie = SPCU->constructVariableDIE(NewVar); + SPCU->applyVariableAttributes(NewVar, *VariableDie); + SPDIE->addChild(std::move(VariableDie)); } } } @@ -845,6 +879,10 @@ void DwarfDebug::collectDeadVariables() { } void DwarfDebug::finalizeModuleInfo() { + finishSubprogramDefinitions(); + + finishVariableDefinitions(); + // Collect info for variables that were optimized out. collectDeadVariables(); @@ -1024,29 +1062,58 @@ void DwarfDebug::endModule() { // clean up. SPMap.clear(); + AbstractVariables.clear(); // Reset these for the next Module if we have one. FirstCU = nullptr; } // Find abstract variable, if any, associated with Var. -DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV, - DebugLoc ScopeLoc) { +DbgVariable *DwarfDebug::getExistingAbstractVariable(const DIVariable &DV, + DIVariable &Cleansed) { LLVMContext &Ctx = DV->getContext(); // More then one inlined variable corresponds to one abstract variable. - DIVariable Var = cleanseInlinedVariable(DV, Ctx); - DbgVariable *AbsDbgVariable = AbstractVariables.lookup(Var); - if (AbsDbgVariable) - return AbsDbgVariable; + // FIXME: This duplication of variables when inlining should probably be + // removed. It's done to allow each DIVariable to describe its location + // because the DebugLoc on the dbg.value/declare isn't accurate. We should + // make it accurate then remove this duplication/cleansing stuff. + Cleansed = cleanseInlinedVariable(DV, Ctx); + auto I = AbstractVariables.find(Cleansed); + if (I != AbstractVariables.end()) + return I->second.get(); + return nullptr; +} - LexicalScope *Scope = LScopes.findAbstractScope(ScopeLoc.getScope(Ctx)); - if (!Scope) - return nullptr; +DbgVariable *DwarfDebug::getExistingAbstractVariable(const DIVariable &DV) { + DIVariable Cleansed; + return getExistingAbstractVariable(DV, Cleansed); +} + +void DwarfDebug::createAbstractVariable(const DIVariable &Var, + LexicalScope *Scope) { + auto AbsDbgVariable = make_unique(Var, this); + addScopeVariable(Scope, AbsDbgVariable.get()); + AbstractVariables[Var] = std::move(AbsDbgVariable); +} + +void DwarfDebug::ensureAbstractVariableIsCreated(const DIVariable &DV, + const MDNode *ScopeNode) { + DIVariable Cleansed = DV; + if (getExistingAbstractVariable(DV, Cleansed)) + return; + + createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(ScopeNode)); +} + +void +DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(const DIVariable &DV, + const MDNode *ScopeNode) { + DIVariable Cleansed = DV; + if (getExistingAbstractVariable(DV, Cleansed)) + return; - AbsDbgVariable = new DbgVariable(Var, nullptr, this); - addScopeVariable(Scope, AbsDbgVariable); - AbstractVariables[Var] = AbsDbgVariable; - return AbsDbgVariable; + if (LexicalScope *Scope = LScopes.findAbstractScope(ScopeNode)) + createAbstractVariable(Cleansed, Scope); } // If Var is a current function argument then add it to CurrentFnArguments list. 
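Aside: the abstract-variable handling above becomes a lookup-or-create table keyed on the "cleansed" variable identity shared by all inlined copies, owned via unique_ptr and cleared once per module rather than per function. A simplified sketch of that pattern, with a string key standing in for the cleansed DIVariable:

#include <map>
#include <memory>
#include <string>
#include <utility>

struct DemoAbstractVar {
  std::string Name;
  explicit DemoAbstractVar(const std::string &N) : Name(N) {}
};

class AbstractVarTable {
  // Owns the abstract entries; concrete variables elsewhere hold raw pointers.
  std::map<std::string, std::unique_ptr<DemoAbstractVar> > Abstract;

public:
  DemoAbstractVar *getExisting(const std::string &Cleansed) {
    auto I = Abstract.find(Cleansed);
    return I == Abstract.end() ? nullptr : I->second.get();
  }

  DemoAbstractVar &ensureCreated(const std::string &Cleansed) {
    if (DemoAbstractVar *V = getExisting(Cleansed))
      return *V; // created at most once per module
    std::unique_ptr<DemoAbstractVar> V(new DemoAbstractVar(Cleansed));
    DemoAbstractVar &Ref = *V;
    Abstract[Cleansed] = std::move(V);
    return Ref;
  }

  void clearPerModule() { Abstract.clear(); } // endModule(), not endFunction()
};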
@@ -1085,11 +1152,11 @@ void DwarfDebug::collectVariableInfoFromMMITable( if (!Scope) continue; - DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VI.Loc); - DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable, this); + ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode()); + ConcreteVariables.push_back(make_unique(DV, this)); + DbgVariable *RegVar = ConcreteVariables.back().get(); RegVar->setFrameIndex(VI.Slot); - if (!addCurrentFnArgument(RegVar, Scope)) - addScopeVariable(Scope, RegVar); + addScopeVariable(Scope, RegVar); } } @@ -1132,12 +1199,10 @@ DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { if (Processed.count(DV)) continue; - // History contains relevant DBG_VALUE instructions for DV and instructions - // clobbering it. - const SmallVectorImpl &History = I.second; - if (History.empty()) + // Instruction ranges, specifying where DV is accessible. + const auto &Ranges = I.second; + if (Ranges.empty()) continue; - const MachineInstr *MInsn = History.front(); LexicalScope *Scope = nullptr; if (DV.getTag() == dwarf::DW_TAG_arg_variable && @@ -1154,20 +1219,16 @@ DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { continue; Processed.insert(DV); + const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); - DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc()); - DbgVariable *RegVar = new DbgVariable(DV, AbsVar, this); - if (!addCurrentFnArgument(RegVar, Scope)) - addScopeVariable(Scope, RegVar); - if (AbsVar) - AbsVar->setMInsn(MInsn); - - // Simplify ranges that are fully coalesced. - if (History.size() <= 1 || - (History.size() == 2 && MInsn->isIdenticalTo(History.back()))) { - RegVar->setMInsn(MInsn); + ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode()); + ConcreteVariables.push_back(make_unique(MInsn, this)); + DbgVariable *RegVar = ConcreteVariables.back().get(); + addScopeVariable(Scope, RegVar); + + // Check if the first DBG_VALUE is valid for the rest of the function. + if (Ranges.size() == 1 && Ranges.front().second == nullptr) continue; - } // Handle multiple DBG_VALUE instructions describing one variable. RegVar->setDotDebugLocOffset(DotDebugLocEntries.size()); @@ -1177,42 +1238,34 @@ DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { LocList.Label = Asm->GetTempSymbol("debug_loc", DotDebugLocEntries.size() - 1); SmallVector &DebugLoc = LocList.List; - for (SmallVectorImpl::const_iterator - HI = History.begin(), - HE = History.end(); - HI != HE; ++HI) { - const MachineInstr *Begin = *HI; + for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { + const MachineInstr *Begin = I->first; + const MachineInstr *End = I->second; assert(Begin->isDebugValue() && "Invalid History entry"); - // Check if DBG_VALUE is truncating a range. + // Check if a variable is unaccessible in this range. if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg() && !Begin->getOperand(0).getReg()) continue; + DEBUG(dbgs() << "DotDebugLoc Pair:\n" << "\t" << *Begin); + if (End != nullptr) + DEBUG(dbgs() << "\t" << *End); + else + DEBUG(dbgs() << "\tNULL\n"); - // Compute the range for a register location. - const MCSymbol *FLabel = getLabelBeforeInsn(Begin); - const MCSymbol *SLabel = nullptr; - - if (HI + 1 == HE) - // If Begin is the last instruction in History then its value is valid - // until the end of the function. 
- SLabel = FunctionEndSym; - else { - const MachineInstr *End = HI[1]; - DEBUG(dbgs() << "DotDebugLoc Pair:\n" - << "\t" << *Begin << "\t" << *End << "\n"); - if (End->isDebugValue()) - SLabel = getLabelBeforeInsn(End); - else { - // End is a normal instruction clobbering the range. - SLabel = getLabelAfterInsn(End); - assert(SLabel && "Forgot label after clobber instruction"); - ++HI; - } - } + const MCSymbol *StartLabel = getLabelBeforeInsn(Begin); + assert(StartLabel && "Forgot label before DBG_VALUE starting a range!"); - // The value is valid until the next DBG_VALUE or clobber. - DebugLocEntry Loc(FLabel, SLabel, getDebugLocValue(Begin), TheCU); + const MCSymbol *EndLabel; + if (End != nullptr) + EndLabel = getLabelAfterInsn(End); + else if (std::next(I) == Ranges.end()) + EndLabel = FunctionEndSym; + else + EndLabel = getLabelBeforeInsn(std::next(I)->first); + assert(EndLabel && "Forgot label after instruction ending a range!"); + + DebugLocEntry Loc(StartLabel, EndLabel, getDebugLocValue(Begin), TheCU); if (DebugLoc.empty() || !DebugLoc.back().Merge(Loc)) DebugLoc.push_back(std::move(Loc)); } @@ -1225,8 +1278,11 @@ DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { assert(DV.isVariable()); if (!Processed.insert(DV)) continue; - if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext())) - addScopeVariable(Scope, new DbgVariable(DV, nullptr, this)); + if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext())) { + ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode()); + ConcreteVariables.push_back(make_unique(DV, this)); + addScopeVariable(Scope, ConcreteVariables.back().get()); + } } } @@ -1340,6 +1396,17 @@ void DwarfDebug::identifyScopeMarkers() { } } +static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { + // First known non-DBG_VALUE and non-frame setup location marks + // the beginning of the function body. + for (const auto &MBB : *MF) + for (const auto &MI : MBB) + if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + !MI.getDebugLoc().isUnknown()) + return MI.getDebugLoc(); + return DebugLoc(); +} + // Gather pre-function debug information. Assumes being called immediately // after the function entry point has been emitted. void DwarfDebug::beginFunction(const MachineFunction *MF) { @@ -1377,34 +1444,13 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { // Assumes in correct section after the entry point. Asm->OutStreamer.EmitLabel(FunctionBeginSym); - // Collect user variables, find the end of the prologue. - for (const auto &MBB : *MF) { - for (const auto &MI : MBB) { - if (MI.isDebugValue()) { - assert(MI.getNumOperands() > 1 && "Invalid machine instruction!"); - // Keep track of user variables in order of appearance. Create the - // empty history for each variable so that the order of keys in - // DbgValues is correct. Actual history will be populated in - // calculateDbgValueHistory() function. - const MDNode *Var = MI.getDebugVariable(); - DbgValues.insert( - std::make_pair(Var, SmallVector())); - } else if (!MI.getFlag(MachineInstr::FrameSetup) && - PrologEndLoc.isUnknown() && !MI.getDebugLoc().isUnknown()) { - // First known non-DBG_VALUE and non-frame setup location marks - // the beginning of the function body. - PrologEndLoc = MI.getDebugLoc(); - } - } - } - // Calculate history for local variables. calculateDbgValueHistory(MF, Asm->TM.getRegisterInfo(), DbgValues); // Request labels for the full history. 
- for (auto &I : DbgValues) { - const SmallVectorImpl &History = I.second; - if (History.empty()) + for (const auto &I : DbgValues) { + const auto &Ranges = I.second; + if (Ranges.empty()) continue; // The first mention of a function argument gets the FunctionBeginSym @@ -1412,13 +1458,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { DIVariable DV(I.first); if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable && getDISubprogram(DV.getContext()).describes(MF->getFunction())) - LabelsBeforeInsn[History.front()] = FunctionBeginSym; + LabelsBeforeInsn[Ranges.front().first] = FunctionBeginSym; - for (const MachineInstr *MI : History) { - if (MI->isDebugValue()) - requestLabelBeforeInsn(MI); - else - requestLabelAfterInsn(MI); + for (const auto &Range : Ranges) { + requestLabelBeforeInsn(Range.first); + if (Range.second) + requestLabelAfterInsn(Range.second); } } @@ -1426,6 +1471,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { PrevLabel = FunctionBeginSym; // Record beginning of function. + PrologEndLoc = findPrologueEndLoc(MF); if (!PrologEndLoc.isUnknown()) { DebugLoc FnStartDL = PrologEndLoc.getFnDebugLoc(MF->getFunction()->getContext()); @@ -1439,6 +1485,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { } void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) { + if (addCurrentFnArgument(Var, LS)) + return; SmallVectorImpl &Vars = ScopeVariables[LS]; DIVariable DV = Var->getVariable(); // Variables with positive arg numbers are parameters. @@ -1516,14 +1564,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { assert(DV && DV.isVariable()); if (!ProcessedVars.insert(DV)) continue; - // Check that DbgVariable for DV wasn't created earlier, when - // findAbstractVariable() was called for inlined instance of DV. - LLVMContext &Ctx = DV->getContext(); - DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx); - if (AbstractVariables.lookup(CleanDV)) - continue; - if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext())) - addScopeVariable(Scope, new DbgVariable(DV, nullptr, this)); + ensureAbstractVariableIsCreated(DV, DV.getContext()); } constructAbstractSubprogramScopeDIE(TheCU, AScope); } @@ -1539,12 +1580,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { PrevCU = &TheCU; // Clear debug info - for (auto &I : ScopeVariables) - DeleteContainerPointers(I.second); + // Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the + // DbgVariables except those that are also in AbstractVariables (since they + // can be used cross-function) ScopeVariables.clear(); - DeleteContainerPointers(CurrentFnArguments); + CurrentFnArguments.clear(); DbgValues.clear(); - AbstractVariables.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); PrevLabel = nullptr; @@ -1585,12 +1626,9 @@ void DwarfDebug::emitSectionLabels() { // Dwarf sections base addresses. 
DwarfInfoSectionSym = emitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info"); - if (useSplitDwarf()) { + if (useSplitDwarf()) DwarfInfoDWOSectionSym = emitSectionSym(Asm, TLOF.getDwarfInfoDWOSection(), "section_info_dwo"); - DwarfTypesDWOSectionSym = - emitSectionSym(Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo"); - } DwarfAbbrevSectionSym = emitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev"); if (useSplitDwarf()) @@ -2354,9 +2392,9 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, bool TopLevelType = TypeUnitsUnderConstruction.empty(); AddrPool.resetUsedFlag(); - auto OwnedUnit = make_unique( - InfoHolder.getUnits().size() + TypeUnitsUnderConstruction.size(), CU, Asm, - this, &InfoHolder, getDwoLineTable(CU)); + auto OwnedUnit = + make_unique(InfoHolder.getUnits().size(), CU, Asm, this, + &InfoHolder, getDwoLineTable(CU)); DwarfTypeUnit &NewTU = *OwnedUnit; DIE &UnitDie = NewTU.getUnitDie(); TU = &NewTU; @@ -2369,14 +2407,15 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, uint64_t Signature = makeTypeSignature(Identifier); NewTU.setTypeSignature(Signature); - if (useSplitDwarf()) - NewTU.initSection(Asm->getObjFileLowering().getDwarfTypesDWOSection(), - DwarfTypesDWOSectionSym); - else { + if (!useSplitDwarf()) CU.applyStmtList(UnitDie); - NewTU.initSection( - Asm->getObjFileLowering().getDwarfTypesSection(Signature)); - } + + // FIXME: Skip using COMDAT groups for type units in the .dwo file once tools + // such as DWP ( http://gcc.gnu.org/wiki/DebugFissionDWP ) can cope with it. + NewTU.initSection( + useSplitDwarf() + ? Asm->getObjFileLowering().getDwarfTypesDWOSection(Signature) + : Asm->getObjFileLowering().getDwarfTypesSection(Signature)); NewTU.setType(NewTU.createTypeDIE(CTy)); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index b2e16074f51d..f2aa80845a05 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -27,6 +27,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/CodeGen/LexicalScopes.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MachineLocation.h" @@ -71,16 +72,21 @@ class DbgVariable { DIVariable Var; // Variable Descriptor. DIE *TheDIE; // Variable DIE. unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries. - DbgVariable *AbsVar; // Corresponding Abstract variable, if any. const MachineInstr *MInsn; // DBG_VALUE instruction of the variable. int FrameIndex; DwarfDebug *DD; public: - // AbsVar may be NULL. - DbgVariable(DIVariable V, DbgVariable *AV, DwarfDebug *DD) - : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), AbsVar(AV), - MInsn(nullptr), FrameIndex(~0), DD(DD) {} + /// Construct a DbgVariable from a DIVariable. + DbgVariable(DIVariable V, DwarfDebug *DD) + : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(nullptr), + FrameIndex(~0), DD(DD) {} + + /// Construct a DbgVariable from a DEBUG_VALUE. + /// AbstractVar may be NULL. + DbgVariable(const MachineInstr *DbgValue, DwarfDebug *DD) + : Var(DbgValue->getDebugVariable()), TheDIE(nullptr), + DotDebugLocOffset(~0U), MInsn(DbgValue), FrameIndex(~0), DD(DD) {} // Accessors. 
DIVariable getVariable() const { return Var; } @@ -89,9 +95,7 @@ class DbgVariable { void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; } unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; } StringRef getName() const { return Var.getName(); } - DbgVariable *getAbstractVariable() const { return AbsVar; } const MachineInstr *getMInsn() const { return MInsn; } - void setMInsn(const MachineInstr *M) { MInsn = M; } int getFrameIndex() const { return FrameIndex; } void setFrameIndex(int FI) { FrameIndex = FI; } // Translate tag to proper Dwarf tag. @@ -199,7 +203,8 @@ class DwarfDebug : public AsmPrinterHandler { ScopeVariablesMap ScopeVariables; // Collection of abstract variables. - DenseMap AbstractVariables; + DenseMap> AbstractVariables; + SmallVector, 64> ConcreteVariables; // Collection of DebugLocEntry. Stored in a linked list so that DIELocLists // can refer to them in spite of insertions into this list. @@ -253,7 +258,6 @@ class DwarfDebug : public AsmPrinterHandler { MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym, *DwarfAddrSectionSym; MCSymbol *FunctionBeginSym, *FunctionEndSym; MCSymbol *DwarfInfoDWOSectionSym, *DwarfAbbrevDWOSectionSym; - MCSymbol *DwarfTypesDWOSectionSym; MCSymbol *DwarfStrDWOSectionSym; MCSymbol *DwarfGnuPubNamesSectionSym, *DwarfGnuPubTypesSectionSym; @@ -335,7 +339,14 @@ class DwarfDebug : public AsmPrinterHandler { } /// \brief Find abstract variable associated with Var. - DbgVariable *findAbstractVariable(DIVariable &Var, DebugLoc Loc); + DbgVariable *getExistingAbstractVariable(const DIVariable &DV, + DIVariable &Cleansed); + DbgVariable *getExistingAbstractVariable(const DIVariable &DV); + void createAbstractVariable(const DIVariable &DV, LexicalScope *Scope); + void ensureAbstractVariableIsCreated(const DIVariable &Var, + const MDNode *Scope); + void ensureAbstractVariableIsCreatedIfScoped(const DIVariable &Var, + const MDNode *Scope); /// \brief Find DIE for the given subprogram and attach appropriate /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global @@ -389,6 +400,10 @@ class DwarfDebug : public AsmPrinterHandler { /// \brief Collect info for variables that were optimized out. void collectDeadVariables(); + void finishVariableDefinitions(); + + void finishSubprogramDefinitions(); + /// \brief Finish off debug information after all functions have been /// processed. void finalizeModuleInfo(); @@ -489,9 +504,6 @@ class DwarfDebug : public AsmPrinterHandler { /// DW_TAG_compile_unit. DwarfCompileUnit &constructDwarfCompileUnit(DICompileUnit DIUnit); - /// \brief Construct subprogram DIE. - void constructSubprogramDIE(DwarfCompileUnit &TheCU, const MDNode *N); - /// \brief Construct imported_module or imported_declaration DIE. 
void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N); diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h index f7924827518e..0440fce506d0 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.h +++ b/lib/CodeGen/AsmPrinter/DwarfException.h @@ -14,138 +14,14 @@ #ifndef LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H #define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H -#include "AsmPrinterHandler.h" -#include "llvm/ADT/DenseMap.h" +#include "EHStreamer.h" #include "llvm/CodeGen/AsmPrinter.h" -#include namespace llvm { - -template class SmallVectorImpl; -struct LandingPadInfo; -class MachineModuleInfo; -class MachineInstr; class MachineFunction; -class MCAsmInfo; -class MCExpr; -class MCSymbol; -class Function; class ARMTargetStreamer; -class AsmPrinter; - -//===----------------------------------------------------------------------===// -/// DwarfException - Emits Dwarf exception handling directives. -/// -class DwarfException : public AsmPrinterHandler { -protected: - /// Asm - Target of Dwarf emission. - AsmPrinter *Asm; - - /// MMI - Collected machine module information. - MachineModuleInfo *MMI; - - /// SharedTypeIds - How many leading type ids two landing pads have in common. - static unsigned SharedTypeIds(const LandingPadInfo *L, - const LandingPadInfo *R); - - /// PadRange - Structure holding a try-range and the associated landing pad. - struct PadRange { - // The index of the landing pad. - unsigned PadIndex; - // The index of the begin and end labels in the landing pad's label lists. - unsigned RangeIndex; - }; - - typedef DenseMap RangeMapType; - - /// ActionEntry - Structure describing an entry in the actions table. - struct ActionEntry { - int ValueForTypeID; // The value to write - may not be equal to the type id. - int NextAction; - unsigned Previous; - }; - - /// CallSiteEntry - Structure describing an entry in the call-site table. - struct CallSiteEntry { - // The 'try-range' is BeginLabel .. EndLabel. - MCSymbol *BeginLabel; // zero indicates the start of the function. - MCSymbol *EndLabel; // zero indicates the end of the function. - - // The landing pad starts at PadLabel. - MCSymbol *PadLabel; // zero indicates that there is no landing pad. - unsigned Action; - }; - - /// ComputeActionsTable - Compute the actions table and gather the first - /// action index for each landing pad site. - unsigned ComputeActionsTable(const SmallVectorImpl&LPs, - SmallVectorImpl &Actions, - SmallVectorImpl &FirstActions); - - /// CallToNoUnwindFunction - Return `true' if this is a call to a function - /// marked `nounwind'. Return `false' otherwise. - bool CallToNoUnwindFunction(const MachineInstr *MI); - - /// ComputeCallSiteTable - Compute the call-site table. The entry for an - /// invoke has a try-range containing the call, a non-zero landing pad and an - /// appropriate action. The entry for an ordinary call has a try-range - /// containing the call and zero for the landing pad and the action. Calls - /// marked 'nounwind' have no entry and must not be contained in the try-range - /// of any entry - they form gaps in the table. Entries must be ordered by - /// try-range address. - void ComputeCallSiteTable(SmallVectorImpl &CallSites, - const RangeMapType &PadMap, - const SmallVectorImpl &LPs, - const SmallVectorImpl &FirstActions); - - /// EmitExceptionTable - Emit landing pads and actions. - /// - /// The general organization of the table is complex, but the basic concepts - /// are easy. 
First there is a header which describes the location and - /// organization of the three components that follow. - /// 1. The landing pad site information describes the range of code covered - /// by the try. In our case it's an accumulation of the ranges covered - /// by the invokes in the try. There is also a reference to the landing - /// pad that handles the exception once processed. Finally an index into - /// the actions table. - /// 2. The action table, in our case, is composed of pairs of type ids - /// and next action offset. Starting with the action index from the - /// landing pad site, each type Id is checked for a match to the current - /// exception. If it matches then the exception and type id are passed - /// on to the landing pad. Otherwise the next action is looked up. This - /// chain is terminated with a next action of zero. If no type id is - /// found the frame is unwound and handling continues. - /// 3. Type id table contains references to all the C++ typeinfo for all - /// catches in the function. This tables is reversed indexed base 1. - void EmitExceptionTable(); - - virtual void EmitTypeInfos(unsigned TTypeEncoding); - -public: - //===--------------------------------------------------------------------===// - // Main entry points. - // - DwarfException(AsmPrinter *A); - virtual ~DwarfException(); - - /// endModule - Emit all exception information that should come after the - /// content. - void endModule() override; - - /// beginFunction - Gather pre-function exception information. Assumes being - /// emitted immediately after the function entry point. - void beginFunction(const MachineFunction *MF) override; - - /// endFunction - Gather and emit post-function exception information. - void endFunction(const MachineFunction *) override; - - // We don't need these. - void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} - void beginInstruction(const MachineInstr *MI) override {} - void endInstruction() override {} -}; -class DwarfCFIException : public DwarfException { +class DwarfCFIException : public EHStreamer { /// shouldEmitPersonality - Per-function flag to indicate if .cfi_personality /// should be emitted. bool shouldEmitPersonality; @@ -179,8 +55,8 @@ class DwarfCFIException : public DwarfException { void endFunction(const MachineFunction *) override; }; -class ARMException : public DwarfException { - void EmitTypeInfos(unsigned TTypeEncoding) override; +class ARMException : public EHStreamer { + void emitTypeInfos(unsigned TTypeEncoding) override; ARMTargetStreamer &getTargetStreamer(); /// shouldEmitCFI - Per-function flag to indicate if frame CFI info @@ -206,7 +82,7 @@ class ARMException : public DwarfException { void endFunction(const MachineFunction *) override; }; -class Win64Exception : public DwarfException { +class Win64Exception : public EHStreamer { /// shouldEmitPersonality - Per-function flag to indicate if personality /// info should be emitted. 
bool shouldEmitPersonality; diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 8adf78ca8546..9538bee7bca8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -749,8 +749,23 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) { DIDerivedType DTy(Ty); if (DTy.isDerivedType()) { - if (DIType Deriv = DD->resolve(DTy.getTypeDerivedFrom())) - return isUnsignedDIType(DD, Deriv); + dwarf::Tag T = (dwarf::Tag)Ty.getTag(); + // Encode pointer constants as unsigned bytes. This is used at least for + // null pointer constant emission. + // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed + // here, but accept them for now due to a bug in SROA producing bogus + // dbg.values. + if (T == dwarf::DW_TAG_pointer_type || + T == dwarf::DW_TAG_ptr_to_member_type || + T == dwarf::DW_TAG_reference_type || + T == dwarf::DW_TAG_rvalue_reference_type) + return true; + assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type || + T == dwarf::DW_TAG_volatile_type || + T == dwarf::DW_TAG_restrict_type || + T == dwarf::DW_TAG_enumeration_type); + if (DITypeRef Deriv = DTy.getTypeDerivedFrom()) + return isUnsignedDIType(DD, DD->resolve(Deriv)); // FIXME: Enums without a fixed underlying type have unknown signedness // here, leading to incorrectly emitted constants. assert(DTy.getTag() == dwarf::DW_TAG_enumeration_type); @@ -1056,6 +1071,8 @@ std::string DwarfUnit::getParentContextString(DIScope Context) const { I != E; ++I) { DIScope Ctx = *I; StringRef Name = Ctx.getName(); + if (Name.empty() && Ctx.isNameSpace()) + Name = "(anonymous namespace)"; if (!Name.empty()) { CS += Name; CS += "::"; @@ -1344,12 +1361,13 @@ DIE *DwarfUnit::getOrCreateNameSpace(DINameSpace NS) { return NDie; DIE &NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS); - if (!NS.getName().empty()) { + StringRef Name = NS.getName(); + if (!Name.empty()) addString(NDie, dwarf::DW_AT_name, NS.getName()); - DD->addAccelNamespace(NS.getName(), NDie); - addGlobalName(NS.getName(), NDie, NS.getContext()); - } else - DD->addAccelNamespace("(anonymous namespace)", NDie); + else + Name = "(anonymous namespace)"; + DD->addAccelNamespace(Name, NDie); + addGlobalName(Name, NDie, NS.getContext()); addSourceLine(NDie, NS); return &NDie; } @@ -1359,51 +1377,67 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) { // Construct the context before querying for the existence of the DIE in case // such construction creates the DIE (as is the case for member function // declarations). - DIScope Context = resolve(SP.getContext()); - DIE *ContextDIE = getOrCreateContextDIE(Context); - - // Unique declarations based on the ODR, where applicable. - SP = DISubprogram(DD->resolve(SP.getRef())); - assert(SP.Verify()); + DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext())); if (DIE *SPDie = getDIE(SP)) return SPDie; - DISubprogram SPDecl = SP.getFunctionDeclaration(); - if (SPDecl.isSubprogram()) + if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { // Add subprogram definitions to the CU die directly. ContextDIE = &getUnitDie(); + // Build the decl now to ensure it precedes the definition. + getOrCreateSubprogramDIE(SPDecl); + } // DW_TAG_inlined_subroutine may refer to this DIE. 
DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP); + // Stop here and fill this in later, depending on whether or not this + // subprogram turns out to have inlined instances or not. + if (SP.isDefinition()) + return &SPDie; + + applySubprogramAttributes(SP, SPDie); + return &SPDie; +} + +void DwarfUnit::applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie) { + DISubprogram SPDecl = SP.getFunctionDeclaration(); + DIScope Context = resolve(SPDecl ? SPDecl.getContext() : SP.getContext()); + applySubprogramAttributes(SP, SPDie); + addGlobalName(SP.getName(), SPDie, Context); +} + +void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) { DIE *DeclDie = nullptr; - if (SPDecl.isSubprogram()) - DeclDie = getOrCreateSubprogramDIE(SPDecl); + StringRef DeclLinkageName; + if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { + DeclDie = getDIE(SPDecl); + assert(DeclDie && "This DIE should've already been constructed when the " + "definition DIE was created in " + "getOrCreateSubprogramDIE"); + DeclLinkageName = SPDecl.getLinkageName(); + } // Add function template parameters. addTemplateParams(SPDie, SP.getTemplateParams()); - if (DeclDie) - // Refer function declaration directly. - addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie); - // Add the linkage name if we have one and it isn't in the Decl. StringRef LinkageName = SP.getLinkageName(); - if (!LinkageName.empty()) { - if (SPDecl.isSubprogram() && !SPDecl.getLinkageName().empty()) - assert(SPDecl.getLinkageName() == SP.getLinkageName() && - "decl has a linkage name and it is different"); - else - addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, - GlobalValue::getRealLinkageName(LinkageName)); + assert(((LinkageName.empty() || DeclLinkageName.empty()) || + LinkageName == DeclLinkageName) && + "decl has a linkage name and it is different"); + if (!LinkageName.empty() && DeclLinkageName.empty()) + addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, + GlobalValue::getRealLinkageName(LinkageName)); + + if (DeclDie) { + // Refer to the function declaration where all the other attributes will be + // found. + addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie); + return; } - // If this DIE is going to refer declaration info using AT_specification - // then there is no need to add other attributes. - if (DeclDie) - return &SPDie; - // Constructors and operators for anonymous aggregates do not have names. if (!SP.getName().empty()) addString(SPDie, dwarf::DW_AT_name, SP.getName()); @@ -1478,8 +1512,17 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) { if (SP.isExplicit()) addFlag(SPDie, dwarf::DW_AT_explicit); +} - return &SPDie; +void DwarfUnit::applyVariableAttributes(const DbgVariable &Var, + DIE &VariableDie) { + StringRef Name = Var.getName(); + if (!Name.empty()) + addString(VariableDie, dwarf::DW_AT_name, Name); + addSourceLine(VariableDie, Var.getVariable()); + addType(VariableDie, Var.getType()); + if (Var.isArtificial()) + addFlag(VariableDie, dwarf::DW_AT_artificial); } // Return const expression if value is a GEP to access merged global @@ -1645,10 +1688,8 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) { DD->addAccelName(GV.getLinkageName(), AddrDIE); } - if (!GV.isLocalToUnit()) - addGlobalName(GV.getName(), - VariableSpecDIE ? *VariableSpecDIE : *VariableDIE, - GV.getContext()); + addGlobalName(GV.getName(), VariableSpecDIE ? 
*VariableSpecDIE : *VariableDIE, + GV.getContext()); } /// constructSubrangeDIE - Construct subrange DIE from DISubrange. @@ -1749,35 +1790,21 @@ void DwarfUnit::constructContainingTypeDIEs() { /// constructVariableDIE - Construct a DIE for the given DbgVariable. std::unique_ptr DwarfUnit::constructVariableDIE(DbgVariable &DV, - AbstractOrInlined AbsIn) { - auto D = constructVariableDIEImpl(DV, AbsIn); + bool Abstract) { + auto D = constructVariableDIEImpl(DV, Abstract); DV.setDIE(*D); return D; } -std::unique_ptr -DwarfUnit::constructVariableDIEImpl(const DbgVariable &DV, - AbstractOrInlined AbsIn) { - StringRef Name = DV.getName(); - +std::unique_ptr DwarfUnit::constructVariableDIEImpl(const DbgVariable &DV, + bool Abstract) { // Define variable debug information entry. auto VariableDie = make_unique(DV.getTag()); - DbgVariable *AbsVar = DV.getAbstractVariable(); - DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : nullptr; - if (AbsDIE) - addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, *AbsDIE); - else { - if (!Name.empty()) - addString(*VariableDie, dwarf::DW_AT_name, Name); - addSourceLine(*VariableDie, DV.getVariable()); - addType(*VariableDie, DV.getType()); - } - if (AbsIn != AOI_Inlined && DV.isArtificial()) - addFlag(*VariableDie, dwarf::DW_AT_artificial); - - if (AbsIn == AOI_Abstract) + if (Abstract) { + applyVariableAttributes(DV, *VariableDie); return VariableDie; + } // Add variable address. diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 78931487986f..b7b83b282d82 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -61,8 +61,6 @@ class RangeSpanList { void addRange(RangeSpan Range) { Ranges.push_back(Range); } }; -enum AbstractOrInlined { AOI_None, AOI_Inlined, AOI_Abstract }; - //===----------------------------------------------------------------------===// /// Unit - This dwarf writer support class manages information associated /// with a source file. @@ -401,6 +399,10 @@ class DwarfUnit { /// getOrCreateSubprogramDIE - Create new DIE using SP. DIE *getOrCreateSubprogramDIE(DISubprogram SP); + void applySubprogramAttributes(DISubprogram SP, DIE &SPDie); + void applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie); + void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie); + /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the /// given DIType. DIE *getOrCreateTypeDIE(const MDNode *N); @@ -417,7 +419,7 @@ class DwarfUnit { /// constructVariableDIE - Construct a DIE for the given DbgVariable. std::unique_ptr constructVariableDIE(DbgVariable &DV, - AbstractOrInlined AbsIn = AOI_None); + bool Abstract = false); /// constructSubprogramArguments - Construct function argument DIEs. void constructSubprogramArguments(DIE &Buffer, DIArray Args); @@ -455,7 +457,7 @@ class DwarfUnit { /// \brief Construct a DIE for the given DbgVariable without initializing the /// DbgVariable's DIE reference. std::unique_ptr constructVariableDIEImpl(const DbgVariable &DV, - AbstractOrInlined AbsIn); + bool Abstract); /// constructTypeDIE - Construct basic type die from DIBasicType. 
void constructTypeDIE(DIE &Buffer, DIBasicType BTy); @@ -577,10 +579,6 @@ class DwarfTypeUnit : public DwarfUnit { sizeof(uint32_t); // Type DIE Offset } void initSection(const MCSection *Section); - // Bring in the base function (taking two args, including the section symbol) - // for use when building DWO type units (they don't go in unique comdat - // sections) - using DwarfUnit::initSection; DwarfCompileUnit &getCU() override { return CU; } protected: diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp similarity index 89% rename from lib/CodeGen/AsmPrinter/DwarfException.cpp rename to lib/CodeGen/AsmPrinter/EHStreamer.cpp index 3a12c7326f3b..73f62bfd804e 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.cpp +++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -1,4 +1,4 @@ -//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===// +//===-- CodeGen/AsmPrinter/EHStreamer.cpp - Exception Directive Streamer --===// // // The LLVM Compiler Infrastructure // @@ -7,45 +7,31 @@ // //===----------------------------------------------------------------------===// // -// This file contains support for writing DWARF exception info into asm files. +// This file contains support for writing exception info into assembly files. // //===----------------------------------------------------------------------===// -#include "DwarfException.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/Twine.h" +#include "EHStreamer.h" #include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Dwarf.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/LEB128.h" -#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" + using namespace llvm; -DwarfException::DwarfException(AsmPrinter *A) - : Asm(A), MMI(Asm->MMI) {} +EHStreamer::EHStreamer(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {} -DwarfException::~DwarfException() {} +EHStreamer::~EHStreamer() {} -/// SharedTypeIds - How many leading type ids two landing pads have in common. -unsigned DwarfException::SharedTypeIds(const LandingPadInfo *L, - const LandingPadInfo *R) { +/// How many leading type ids two landing pads have in common. +unsigned EHStreamer::sharedTypeIDs(const LandingPadInfo *L, + const LandingPadInfo *R) { const std::vector &LIds = L->TypeIds, &RIds = R->TypeIds; unsigned LSize = LIds.size(), RSize = RIds.size(); unsigned MinSize = LSize < RSize ? LSize : RSize; @@ -58,10 +44,10 @@ unsigned DwarfException::SharedTypeIds(const LandingPadInfo *L, return Count; } -/// ComputeActionsTable - Compute the actions table and gather the first action -/// index for each landing pad site. -unsigned DwarfException:: -ComputeActionsTable(const SmallVectorImpl &LandingPads, +/// Compute the actions table and gather the first action index for each landing +/// pad site. 
+unsigned EHStreamer:: +computeActionsTable(const SmallVectorImpl &LandingPads, SmallVectorImpl &Actions, SmallVectorImpl &FirstActions) { @@ -109,7 +95,7 @@ ComputeActionsTable(const SmallVectorImpl &LandingPads, I = LandingPads.begin(), E = LandingPads.end(); I != E; ++I) { const LandingPadInfo *LPI = *I; const std::vector &TypeIds = LPI->TypeIds; - unsigned NumShared = PrevLPI ? SharedTypeIds(LPI, PrevLPI) : 0; + unsigned NumShared = PrevLPI ? sharedTypeIDs(LPI, PrevLPI) : 0; unsigned SizeSiteActions = 0; if (NumShared < TypeIds.size()) { @@ -167,9 +153,9 @@ ComputeActionsTable(const SmallVectorImpl &LandingPads, return SizeActions; } -/// CallToNoUnwindFunction - Return `true' if this is a call to a function -/// marked `nounwind'. Return `false' otherwise. -bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) { +/// Return `true' if this is a call to a function marked `nounwind'. Return +/// `false' otherwise. +bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) { assert(MI->isCall() && "This should be a call instruction!"); bool MarkedNoUnwind = false; @@ -201,15 +187,14 @@ bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) { return MarkedNoUnwind; } -/// ComputeCallSiteTable - Compute the call-site table. The entry for an invoke -/// has a try-range containing the call, a non-zero landing pad, and an -/// appropriate action. The entry for an ordinary call has a try-range -/// containing the call and zero for the landing pad and the action. Calls -/// marked 'nounwind' have no entry and must not be contained in the try-range -/// of any entry - they form gaps in the table. Entries must be ordered by -/// try-range address. -void DwarfException:: -ComputeCallSiteTable(SmallVectorImpl &CallSites, +/// Compute the call-site table. The entry for an invoke has a try-range +/// containing the call, a non-zero landing pad, and an appropriate action. The +/// entry for an ordinary call has a try-range containing the call and zero for +/// the landing pad and the action. Calls marked 'nounwind' have no entry and +/// must not be contained in the try-range of any entry - they form gaps in the +/// table. Entries must be ordered by try-range address. +void EHStreamer:: +computeCallSiteTable(SmallVectorImpl &CallSites, const RangeMapType &PadMap, const SmallVectorImpl &LandingPads, const SmallVectorImpl &FirstActions) { @@ -228,7 +213,7 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, for (const auto &MI : MBB) { if (!MI.isEHLabel()) { if (MI.isCall()) - SawPotentiallyThrowing |= !CallToNoUnwindFunction(&MI); + SawPotentiallyThrowing |= !callToNoUnwindFunction(&MI); continue; } @@ -308,7 +293,7 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, } } -/// EmitExceptionTable - Emit landing pads and actions. +/// Emit landing pads and actions. /// /// The general organization of the table is complex, but the basic concepts are /// easy. First there is a header which describes the location and organization @@ -328,7 +313,7 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, /// unwound and handling continues. /// 3. Type ID table contains references to all the C++ typeinfo for all /// catches in the function. This tables is reverse indexed base 1. 
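The comment above lays out the action table as pairs of a type id and a next-action offset, with a zero offset ending the chain. A small self-contained sketch of that chain walk; the data layout is simplified for illustration (the real table is SLEB128-encoded by the emitter that follows):

#include <cstdio>
#include <vector>

struct ActionEntry {
  int ValueForTypeID;  // value written for the type id
  int NextAction;      // relative offset to the next action; 0 terminates
};

// Walk an action chain starting at First, looking for ThrownTypeID.
// Returns the matching type-id value, or 0 to mean "keep unwinding".
static int walkActionChain(const std::vector<ActionEntry> &Actions,
                           unsigned First, int ThrownTypeID) {
  unsigned Idx = First;
  while (true) {
    if (Actions[Idx].ValueForTypeID == ThrownTypeID)
      return Actions[Idx].ValueForTypeID;  // this landing pad handles it
    if (Actions[Idx].NextAction == 0)
      return 0;                            // end of chain: no handler here
    Idx += Actions[Idx].NextAction;        // follow the chain
  }
}

int main() {
  std::vector<ActionEntry> Actions = {{1, 1}, {2, 1}, {3, 0}};
  std::printf("%d\n", walkActionChain(Actions, 0, 3));  // prints 3
  std::printf("%d\n", walkActionChain(Actions, 0, 9));  // prints 0
  return 0;
}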
-void DwarfException::EmitExceptionTable() { +void EHStreamer::emitExceptionTable() { const std::vector &TypeInfos = MMI->getTypeInfos(); const std::vector &FilterIds = MMI->getFilterIds(); const std::vector &PadInfos = MMI->getLandingPads(); @@ -350,7 +335,8 @@ void DwarfException::EmitExceptionTable() { // landing pad site. SmallVector Actions; SmallVector FirstActions; - unsigned SizeActions=ComputeActionsTable(LandingPads, Actions, FirstActions); + unsigned SizeActions = + computeActionsTable(LandingPads, Actions, FirstActions); // Invokes and nounwind calls have entries in PadMap (due to being bracketed // by try-range labels when lowered). Ordinary calls do not, so appropriate @@ -368,7 +354,7 @@ void DwarfException::EmitExceptionTable() { // Compute the call-site table. SmallVector CallSites; - ComputeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions); + computeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions); // Final tallies. @@ -657,12 +643,12 @@ void DwarfException::EmitExceptionTable() { Asm->EmitSLEB128(Action.NextAction); } - EmitTypeInfos(TTypeEncoding); + emitTypeInfos(TTypeEncoding); Asm->EmitAlignment(2); } -void DwarfException::EmitTypeInfos(unsigned TTypeEncoding) { +void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) { const std::vector &TypeInfos = MMI->getTypeInfos(); const std::vector &FilterIds = MMI->getFilterIds(); @@ -703,19 +689,18 @@ void DwarfException::EmitTypeInfos(unsigned TTypeEncoding) { } } -/// endModule - Emit all exception information that should come after the -/// content. -void DwarfException::endModule() { +/// Emit all exception information that should come after the content. +void EHStreamer::endModule() { llvm_unreachable("Should be implemented"); } -/// beginFunction - Gather pre-function exception information. Assumes it's -/// being emitted immediately after the function entry point. -void DwarfException::beginFunction(const MachineFunction *MF) { +/// Gather pre-function exception information. Assumes it's being emitted +/// immediately after the function entry point. +void EHStreamer::beginFunction(const MachineFunction *MF) { llvm_unreachable("Should be implemented"); } -/// endFunction - Gather and emit post-function exception information. -void DwarfException::endFunction(const MachineFunction *) { +/// Gather and emit post-function exception information. +void EHStreamer::endFunction(const MachineFunction *) { llvm_unreachable("Should be implemented"); } diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h new file mode 100644 index 000000000000..2b6ba788e6fa --- /dev/null +++ b/lib/CodeGen/AsmPrinter/EHStreamer.h @@ -0,0 +1,138 @@ +//===-- EHStreamer.h - Exception Handling Directive Streamer ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for writing exception info into assembly files. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H +#define LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H + +#include "AsmPrinterHandler.h" +#include "llvm/ADT/DenseMap.h" + +namespace llvm { +struct LandingPadInfo; +class MachineModuleInfo; +class MachineInstr; +class MachineFunction; +class AsmPrinter; + +template +class SmallVectorImpl; + +/// Emits exception handling directives. +class EHStreamer : public AsmPrinterHandler { +protected: + /// Target of directive emission. + AsmPrinter *Asm; + + /// Collected machine module information. + MachineModuleInfo *MMI; + + /// How many leading type ids two landing pads have in common. + static unsigned sharedTypeIDs(const LandingPadInfo *L, + const LandingPadInfo *R); + + /// Structure holding a try-range and the associated landing pad. + struct PadRange { + // The index of the landing pad. + unsigned PadIndex; + // The index of the begin and end labels in the landing pad's label lists. + unsigned RangeIndex; + }; + + typedef DenseMap RangeMapType; + + /// Structure describing an entry in the actions table. + struct ActionEntry { + int ValueForTypeID; // The value to write - may not be equal to the type id. + int NextAction; + unsigned Previous; + }; + + /// Structure describing an entry in the call-site table. + struct CallSiteEntry { + // The 'try-range' is BeginLabel .. EndLabel. + MCSymbol *BeginLabel; // zero indicates the start of the function. + MCSymbol *EndLabel; // zero indicates the end of the function. + + // The landing pad starts at PadLabel. + MCSymbol *PadLabel; // zero indicates that there is no landing pad. + unsigned Action; + }; + + /// Compute the actions table and gather the first action index for each + /// landing pad site. + unsigned computeActionsTable(const SmallVectorImpl&LPs, + SmallVectorImpl &Actions, + SmallVectorImpl &FirstActions); + + /// Return `true' if this is a call to a function marked `nounwind'. Return + /// `false' otherwise. + bool callToNoUnwindFunction(const MachineInstr *MI); + + /// Compute the call-site table. The entry for an invoke has a try-range + /// containing the call, a non-zero landing pad and an appropriate action. + /// The entry for an ordinary call has a try-range containing the call and + /// zero for the landing pad and the action. Calls marked 'nounwind' have + /// no entry and must not be contained in the try-range of any entry - they + /// form gaps in the table. Entries must be ordered by try-range address. + + void computeCallSiteTable(SmallVectorImpl &CallSites, + const RangeMapType &PadMap, + const SmallVectorImpl &LPs, + const SmallVectorImpl &FirstActions); + + /// Emit landing pads and actions. + /// + /// The general organization of the table is complex, but the basic concepts + /// are easy. First there is a header which describes the location and + /// organization of the three components that follow. + /// 1. The landing pad site information describes the range of code covered + /// by the try. In our case it's an accumulation of the ranges covered + /// by the invokes in the try. There is also a reference to the landing + /// pad that handles the exception once processed. Finally an index into + /// the actions table. + /// 2. The action table, in our case, is composed of pairs of type ids + /// and next action offset. Starting with the action index from the + /// landing pad site, each type Id is checked for a match to the current + /// exception. 
If it matches then the exception and type id are passed + /// on to the landing pad. Otherwise the next action is looked up. This + /// chain is terminated with a next action of zero. If no type id is + /// found the frame is unwound and handling continues. + /// 3. Type id table contains references to all the C++ typeinfo for all + /// catches in the function. This tables is reversed indexed base 1. + void emitExceptionTable(); + + virtual void emitTypeInfos(unsigned TTypeEncoding); + +public: + EHStreamer(AsmPrinter *A); + virtual ~EHStreamer(); + + /// Emit all exception information that should come after the content. + void endModule() override; + + /// Gather pre-function exception information. Assumes being emitted + /// immediately after the function entry point. + void beginFunction(const MachineFunction *MF) override; + + /// Gather and emit post-function exception information. + void endFunction(const MachineFunction *) override; + + // Unused. + void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} + void beginInstruction(const MachineInstr *MI) override {} + void endInstruction() override {} +}; +} + +#endif + diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp index 17d8bff6092d..4768a43e9a6f 100644 --- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp +++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp @@ -38,9 +38,8 @@ using namespace llvm; Win64Exception::Win64Exception(AsmPrinter *A) - : DwarfException(A), - shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false) - {} + : EHStreamer(A), shouldEmitPersonality(false), shouldEmitLSDA(false), + shouldEmitMoves(false) {} Win64Exception::~Win64Exception() {} @@ -108,7 +107,7 @@ void Win64Exception::endFunction(const MachineFunction *) { Asm->OutStreamer.EmitWin64EHHandlerData(); Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext), 4); - EmitExceptionTable(); + emitExceptionTable(); Asm->OutStreamer.PopSection(); } Asm->OutStreamer.EmitWin64EHEndProc(); diff --git a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp index d995333971b0..4c4150bfec90 100644 --- a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp +++ b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp @@ -50,22 +50,9 @@ namespace { char AtomicExpandLoadLinked::ID = 0; char &llvm::AtomicExpandLoadLinkedID = AtomicExpandLoadLinked::ID; - -static void *initializeAtomicExpandLoadLinkedPassOnce(PassRegistry &Registry) { - PassInfo *PI = new PassInfo( - "Expand Atomic calls in terms of load-linked & store-conditional", - "atomic-ll-sc", &AtomicExpandLoadLinked::ID, - PassInfo::NormalCtor_t(callDefaultCtor), false, - false, PassInfo::TargetMachineCtor_t( - callTargetMachineCtor)); - Registry.registerPass(*PI, true); - return PI; -} - -void llvm::initializeAtomicExpandLoadLinkedPass(PassRegistry &Registry) { - CALL_ONCE_INITIALIZATION(initializeAtomicExpandLoadLinkedPassOnce) -} - +INITIALIZE_TM_PASS(AtomicExpandLoadLinked, "atomic-ll-sc", + "Expand Atomic calls in terms of load-linked & store-conditional", + false, false) FunctionPass *llvm::createAtomicExpandLoadLinkedPass(const TargetMachine *TM) { return new AtomicExpandLoadLinked(TM); @@ -256,19 +243,26 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // %loaded = @load.linked(%addr) // %should_store = icmp eq %loaded, %desired // br i1 %should_store, label %cmpxchg.trystore, - // label %cmpxchg.end/%cmpxchg.barrier + // label %cmpxchg.failure // cmpxchg.trystore: // %stored = 
@store_conditional(%new, %addr) - // %try_again = icmp i32 ne %stored, 0 - // br i1 %try_again, label %loop, label %cmpxchg.end - // cmpxchg.barrier: + // %success = icmp eq i32 %stored, 0 + // br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure + // cmpxchg.success: + // fence? + // br label %cmpxchg.end + // cmpxchg.failure: // fence? // br label %cmpxchg.end // cmpxchg.end: + // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure] + // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 + // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 // [...] BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end"); - auto BarrierBB = BasicBlock::Create(Ctx, "cmpxchg.barrier", F, ExitBB); - auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, BarrierBB); + auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); + auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB); + auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB); auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB); // This grabs the DebugLoc from CI @@ -290,25 +284,69 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - BasicBlock *FailureBB = FailureOrder == Monotonic ? ExitBB : BarrierBB; Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); Builder.SetInsertPoint(TryStoreBB); Value *StoreSuccess = TLI->emitStoreConditional( Builder, CI->getNewValOperand(), Addr, MemOpOrder); - Value *TryAgain = Builder.CreateICmpNE( + StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); - Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB); + Builder.CreateCondBr(StoreSuccess, SuccessBB, + CI->isWeak() ? FailureBB : LoopBB); - // Finally, make sure later instructions don't get reordered with a fence if - // necessary. - Builder.SetInsertPoint(BarrierBB); + // Make sure later instructions don't get reordered with a fence if necessary. + Builder.SetInsertPoint(SuccessBB); insertTrailingFence(Builder, SuccessOrder); Builder.CreateBr(ExitBB); - CI->replaceAllUsesWith(Loaded); - CI->eraseFromParent(); + Builder.SetInsertPoint(FailureBB); + insertTrailingFence(Builder, FailureOrder); + Builder.CreateBr(ExitBB); + + // Finally, we have control-flow based knowledge of whether the cmpxchg + // succeeded or not. We expose this to later passes by converting any + // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI. + // Setup the builder so we can create any PHIs we need. + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2); + Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB); + Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB); + + // Look for any users of the cmpxchg that are just comparing the loaded value + // against the desired one, and replace them with the CFG-derived version. 
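Before the pruning loop that the comment above introduces, a reference point for the semantics the expansion has to preserve: the cmpxchg result is a { loaded value, success flag } pair, the strong form only fails on a value mismatch, and the weak form may fail spuriously and is retried. The same contract expressed with std::atomic (a standalone sketch, not the pass itself):

#include <atomic>
#include <cstdio>

int main() {
  std::atomic<int> A{42};

  int Expected = 42;
  // Strong form: on success the stored value becomes 7; on failure Expected
  // is overwritten with the value that was actually loaded, which is the same
  // { loaded, success } information the expanded IR exposes through the PHI
  // and insertvalue instructions.
  bool Success = A.compare_exchange_strong(Expected, 7,
                                           std::memory_order_seq_cst,
                                           std::memory_order_seq_cst);
  std::printf("success=%d loaded=%d\n", Success, Expected);  // success=1 loaded=42

  Expected = 42;  // stale expectation: the value is now 7
  Success = A.compare_exchange_strong(Expected, 9);
  std::printf("success=%d loaded=%d\n", Success, Expected);  // success=0 loaded=7

  // The weak form may fail spuriously (like a failed store-conditional), so it
  // is normally retried in a loop, matching the cmpxchg.trystore back-edge.
  Expected = 7;
  while (!A.compare_exchange_weak(Expected, 11))
    ;  // Expected is reloaded on each failed attempt
  return 0;
}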
+ SmallVector PrunedInsts; + for (auto User : CI->users()) { + ExtractValueInst *EV = dyn_cast(User); + if (!EV) + continue; + + assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 && + "weird extraction from { iN, i1 }"); + + if (EV->getIndices()[0] == 0) + EV->replaceAllUsesWith(Loaded); + else + EV->replaceAllUsesWith(Success); + + PrunedInsts.push_back(EV); + } + + // We can remove the instructions now we're no longer iterating through them. + for (auto EV : PrunedInsts) + EV->eraseFromParent(); + + if (!CI->use_empty()) { + // Some use of the full struct return that we don't understand has happened, + // so we've got to reconstruct it properly. + Value *Res; + Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0); + Res = Builder.CreateInsertValue(Res, Success, 1); + + CI->replaceAllUsesWith(Res); + } + + CI->eraseFromParent(); return true; } diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index f623a48cf0d2..7503e5751994 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1505,10 +1505,17 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (MO.isUse()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) Uses.insert(*AI); - } else if (!MO.isDead()) - // Don't try to hoist code in the rare case the terminator defines a - // register that is later used. - return MBB->end(); + } else { + if (!MO.isDead()) + // Don't try to hoist code in the rare case the terminator defines a + // register that is later used. + return MBB->end(); + + // If the terminator defines a register, make sure we don't hoist + // the instruction whose def might be clobbered by the terminator. + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Defs.insert(*AI); + } } if (Uses.empty()) diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 0b492a965216..57c24e823c1d 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -22,11 +22,13 @@ add_llvm_library(LLVMCodeGen GCMetadata.cpp GCMetadataPrinter.cpp GCStrategy.cpp + GlobalMerge.cpp IfConversion.cpp InlineSpiller.cpp InterferenceCache.cpp IntrinsicLowering.cpp JITCodeEmitter.cpp + JumpInstrTables.cpp LLVMTargetMachine.cpp LatencyPriorityQueue.cpp LexicalScopes.cpp diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index dc5f67b8c0b9..ccac40c66961 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -151,19 +151,8 @@ typedef DenseMap InstrToOrigTy; } char CodeGenPrepare::ID = 0; -static void *initializeCodeGenPreparePassOnce(PassRegistry &Registry) { - initializeTargetLibraryInfoPass(Registry); - PassInfo *PI = new PassInfo( - "Optimize for code generation", "codegenprepare", &CodeGenPrepare::ID, - PassInfo::NormalCtor_t(callDefaultCtor), false, false, - PassInfo::TargetMachineCtor_t(callTargetMachineCtor)); - Registry.registerPass(*PI, true); - return PI; -} - -void llvm::initializeCodeGenPreparePass(PassRegistry &Registry) { - CALL_ONCE_INITIALIZATION(initializeCodeGenPreparePassOnce) -} +INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { return new CodeGenPrepare(TM); @@ -1078,8 +1067,11 @@ void ExtAddrMode::print(raw_ostream &OS) const { NeedPlus = true; } - if (BaseOffs) - OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true; + if (BaseOffs) { + OS << (NeedPlus ? 
" + " : "") + << BaseOffs; + NeedPlus = true; + } if (BaseReg) { OS << (NeedPlus ? " + " : "") @@ -1640,6 +1632,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, static bool MightBeFoldableInst(Instruction *I) { switch (I->getOpcode()) { case Instruction::BitCast: + case Instruction::AddrSpaceCast: // Don't touch identity bitcasts. if (I->getType() == I->getOperand(0)->getType()) return false; @@ -1994,6 +1987,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, return MatchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::BitCast: + case Instruction::AddrSpaceCast: // BitCast is always a noop, and we can handle it as long as it is // int->int or pointer->pointer (we don't want int<->fp or something). if ((AddrInst->getOperand(0)->getType()->isPointerTy() || diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index e3190241cd6e..c4706328ea52 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -776,6 +776,12 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n" << "********** Function: " << MF.getName() << '\n'); + // Only run if conversion if the target wants it. + if (!MF.getTarget() + .getSubtarget() + .enableEarlyIfConversion()) + return false; + TII = MF.getTarget().getInstrInfo(); TRI = MF.getTarget().getRegisterInfo(); SchedModel = diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp similarity index 77% rename from lib/Transforms/Scalar/GlobalMerge.cpp rename to lib/CodeGen/GlobalMerge.cpp index dd9c3784cc2d..d52fcbfa41ed 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -64,6 +64,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -72,7 +73,7 @@ using namespace llvm; #define DEBUG_TYPE "global-merge" static cl::opt -EnableGlobalMerge("global-merge", cl::Hidden, +EnableGlobalMerge("enable-global-merge", cl::Hidden, cl::desc("Enable global merge pass"), cl::init(true)); @@ -81,6 +82,13 @@ EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, cl::desc("Enable global merge pass on constants"), cl::init(false)); +// FIXME: this could be a transitional option, and we probably need to remove +// it if only we are sure this optimization could always benefit all targets. 
+static cl::opt +EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden, + cl::desc("Enable global merge pass on external linkage"), + cl::init(false)); + STATISTIC(NumMerged , "Number of globals merged"); namespace { class GlobalMerge : public FunctionPass { @@ -129,9 +137,8 @@ namespace { } // end anonymous namespace char GlobalMerge::ID = 0; -INITIALIZE_PASS(GlobalMerge, "global-merge", - "Global Merge", false, false) - +INITIALIZE_TM_PASS(GlobalMerge, "global-merge", "Merge global variables", + false, false) bool GlobalMerge::doMerge(SmallVectorImpl &Globals, Module &M, bool isConst, unsigned AddrSpace) const { @@ -154,11 +161,23 @@ bool GlobalMerge::doMerge(SmallVectorImpl &Globals, Type *Int32Ty = Type::getInt32Ty(M.getContext()); + assert(Globals.size() > 1); + + // FIXME: This simple solution merges globals all together as maximum as + // possible. However, with this solution it would be hard to remove dead + // global symbols at link-time. An alternative solution could be checking + // global symbols references function by function, and make the symbols + // being referred in the same function merged and we would probably need + // to introduce heuristic algorithm to solve the merge conflict from + // different functions. for (size_t i = 0, e = Globals.size(); i != e; ) { size_t j = 0; uint64_t MergedSize = 0; std::vector Tys; std::vector Inits; + + bool HasExternal = false; + GlobalVariable *TheFirstExternal = 0; for (j = i; j != e; ++j) { Type *Ty = Globals[j]->getType()->getElementType(); MergedSize += DL->getTypeAllocSize(Ty); @@ -167,17 +186,37 @@ bool GlobalMerge::doMerge(SmallVectorImpl &Globals, } Tys.push_back(Ty); Inits.push_back(Globals[j]->getInitializer()); + + if (Globals[j]->hasExternalLinkage() && !HasExternal) { + HasExternal = true; + TheFirstExternal = Globals[j]; + } } + // If merged variables doesn't have external linkage, we needn't to expose + // the symbol after merging. + GlobalValue::LinkageTypes Linkage = HasExternal + ? GlobalValue::ExternalLinkage + : GlobalValue::InternalLinkage; + + // If merged variables have external linkage, we use symbol name of the + // first variable merged as the suffix of global symbol name. This would + // be able to avoid the link-time naming conflict for globalm symbols. + Twine MergedGVName = HasExternal + ? "_MergedGlobals_" + TheFirstExternal->getName() + : "_MergedGlobals"; + StructType *MergedTy = StructType::get(M.getContext(), Tys); Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); - GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, - GlobalValue::InternalLinkage, - MergedInit, "_MergedGlobals", - nullptr, - GlobalVariable::NotThreadLocal, - AddrSpace); + + GlobalVariable *MergedGV = new GlobalVariable( + M, MergedTy, isConst, Linkage, MergedInit, MergedGVName, nullptr, + GlobalVariable::NotThreadLocal, AddrSpace); + for (size_t k = i; k < j; ++k) { + GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage(); + std::string Name = Globals[k]->getName(); + Constant *Idx[2] = { ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, k-i) @@ -185,6 +224,14 @@ bool GlobalMerge::doMerge(SmallVectorImpl &Globals, Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); Globals[k]->replaceAllUsesWith(GEP); Globals[k]->eraseFromParent(); + + if (Linkage != GlobalValue::InternalLinkage) { + // Generate a new alias... 
+ auto *PTy = cast(GEP->getType()); + GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + Linkage, Name, GEP, &M); + } + NumMerged++; } i = j; @@ -245,8 +292,12 @@ bool GlobalMerge::doInitialization(Module &M) { // Grab all non-const globals. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - // Merge is safe for "normal" internal globals only - if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + // Merge is safe for "normal" internal or external globals only + if (I->isDeclaration() || I->isThreadLocal() || I->hasSection()) + continue; + + if (!(EnableGlobalMergeOnExternal && I->hasExternalLinkage()) && + !I->hasInternalLinkage()) continue; PointerType *PT = dyn_cast(I->getType()); @@ -270,8 +321,7 @@ bool GlobalMerge::doInitialization(Module &M) { continue; if (DL->getTypeAllocSize(Ty) < MaxOffset) { - if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) - .isBSSLocal()) + if (TargetLoweringObjectFile::getKindForGlobal(I, *TM).isBSSLocal()) BSSGlobals[AddressSpace].push_back(I); else if (I->isConstant()) ConstGlobals[AddressSpace].push_back(I); diff --git a/lib/CodeGen/JumpInstrTables.cpp b/lib/CodeGen/JumpInstrTables.cpp new file mode 100644 index 000000000000..61ef722dce52 --- /dev/null +++ b/lib/CodeGen/JumpInstrTables.cpp @@ -0,0 +1,301 @@ +//===-- JumpInstrTables.cpp: Jump-Instruction Tables ----------------------===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief An implementation of jump-instruction tables. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jt" + +#include "llvm/CodeGen/JumpInstrTables.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/JumpInstrTableInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include + +using namespace llvm; + +char JumpInstrTables::ID = 0; + +INITIALIZE_PASS_BEGIN(JumpInstrTables, "jump-instr-tables", + "Jump-Instruction Tables", true, true) +INITIALIZE_PASS_DEPENDENCY(JumpInstrTableInfo); +INITIALIZE_PASS_END(JumpInstrTables, "jump-instr-tables", + "Jump-Instruction Tables", true, true) + +STATISTIC(NumJumpTables, "Number of indirect call tables generated"); +STATISTIC(NumFuncsInJumpTables, "Number of functions in the jump tables"); + +ModulePass *llvm::createJumpInstrTablesPass() { + // The default implementation uses a single table for all functions. + return new JumpInstrTables(JumpTable::Single); +} + +ModulePass *llvm::createJumpInstrTablesPass(JumpTable::JumpTableType JTT) { + return new JumpInstrTables(JTT); +} + +namespace { +static const char jump_func_prefix[] = "__llvm_jump_instr_table_"; +static const char jump_section_prefix[] = ".jump.instr.table.text."; + +// Checks to see if a given CallSite is making an indirect call, including +// cases where the indirect call is made through a bitcast. 
+bool isIndirectCall(CallSite &CS) { + if (CS.getCalledFunction()) + return false; + + // Check the value to see if it is merely a bitcast of a function. In + // this case, it will translate to a direct function call in the resulting + // assembly, so we won't treat it as an indirect call here. + const Value *V = CS.getCalledValue(); + if (const ConstantExpr *CE = dyn_cast(V)) { + return !(CE->isCast() && isa(CE->getOperand(0))); + } + + // Otherwise, since we know it's a call, it must be an indirect call + return true; +} + +// Replaces Functions and GlobalAliases with a different Value. +bool replaceGlobalValueIndirectUse(GlobalValue *GV, Value *V, Use *U) { + User *Us = U->getUser(); + if (!Us) + return false; + if (Instruction *I = dyn_cast(Us)) { + CallSite CS(I); + + // Don't do the replacement if this use is a direct call to this function. + // If the use is not the called value, then replace it. + if (CS && (isIndirectCall(CS) || CS.isCallee(U))) { + return false; + } + + U->set(V); + } else if (Constant *C = dyn_cast(Us)) { + // Don't replace calls to bitcasts of function symbols, since they get + // translated to direct calls. + if (ConstantExpr *CE = dyn_cast(Us)) { + if (CE->getOpcode() == Instruction::BitCast) { + // This bitcast must have exactly one user. + if (CE->user_begin() != CE->user_end()) { + User *ParentUs = *CE->user_begin(); + if (CallInst *CI = dyn_cast(ParentUs)) { + CallSite CS(CI); + Use &CEU = *CE->use_begin(); + if (CS.isCallee(&CEU)) { + return false; + } + } + } + } + } + + // GlobalAlias doesn't support replaceUsesOfWithOnConstant. And the verifier + // requires alias to point to a defined function. So, GlobalAlias is handled + // as a separate case in runOnModule. + if (!isa(C)) + C->replaceUsesOfWithOnConstant(GV, V, U); + } else { + assert(false && "The Use of a Function symbol is neither an instruction nor" + " a constant"); + } + + return true; +} + +// Replaces all replaceable address-taken uses of GV with a pointer to a +// jump-instruction table entry. +void replaceValueWithFunction(GlobalValue *GV, Function *F) { + // Go through all uses of this function and replace the uses of GV with the + // jump-table version of the function. Get the uses as a vector before + // replacing them, since replacing them changes the use list and invalidates + // the iterator otherwise. + for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E;) { + Use &U = *I++; + + // Replacement of constants replaces all instances in the constant. So, some + // uses might have already been handled by the time we reach them here. 
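[Editorial sketch, not part of the patch.] The classification above distinguishes three cases; a toy model with invented types (no CallSite, no Value hierarchy) makes the rule explicit: only a genuinely computed callee needs the jump-table rewrite.

// Illustrative only: a toy model of the call classification used above.
#include <cassert>

enum class ValueKind { Function, BitcastOfFunction, OtherValue };

struct Callee {
  ValueKind Kind;
  bool isKnownFunction() const { return Kind == ValueKind::Function; }
};

// Direct calls and calls through a plain bitcast of a function both lower to
// direct calls in the final assembly, so only the remaining case is indirect.
bool isIndirectCallToy(const Callee &C) {
  if (C.isKnownFunction())
    return false;                              // direct call
  if (C.Kind == ValueKind::BitcastOfFunction)
    return false;                              // becomes a direct call anyway
  return true;                                 // loaded/computed pointer: indirect
}

int main() {
  assert(!isIndirectCallToy({ValueKind::Function}));
  assert(!isIndirectCallToy({ValueKind::BitcastOfFunction}));
  assert(isIndirectCallToy({ValueKind::OtherValue}));
}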
+ if (U.get() == GV) + replaceGlobalValueIndirectUse(GV, F, &U); + } + + return; +} +} // end anonymous namespace + +JumpInstrTables::JumpInstrTables() + : ModulePass(ID), Metadata(), JITI(nullptr), TableCount(0), + JTType(JumpTable::Single) { + initializeJumpInstrTablesPass(*PassRegistry::getPassRegistry()); +} + +JumpInstrTables::JumpInstrTables(JumpTable::JumpTableType JTT) + : ModulePass(ID), Metadata(), JITI(nullptr), TableCount(0), JTType(JTT) { + initializeJumpInstrTablesPass(*PassRegistry::getPassRegistry()); +} + +JumpInstrTables::~JumpInstrTables() {} + +void JumpInstrTables::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + +Function *JumpInstrTables::insertEntry(Module &M, Function *Target) { + FunctionType *OrigFunTy = Target->getFunctionType(); + FunctionType *FunTy = transformType(OrigFunTy); + + JumpMap::iterator it = Metadata.find(FunTy); + if (Metadata.end() == it) { + struct TableMeta Meta; + Meta.TableNum = TableCount; + Meta.Count = 0; + Metadata[FunTy] = Meta; + it = Metadata.find(FunTy); + ++NumJumpTables; + ++TableCount; + } + + it->second.Count++; + + std::string NewName(jump_func_prefix); + NewName += (Twine(it->second.TableNum) + "_" + Twine(it->second.Count)).str(); + Function *JumpFun = + Function::Create(OrigFunTy, GlobalValue::ExternalLinkage, NewName, &M); + // The section for this table + JumpFun->setSection((jump_section_prefix + Twine(it->second.TableNum)).str()); + JITI->insertEntry(FunTy, Target, JumpFun); + + ++NumFuncsInJumpTables; + return JumpFun; +} + +bool JumpInstrTables::hasTable(FunctionType *FunTy) { + FunctionType *TransTy = transformType(FunTy); + return Metadata.end() != Metadata.find(TransTy); +} + +FunctionType *JumpInstrTables::transformType(FunctionType *FunTy) { + // Returning nullptr forces all types into the same table, since all types map + // to the same type + Type *VoidPtrTy = Type::getInt8PtrTy(FunTy->getContext()); + + // Ignore the return type. + Type *RetTy = VoidPtrTy; + bool IsVarArg = FunTy->isVarArg(); + std::vector ParamTys(FunTy->getNumParams()); + FunctionType::param_iterator PI, PE; + int i = 0; + + std::vector EmptyParams; + Type *Int32Ty = Type::getInt32Ty(FunTy->getContext()); + FunctionType *VoidFnTy = FunctionType::get( + Type::getVoidTy(FunTy->getContext()), EmptyParams, false); + switch (JTType) { + case JumpTable::Single: + + return FunctionType::get(RetTy, EmptyParams, false); + case JumpTable::Arity: + // Transform all types to void* so that all functions with the same arity + // end up in the same table. + for (PI = FunTy->param_begin(), PE = FunTy->param_end(); PI != PE; + PI++, i++) { + ParamTys[i] = VoidPtrTy; + } + + return FunctionType::get(RetTy, ParamTys, IsVarArg); + case JumpTable::Simplified: + // Project all parameters types to one of 3 types: composite, integer, and + // function, matching the three subclasses of Type. + for (PI = FunTy->param_begin(), PE = FunTy->param_end(); PI != PE; + ++PI, ++i) { + assert((isa(*PI) || isa(*PI) || + isa(*PI)) && + "This type is not an Integer or a Composite or a Function"); + if (isa(*PI)) { + ParamTys[i] = VoidPtrTy; + } else if (isa(*PI)) { + ParamTys[i] = VoidFnTy; + } else if (isa(*PI)) { + ParamTys[i] = Int32Ty; + } + } + + return FunctionType::get(RetTy, ParamTys, IsVarArg); + case JumpTable::Full: + // Don't transform this type at all. + return FunTy; + } + + return nullptr; +} + +bool JumpInstrTables::runOnModule(Module &M) { + // Make sure the module is well-formed, especially with respect to jumptable. 
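[Editorial sketch, not part of the patch.] transformType above buckets function signatures so that functions sharing a bucket share a jump table. A stand-alone sketch of the Single and Arity strategies, with strings standing in for FunctionTypes:

// Illustrative only: toy signature bucketing in the spirit of transformType.
#include <cassert>
#include <string>
#include <vector>

enum class TableKind { Single, Arity };

// Single: every function shares one table. Arity: functions with the same
// number of parameters share a table (parameter types erased to void*).
std::string tableKey(TableKind K, const std::vector<std::string> &Params) {
  if (K == TableKind::Single)
    return "void* ()";
  std::string Key = "void* (";
  for (size_t I = 0; I != Params.size(); ++I)
    Key += (I ? ", void*" : "void*");
  return Key + ")";
}

int main() {
  // int(int, float) and char*(void*, double) have the same arity, so they
  // land in the same table under the Arity strategy.
  assert(tableKey(TableKind::Arity, {"int", "float"}) ==
         tableKey(TableKind::Arity, {"void*", "double"}));
  // Under Single, everything collapses into one table regardless of signature.
  assert(tableKey(TableKind::Single, {"int"}) == tableKey(TableKind::Single, {}));
}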
+ if (verifyModule(M)) + return false; + + JITI = &getAnalysis(); + + // Get the set of jumptable-annotated functions. + DenseMap Functions; + for (Function &F : M) { + if (F.hasFnAttribute(Attribute::JumpTable)) { + assert(F.hasUnnamedAddr() && + "Attribute 'jumptable' requires 'unnamed_addr'"); + Functions[&F] = nullptr; + } + } + + // Create the jump-table functions. + for (auto &KV : Functions) { + Function *F = KV.first; + KV.second = insertEntry(M, F); + } + + // GlobalAlias is a special case, because the target of an alias statement + // must be a defined function. So, instead of replacing a given function in + // the alias, we replace all uses of aliases that target jumptable functions. + // Note that there's no need to create these functions, since only aliases + // that target known jumptable functions are replaced, and there's no way to + // put the jumptable annotation on a global alias. + DenseMap Aliases; + for (GlobalAlias &GA : M.aliases()) { + Constant *Aliasee = GA.getAliasee(); + if (Function *F = dyn_cast(Aliasee)) { + auto it = Functions.find(F); + if (it != Functions.end()) { + Aliases[&GA] = it->second; + } + } + } + + // Replace each address taken function with its jump-instruction table entry. + for (auto &KV : Functions) + replaceValueWithFunction(KV.first, KV.second); + + for (auto &KV : Aliases) + replaceValueWithFunction(KV.first, KV.second); + + return !Functions.empty(); +} diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index c8211b76be4e..29062434f00e 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -12,11 +12,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Target/TargetMachine.h" + +#include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/JumpInstrTables.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" @@ -43,19 +47,6 @@ static cl::opt EnableFastISelOption("fast-isel", cl::Hidden, cl::desc("Enable the \"fast\" instruction selector")); -static cl::opt -AsmVerbose("asm-verbose", cl::desc("Add comments to directives."), - cl::init(cl::BOU_UNSET)); - -static bool getVerboseAsm() { - switch (AsmVerbose) { - case cl::BOU_UNSET: return TargetMachine::getAsmVerbosityDefault(); - case cl::BOU_TRUE: return true; - case cl::BOU_FALSE: return false; - } - llvm_unreachable("Invalid verbose asm state"); -} - void LLVMTargetMachine::initAsmInfo() { MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(*getRegisterInfo(), TargetTriple); @@ -95,6 +86,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM, bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter) { + // Add internal analysis passes from the target machine. TM->addAnalysisPasses(PM); @@ -149,6 +141,11 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter) { + // Passes to handle jumptable function annotations. These can't be handled at + // JIT time, so we don't add them directly to addPassesToGenerateCode. + PM.add(createJumpInstrTableInfoPass()); + PM.add(createJumpInstrTablesPass(Options.JTType)); + // Add common CodeGen passes. 
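[Editorial sketch, not part of the patch.] runOnModule above first collects the jumptable-annotated functions, then creates table entries, and only then rewrites uses (aliases handled separately), so nothing is mutated while it is being iterated. A minimal model of that collect-then-rewrite shape, with strings standing in for functions:

// Illustrative only: the two-phase shape of runOnModule above.
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Module = {"f", "g", "h"};

  // Phase 1: decide which functions get a table entry without touching the
  // module, so iteration stays valid.
  std::map<std::string, std::string> Entry;
  for (const std::string &F : Module)
    if (F != "h")                                   // pretend only f and g are annotated
      Entry[F] = "__llvm_jump_instr_table_0_" + F;  // hypothetical entry name

  // Phase 2: perform all rewrites using the completed map.
  for (const auto &KV : Entry)
    std::cout << "replace uses of " << KV.first << " with " << KV.second << '\n';
}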
MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, StartAfter, StopAfter); @@ -188,8 +185,9 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(), TargetCPU); MCStreamer *S = getTarget().createAsmStreamer( - *Context, Out, getVerboseAsm(), Options.MCOptions.MCUseDwarfDirectory, - InstPrinter, MCE, MAB, Options.MCOptions.ShowMCInst); + *Context, Out, Options.MCOptions.AsmVerbose, + Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB, + Options.MCOptions.ShowMCInst); AsmStreamer.reset(S); break; } diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index d965968fb498..d12c234bf3b2 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -210,21 +210,21 @@ LexicalScope *LexicalScopes::getOrCreateAbstractScope(const MDNode *N) { DIDescriptor Scope(N); if (Scope.isLexicalBlockFile()) Scope = DILexicalBlockFile(Scope).getScope(); - auto I = AbstractScopeMap.find(N); + auto I = AbstractScopeMap.find(Scope); if (I != AbstractScopeMap.end()) return &I->second; LexicalScope *Parent = nullptr; if (Scope.isLexicalBlock()) { - DILexicalBlock DB(N); + DILexicalBlock DB(Scope); DIDescriptor ParentDesc = DB.getContext(); Parent = getOrCreateAbstractScope(ParentDesc); } I = AbstractScopeMap.emplace(std::piecewise_construct, - std::forward_as_tuple(N), - std::forward_as_tuple(Parent, DIDescriptor(N), + std::forward_as_tuple(Scope), + std::forward_as_tuple(Parent, Scope, nullptr, true)).first; - if (DIDescriptor(N).isSubprogram()) + if (Scope.isSubprogram()) AbstractScopesList.push_back(&I->second); return &I->second; } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 3563f8eab023..15595609e904 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -186,6 +186,7 @@ void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); LRCalc->createDeadDefs(LI); LRCalc->extendToUses(LI); + computeDeadValues(&LI, LI, nullptr, nullptr); } void LiveIntervals::computeVirtRegs() { @@ -412,21 +413,34 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, // Handle dead values. bool CanSeparate = false; + computeDeadValues(li, NewLR, &CanSeparate, dead); + + // Move the trimmed segments back. + li->segments.swap(NewLR.segments); + DEBUG(dbgs() << "Shrunk: " << *li << '\n'); + return CanSeparate; +} + +void LiveIntervals::computeDeadValues(LiveInterval *li, + LiveRange &LR, + bool *CanSeparate, + SmallVectorImpl *dead) { for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end(); I != E; ++I) { VNInfo *VNI = *I; if (VNI->isUnused()) continue; - LiveRange::iterator LRI = NewLR.FindSegmentContaining(VNI->def); - assert(LRI != NewLR.end() && "Missing segment for PHI"); + LiveRange::iterator LRI = LR.FindSegmentContaining(VNI->def); + assert(LRI != LR.end() && "Missing segment for PHI"); if (LRI->end != VNI->def.getDeadSlot()) continue; if (VNI->isPHIDef()) { // This is a dead PHI. Remove it. VNI->markUnused(); - NewLR.removeSegment(LRI->start, LRI->end); + LR.removeSegment(LRI->start, LRI->end); DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n"); - CanSeparate = true; + if (CanSeparate) + *CanSeparate = true; } else { // This is a dead def. Make sure the instruction knows. 
MachineInstr *MI = getInstructionFromIndex(VNI->def); @@ -438,11 +452,6 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, } } } - - // Move the trimmed segments back. - li->segments.swap(NewLR.segments); - DEBUG(dbgs() << "Shrunk: " << *li << '\n'); - return CanSeparate; } void LiveIntervals::extendToIndices(LiveRange &LR, diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 23847d6a75f5..0baf2a6c1c21 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -333,6 +333,12 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { if (skipOptnoneFunction(*mf.getFunction())) return false; + const TargetSubtargetInfo &ST = + mf.getTarget().getSubtarget(); + if (!ST.enablePostMachineScheduler()) { + DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); + return false; + } DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs())); // Initialize the context of the pass. @@ -529,6 +535,11 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { llvm_unreachable(nullptr); } #endif + // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However, + // CurrCycle may have advanced since then. + if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency()) + SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency(); + --SuccSU->NumPredsLeft; if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) SchedImpl->releaseTopNode(SuccSU); @@ -563,6 +574,11 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { llvm_unreachable(nullptr); } #endif + // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However, + // CurrCycle may have advanced since then. + if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency()) + PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency(); + --PredSU->NumSuccsLeft; if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) SchedImpl->releaseBottomNode(PredSU); @@ -674,10 +690,13 @@ void ScheduleDAGMI::schedule() { CurrentBottom = MI; } } - updateQueues(SU, IsTopNode); - - // Notify the scheduling strategy after updating the DAG. + // Notify the scheduling strategy before updating the DAG. + // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues + // runs, it can then use the accurate ReadyCycle time to determine whether + // newly released nodes can move to the readyQ. SchedImpl->schedNode(SU, IsTopNode); + + updateQueues(SU, IsTopNode); } assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); @@ -1568,7 +1587,7 @@ void SchedBoundary::reset() { // Track the maximum number of stall cycles that could arise either from the // latency of a DAG edge or the number of cycles that a processor resource is // reserved (SchedBoundary::ReservedCycles). - MaxObservedLatency = 0; + MaxObservedStall = 0; #endif // Reserve a zero-count for invalid CritResIdx. ExecutedResCounts.resize(1); @@ -1725,6 +1744,16 @@ getOtherResourceCount(unsigned &OtherCritIdx) { } void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { + assert(SU->getInstr() && "Scheduled SUnit must have instr"); + +#ifndef NDEBUG + // ReadyCycle was been bumped up to the CurrCycle when this node was + // scheduled, but CurrCycle may have been eagerly advanced immediately after + // scheduling, so may now be greater than ReadyCycle. 
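[Editorial sketch, not part of the patch.] The releaseSucc/releasePred change above moves the ready-cycle update to the moment a predecessor is scheduled: a successor can issue no earlier than the predecessor's cycle plus the edge latency, maximized over all predecessors. A tiny stand-alone model with invented latencies:

// Illustrative only: ready-cycle bookkeeping on a toy dependence edge.
#include <algorithm>
#include <cassert>

struct Node {
  unsigned ReadyCycle = 0;   // earliest cycle this node may issue
  unsigned PredsLeft = 0;    // unscheduled predecessors
};

// When a predecessor becomes ready at PredReady, each successor is pushed out
// to at least PredReady + edge latency; take the max over all predecessors.
void releaseSuccToy(Node &Succ, unsigned PredReady, unsigned Latency) {
  Succ.ReadyCycle = std::max(Succ.ReadyCycle, PredReady + Latency);
  --Succ.PredsLeft;
}

int main() {
  Node N;
  N.PredsLeft = 2;
  releaseSuccToy(N, /*PredReady=*/3, /*Latency=*/2);  // one producer finishes at 5
  releaseSuccToy(N, /*PredReady=*/1, /*Latency=*/7);  // the slower producer wins: 8
  assert(N.ReadyCycle == 8 && N.PredsLeft == 0);
}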
+ if (ReadyCycle > CurrCycle) + MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall); +#endif + if (ReadyCycle < MinReadyCycle) MinReadyCycle = ReadyCycle; @@ -1744,18 +1773,6 @@ void SchedBoundary::releaseTopNode(SUnit *SU) { if (SU->isScheduled) return; - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isWeak()) - continue; - unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; - unsigned Latency = I->getLatency(); -#ifndef NDEBUG - MaxObservedLatency = std::max(Latency, MaxObservedLatency); -#endif - if (SU->TopReadyCycle < PredReadyCycle + Latency) - SU->TopReadyCycle = PredReadyCycle + Latency; - } releaseNode(SU, SU->TopReadyCycle); } @@ -1763,20 +1780,6 @@ void SchedBoundary::releaseBottomNode(SUnit *SU) { if (SU->isScheduled) return; - assert(SU->getInstr() && "Scheduled SUnit must have instr"); - - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isWeak()) - continue; - unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; - unsigned Latency = I->getLatency(); -#ifndef NDEBUG - MaxObservedLatency = std::max(Latency, MaxObservedLatency); -#endif - if (SU->BotReadyCycle < SuccReadyCycle + Latency) - SU->BotReadyCycle = SuccReadyCycle + Latency; - } releaseNode(SU, SU->BotReadyCycle); } @@ -1945,7 +1948,7 @@ void SchedBoundary::bumpNode(SUnit *SU) { if (SchedModel->getProcResource(PIdx)->BufferSize == 0) { ReservedCycles[PIdx] = isTop() ? NextCycle + PI->Cycles : NextCycle; #ifndef NDEBUG - MaxObservedLatency = std::max(PI->Cycles, MaxObservedLatency); + MaxObservedStall = std::max(PI->Cycles, MaxObservedStall); #endif } } @@ -2049,7 +2052,7 @@ SUnit *SchedBoundary::pickOnlyChoice() { } } for (unsigned i = 0; Available.empty(); ++i) { - assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) && + assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) && "permanent hazard"); (void)i; bumpCycle(CurrCycle + 1); releasePending(); @@ -2090,111 +2093,6 @@ void SchedBoundary::dumpScheduledState() { // GenericScheduler - Generic implementation of MachineSchedStrategy. //===----------------------------------------------------------------------===// -namespace { -/// Base class for GenericScheduler. This class maintains information about -/// scheduling candidates based on TargetSchedModel making it easy to implement -/// heuristics for either preRA or postRA scheduling. -class GenericSchedulerBase : public MachineSchedStrategy { -public: - /// Represent the type of SchedCandidate found within a single queue. - /// pickNodeBidirectional depends on these listed by decreasing priority. - enum CandReason { - NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, - ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, - TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; - -#ifndef NDEBUG - static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); -#endif - - /// Policy for scheduling the next instruction in the candidate's zone. - struct CandPolicy { - bool ReduceLatency; - unsigned ReduceResIdx; - unsigned DemandResIdx; - - CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {} - }; - - /// Status of an instruction's critical resource consumption. - struct SchedResourceDelta { - // Count critical resources in the scheduled region required by SU. - unsigned CritResources; - - // Count critical resources from another region consumed by SU. 
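[Editorial sketch, not part of the patch.] The renamed MaxObservedStall records the largest gap ever seen between a node's ready cycle and the current cycle; the "permanent hazard" assertion then knows that bumping the cycle at most that many times (plus hazard lookahead) must surface a schedulable node. A numeric walk-through with invented values:

// Illustrative only: why the observed worst-case stall bounds the bump loop.
#include <algorithm>
#include <cassert>

int main() {
  unsigned CurrCycle = 10;
  unsigned MaxObservedStall = 0;

  // A node released at CurrCycle but only ready at cycle 14 records a stall of 4.
  unsigned ReadyCycle = 14;
  if (ReadyCycle > CurrCycle)
    MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);

  // If the available queue is empty, at most MaxObservedStall bumps are needed
  // before that node becomes ready; more than that means a real wedge.
  unsigned Bumps = 0;
  while (CurrCycle < ReadyCycle) {
    ++CurrCycle;                       // bumpCycle(CurrCycle + 1) in the real code
    ++Bumps;
    assert(Bumps <= MaxObservedStall && "permanent hazard");
  }
  assert(Bumps == 4);
}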
- unsigned DemandedResources; - - SchedResourceDelta(): CritResources(0), DemandedResources(0) {} - - bool operator==(const SchedResourceDelta &RHS) const { - return CritResources == RHS.CritResources - && DemandedResources == RHS.DemandedResources; - } - bool operator!=(const SchedResourceDelta &RHS) const { - return !operator==(RHS); - } - }; - - /// Store the state used by GenericScheduler heuristics, required for the - /// lifetime of one invocation of pickNode(). - struct SchedCandidate { - CandPolicy Policy; - - // The best SUnit candidate. - SUnit *SU; - - // The reason for this candidate. - CandReason Reason; - - // Set of reasons that apply to multiple candidates. - uint32_t RepeatReasonSet; - - // Register pressure values for the best candidate. - RegPressureDelta RPDelta; - - // Critical resource consumption of the best candidate. - SchedResourceDelta ResDelta; - - SchedCandidate(const CandPolicy &policy) - : Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {} - - bool isValid() const { return SU; } - - // Copy the status of another candidate without changing policy. - void setBest(SchedCandidate &Best) { - assert(Best.Reason != NoCand && "uninitialized Sched candidate"); - SU = Best.SU; - Reason = Best.Reason; - RPDelta = Best.RPDelta; - ResDelta = Best.ResDelta; - } - - bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); } - void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); } - - void initResourceDelta(const ScheduleDAGMI *DAG, - const TargetSchedModel *SchedModel); - }; - -protected: - const MachineSchedContext *Context; - const TargetSchedModel *SchedModel; - const TargetRegisterInfo *TRI; - - SchedRemainder Rem; -protected: - GenericSchedulerBase(const MachineSchedContext *C): - Context(C), SchedModel(nullptr), TRI(nullptr) {} - - void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, - SchedBoundary *OtherZone); - -#ifndef NDEBUG - void traceCandidate(const SchedCandidate &Cand); -#endif -}; -} // namespace - void GenericSchedulerBase::SchedCandidate:: initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { @@ -2430,65 +2328,6 @@ static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand, << GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n'); } -namespace { -/// GenericScheduler shrinks the unscheduled zone using heuristics to balance -/// the schedule. -class GenericScheduler : public GenericSchedulerBase { - ScheduleDAGMILive *DAG; - - // State of the top and bottom scheduled instruction boundaries. 
- SchedBoundary Top; - SchedBoundary Bot; - - MachineSchedPolicy RegionPolicy; -public: - GenericScheduler(const MachineSchedContext *C): - GenericSchedulerBase(C), DAG(nullptr), Top(SchedBoundary::TopQID, "TopQ"), - Bot(SchedBoundary::BotQID, "BotQ") {} - - void initPolicy(MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - unsigned NumRegionInstrs) override; - - bool shouldTrackPressure() const override { - return RegionPolicy.ShouldTrackPressure; - } - - void initialize(ScheduleDAGMI *dag) override; - - SUnit *pickNode(bool &IsTopNode) override; - - void schedNode(SUnit *SU, bool IsTopNode) override; - - void releaseTopNode(SUnit *SU) override { - Top.releaseTopNode(SU); - } - - void releaseBottomNode(SUnit *SU) override { - Bot.releaseBottomNode(SU); - } - - void registerRoots() override; - -protected: - void checkAcyclicLatency(); - - void tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary &Zone, - const RegPressureTracker &RPTracker, - RegPressureTracker &TempTracker); - - SUnit *pickNodeBidirectional(bool &IsTopNode); - - void pickNodeFromQueue(SchedBoundary &Zone, - const RegPressureTracker &RPTracker, - SchedCandidate &Candidate); - - void reschedulePhysRegCopies(SUnit *SU, bool isTop); -}; -} // namespace - void GenericScheduler::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "(PreRA)GenericScheduler needs vreg liveness"); @@ -3023,75 +2862,25 @@ GenericSchedRegistry("converge", "Standard converging scheduler.", // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy. //===----------------------------------------------------------------------===// -namespace { -/// PostGenericScheduler - Interface to the scheduling algorithm used by -/// ScheduleDAGMI. -/// -/// Callbacks from ScheduleDAGMI: -/// initPolicy -> initialize(DAG) -> registerRoots -> pickNode ... -class PostGenericScheduler : public GenericSchedulerBase { - ScheduleDAGMI *DAG; - SchedBoundary Top; - SmallVector BotRoots; -public: - PostGenericScheduler(const MachineSchedContext *C): - GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {} - - virtual ~PostGenericScheduler() {} - - void initPolicy(MachineBasicBlock::iterator Begin, - MachineBasicBlock::iterator End, - unsigned NumRegionInstrs) override { - /* no configurable policy */ - }; - - /// PostRA scheduling does not track pressure. - bool shouldTrackPressure() const override { return false; } - - void initialize(ScheduleDAGMI *Dag) override { - DAG = Dag; - SchedModel = DAG->getSchedModel(); - TRI = DAG->TRI; - - Rem.init(DAG, SchedModel); - Top.init(DAG, SchedModel, &Rem); - BotRoots.clear(); - - // Initialize the HazardRecognizers. If itineraries don't exist, are empty, - // or are disabled, then these HazardRecs will be disabled. 
- const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); - const TargetMachine &TM = DAG->MF.getTarget(); - if (!Top.HazardRec) { - Top.HazardRec = - TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); - } - } - - void registerRoots() override; - - SUnit *pickNode(bool &IsTopNode) override; - - void scheduleTree(unsigned SubtreeID) override { - llvm_unreachable("PostRA scheduler does not support subtree analysis."); - } - - void schedNode(SUnit *SU, bool IsTopNode) override; +void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { + DAG = Dag; + SchedModel = DAG->getSchedModel(); + TRI = DAG->TRI; - void releaseTopNode(SUnit *SU) override { - Top.releaseTopNode(SU); - } + Rem.init(DAG, SchedModel); + Top.init(DAG, SchedModel, &Rem); + BotRoots.clear(); - // Only called for roots. - void releaseBottomNode(SUnit *SU) override { - BotRoots.push_back(SU); + // Initialize the HazardRecognizers. If itineraries don't exist, are empty, + // or are disabled, then these HazardRecs will be disabled. + const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); + const TargetMachine &TM = DAG->MF.getTarget(); + if (!Top.HazardRec) { + Top.HazardRec = + TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); } +} -protected: - void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand); - - void pickNodeFromQueue(SchedCandidate &Cand); -}; -} // namespace void PostGenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 96cf719184be..8515b0f456d8 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -33,7 +33,6 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/BasicBlock.h" @@ -578,8 +577,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB exits via unconditional fall-through but its successor " "differs from its CFG successor!", MBB); } - if (!MBB->empty() && getBundleStart(&MBB->back())->isBarrier() && - !TII->isPredicated(getBundleStart(&MBB->back()))) { + if (!MBB->empty() && MBB->back().isBarrier() && + !TII->isPredicated(&MBB->back())) { report("MBB exits via unconditional fall-through but ends with a " "barrier instruction!", MBB); } @@ -599,10 +598,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via unconditional branch but doesn't contain " "any instructions!", MBB); - } else if (!getBundleStart(&MBB->back())->isBarrier()) { + } else if (!MBB->back().isBarrier()) { report("MBB exits via unconditional branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!getBundleStart(&MBB->back())->isTerminator()) { + } else if (!MBB->back().isTerminator()) { report("MBB exits via unconditional branch but the branch isn't a " "terminator instruction!", MBB); } @@ -630,10 +629,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/fall-through but doesn't " "contain any instructions!", MBB); - } else if (getBundleStart(&MBB->back())->isBarrier()) { + } else if (MBB->back().isBarrier()) { report("MBB exits via conditional branch/fall-through but ends with a " "barrier 
instruction!", MBB); - } else if (!getBundleStart(&MBB->back())->isTerminator()) { + } else if (!MBB->back().isTerminator()) { report("MBB exits via conditional branch/fall-through but the branch " "isn't a terminator instruction!", MBB); } @@ -658,10 +657,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/branch but doesn't " "contain any instructions!", MBB); - } else if (!getBundleStart(&MBB->back())->isBarrier()) { + } else if (!MBB->back().isBarrier()) { report("MBB exits via conditional branch/branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!getBundleStart(&MBB->back())->isTerminator()) { + } else if (!MBB->back().isTerminator()) { report("MBB exits via conditional branch/branch but the branch " "isn't a terminator instruction!", MBB); } diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index b3f719841dba..9568e238a2ca 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -92,9 +92,9 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, // Temporary option to allow experimenting with MachineScheduler as a post-RA // scheduler. Targets can "properly" enable this with -// substitutePass(&PostRASchedulerID, &MachineSchedulerID); Ideally it wouldn't -// be part of the standard pass pipeline, and the target would just add a PostRA -// scheduling pass wherever it wants. +// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); Ideally it +// wouldn't be part of the standard pass pipeline, and the target would just add +// a PostRA scheduling pass wherever it wants. static cl::opt MISchedPostRA("misched-postra", cl::Hidden, cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)")); diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index c74a42fd5d54..1ba2c7418f21 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -160,7 +160,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { replaceFrameIndices(Fn); // If register scavenging is needed, as we've enabled doing it as a - // post-pass, scavenge the virtual registers that frame index elimiation + // post-pass, scavenge the virtual registers that frame index elimination // inserted. if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) scavengeFrameVirtualRegs(Fn); diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2d2fd53447ee..0f5018484d93 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -167,9 +167,18 @@ namespace { bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); - SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); + /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed + /// load. + /// + /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. + /// \param InVecVT type of the input vector to EVE with bitcasts resolved. + /// \param EltNo index of the vector element to load. + /// \param OriginalLoad load that EVE came from to be replaced. + /// \returns EVE on success SDValue() on failure. 
+ SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( + SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); @@ -762,14 +771,10 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // If the operands of this node are only used by the node, they will now // be dead. Make sure to visit them first to delete dead nodes early. - for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) { - SDNode *Op = TLO.Old.getNode()->getOperand(i).getNode(); - // For an operand generating multiple values, one of the values may - // become dead allowing further simplification (e.g. split index - // arithmetic from an indexed load). - if (Op->hasOneUse() || Op->getNumValues() > 1) - AddToWorkList(Op); - } + for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) + if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse()) + AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode()); + DAG.DeleteNode(TLO.Old.getNode()); } } @@ -1320,9 +1325,16 @@ SDValue DAGCombiner::combine(SDNode *N) { // Constant operands are canonicalized to RHS. if (isa(N0) || !isa(N1)) { - SDValue Ops[] = { N1, N0 }; - SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), - Ops); + SDValue Ops[] = {N1, N0}; + SDNode *CSENode; + if (const BinaryWithFlagsSDNode *BinNode = + dyn_cast(N)) { + CSENode = DAG.getNodeIfExists( + N->getOpcode(), N->getVTList(), Ops, BinNode->hasNoUnsignedWrap(), + BinNode->hasNoSignedWrap(), BinNode->isExact()); + } else { + CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops); + } if (CSENode) return SDValue(CSENode, 0); } @@ -4555,12 +4567,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // fold selects based on a setcc into other things, such as min/max/abs if (N0.getOpcode() == ISD::SETCC) { - // FIXME: - // Check against MVT::Other for SELECT_CC, which is a workaround for targets - // having to say they don't support SELECT_CC on every type the DAG knows - // about, since there is no way to mark an opcode illegal at all value types - if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other) && - TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) + if ((!LegalOperations && + TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || + TLI.isOperationLegal(ISD::SELECT_CC, VT)) return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), N1, N2, N0.getOperand(2)); @@ -4587,6 +4596,56 @@ std::pair SplitVSETCC(const SDNode *N, SelectionDAG &DAG) { return std::make_pair(Lo, Hi); } +// This function assumes all the vselect's arguments are CONCAT_VECTOR +// nodes and that the condition is a BV of ConstantSDNodes (or undefs). +static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + MVT VT = N->getSimpleValueType(0); + int NumElems = VT.getVectorNumElements(); + assert(LHS.getOpcode() == ISD::CONCAT_VECTORS && + RHS.getOpcode() == ISD::CONCAT_VECTORS && + Cond.getOpcode() == ISD::BUILD_VECTOR); + + // We're sure we have an even number of elements due to the + // concat_vectors we have as arguments to vselect. 
+ // Skip BV elements until we find one that's not an UNDEF + // After we find an UNDEF element, keep looping until we get to half the + // length of the BV and see if all the non-undef nodes are the same. + ConstantSDNode *BottomHalf = nullptr; + for (int i = 0; i < NumElems / 2; ++i) { + if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF) + continue; + + if (BottomHalf == nullptr) + BottomHalf = cast(Cond.getOperand(i)); + else if (Cond->getOperand(i).getNode() != BottomHalf) + return SDValue(); + } + + // Do the same for the second half of the BuildVector + ConstantSDNode *TopHalf = nullptr; + for (int i = NumElems / 2; i < NumElems; ++i) { + if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF) + continue; + + if (TopHalf == nullptr) + TopHalf = cast(Cond.getOperand(i)); + else if (Cond->getOperand(i).getNode() != TopHalf) + return SDValue(); + } + + assert(TopHalf && BottomHalf && + "One half of the selector was all UNDEFs and the other was all the " + "same value. This should have been addressed before this function."); + return DAG.getNode( + ISD::CONCAT_VECTORS, dl, VT, + BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0), + TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); +} + SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4659,6 +4718,17 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (ISD::isBuildVectorAllZeros(N0.getNode())) return N2; + // The ConvertSelectToConcatVector function is assuming both the above + // checks for (vselect (build_vector all{ones,zeros) ...) have been made + // and addressed. + if (N1.getOpcode() == ISD::CONCAT_VECTORS && + N2.getOpcode() == ISD::CONCAT_VECTORS && + ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { + SDValue CV = ConvertSelectToConcatVector(N, DAG); + if (CV.getNode()) + return CV; + } + return SDValue(); } @@ -6955,11 +7025,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { } // The next optimizations are desirable only if SELECT_CC can be lowered. - // Check against MVT::Other for SELECT_CC, which is a workaround for targets - // having to say they don't support SELECT_CC on every type the DAG knows - // about, since there is no way to mark an opcode illegal at all value types - // (See also visitSELECT) - if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) { + if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && @@ -7012,11 +7078,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { } // The next optimizations are desirable only if SELECT_CC can be lowered. - // Check against MVT::Other for SELECT_CC, which is a workaround for targets - // having to say they don't support SELECT_CC on every type the DAG knows - // about, since there is no way to mark an opcode illegal at all value types - // (See also visitSELECT) - if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) { + if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && @@ -7849,17 +7911,6 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { return false; } -/// \brief Return the base-pointer arithmetic from an indexed \p LD. 
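[Editorial sketch, not part of the patch.] The fold implemented above says: if the condition's bottom half is uniformly one constant and its top half another (undef lanes ignored), then selecting between two concatenated vectors is just concatenating the chosen halves. Acted out on plain arrays:

// Illustrative only: ConvertSelectToConcatVector on 8 lanes (halves of 4).
#include <array>
#include <cassert>

int main() {
  // vselect condition: bottom half all-true, top half all-false.
  std::array<bool, 8> Cond = {true, true, true, true, false, false, false, false};

  // LHS = concat(A0, A1), RHS = concat(B0, B1).
  std::array<int, 8> LHS = {1, 2, 3, 4, 5, 6, 7, 8};
  std::array<int, 8> RHS = {10, 20, 30, 40, 50, 60, 70, 80};

  // Lane-by-lane select ...
  std::array<int, 8> Lanes;
  for (int i = 0; i < 8; ++i)
    Lanes[i] = Cond[i] ? LHS[i] : RHS[i];

  // ... equals concatenating the chosen halves: concat(A0, B1).
  std::array<int, 8> Folded = {1, 2, 3, 4, 50, 60, 70, 80};
  assert(Lanes == Folded);
}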
-SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { - ISD::MemIndexedMode AM = LD->getAddressingMode(); - assert(AM != ISD::UNINDEXED); - SDValue BP = LD->getOperand(1); - SDValue Inc = LD->getOperand(2); - unsigned Opc = - (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); - return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); -} - SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); @@ -7896,16 +7947,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } else { // Indexed loads. assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); - if (!N->hasAnyUseOfValue(0)) { + if (!N->hasAnyUseOfValue(0) && !N->hasAnyUseOfValue(1)) { SDValue Undef = DAG.getUNDEF(N->getValueType(0)); - SDValue Index; - if (N->hasAnyUseOfValue(1)) { - Index = SplitIndexingFromLoad(LD); - // Try to fold the base pointer arithmetic into subsequent loads and - // stores. - AddUsersToWorkList(N); - } else - Index = DAG.getUNDEF(N->getValueType(1)); DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); dbgs() << "\nWith: "; @@ -7913,7 +7956,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << " and 2 other values\n"); WorkListRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), + DAG.getUNDEF(N->getValueType(1))); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); removeFromWorkList(N); DAG.DeleteNode(N); @@ -9666,6 +9710,27 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return SDValue(); unsigned Elt = cast(EltNo)->getZExtValue(); + // Canonicalize insert_vector_elt dag nodes. + // Example: + // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1) + // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0) + // + // Do this only if the child insert_vector node has one use; also + // do this only if indices are both constants and Idx1 < Idx0. + if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() + && isa(InVec.getOperand(2))) { + unsigned OtherElt = + cast(InVec.getOperand(2))->getZExtValue(); + if (Elt < OtherElt) { + // Swap nodes. + SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VT, + InVec.getOperand(0), InVal, EltNo); + AddToWorkList(NewOp.getNode()); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()), + VT, NewOp, InVec.getOperand(1), InVec.getOperand(2)); + } + } + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. 
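[Editorial sketch, not part of the patch.] The INSERT_VECTOR_ELT canonicalization above is legal because two inserts at *different* constant lanes commute, so the DAG may always order nested inserts by lane index and expose more CSE. A 4-lane demonstration:

// Illustrative only: nested inserts at distinct lanes commute.
#include <array>
#include <cassert>

std::array<int, 4> insertElt(std::array<int, 4> V, int Val, unsigned Idx) {
  V[Idx] = Val;     // value-semantics stand-in for ISD::INSERT_VECTOR_ELT
  return V;
}

int main() {
  std::array<int, 4> A = {0, 0, 0, 0};

  // (insert (insert A, 7 at lane 3), 9 at lane 1)
  auto X = insertElt(insertElt(A, 7, 3), 9, 1);
  // canonicalized: (insert (insert A, 9 at lane 1), 7 at lane 3)
  auto Y = insertElt(insertElt(A, 9, 1), 7, 3);

  // Same result either way, so picking one fixed order is safe.
  assert(X == Y);
}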
@@ -9698,6 +9763,86 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } +SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( + SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { + EVT ResultVT = EVE->getValueType(0); + EVT VecEltVT = InVecVT.getVectorElementType(); + unsigned Align = OriginalLoad->getAlignment(); + unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + VecEltVT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) + return SDValue(); + + Align = NewAlign; + + SDValue NewPtr = OriginalLoad->getBasePtr(); + SDValue Offset; + EVT PtrType = NewPtr.getValueType(); + MachinePointerInfo MPI; + if (auto *ConstEltNo = dyn_cast(EltNo)) { + int Elt = ConstEltNo->getZExtValue(); + unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; + if (TLI.isBigEndian()) + PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff; + Offset = DAG.getConstant(PtrOff, PtrType); + MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); + } else { + Offset = DAG.getNode( + ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo, + DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType())); + if (TLI.isBigEndian()) + Offset = DAG.getNode( + ISD::SUB, SDLoc(EVE), EltNo.getValueType(), + DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()), Offset); + MPI = OriginalLoad->getPointerInfo(); + } + NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset); + + // The replacement we need to do here is a little tricky: we need to + // replace an extractelement of a load with a load. + // Use ReplaceAllUsesOfValuesWith to do the replacement. + // Note that this replacement assumes that the extractvalue is the only + // use of the load; that's okay because we don't want to perform this + // transformation in other cases anyway. + SDValue Load; + SDValue Chain; + if (ResultVT.bitsGT(VecEltVT)) { + // If the result type of vextract is wider than the load, then issue an + // extending load instead. + ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT) + ? ISD::ZEXTLOAD + : ISD::EXTLOAD; + Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), + NewPtr, MPI, VecEltVT, OriginalLoad->isVolatile(), + OriginalLoad->isNonTemporal(), Align, + OriginalLoad->getTBAAInfo()); + Chain = Load.getValue(1); + } else { + Load = DAG.getLoad( + VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, + OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(), + OriginalLoad->isInvariant(), Align, OriginalLoad->getTBAAInfo()); + Chain = Load.getValue(1); + if (ResultVT.bitsLT(VecEltVT)) + Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); + else + Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load); + } + WorkListRemover DeadNodes(*this); + SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; + SDValue To[] = { Load, Chain }; + DAG.ReplaceAllUsesOfValuesWith(From, To, 2); + // Since we're explicitly calling ReplaceAllUses, add the new node to the + // worklist explicitly as well. + AddToWorkList(Load.getNode()); + AddUsersToWorkList(Load.getNode()); // Add users too + // Make sure to revisit this node to clean it up; it will usually be dead. 
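[Editorial sketch, not part of the patch.] For a constant index, the narrowed load above replaces "load the whole vector, then extract lane Elt" with a scalar load at a fixed byte offset from the same base pointer. A stand-alone check of the little-endian offset arithmetic (the big-endian mirroring done in the real code is not modeled):

// Illustrative only: the constant-index byte offset used by the combine above.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Pretend this buffer is the memory backing a <4 x i32> vector load.
  uint32_t Mem[4] = {11, 22, 33, 44};
  const unsigned EltBits = 32;
  const unsigned Elt = 2;                      // lane to extract

  // PtrOff = VecEltVT.getSizeInBits() * Elt / 8 in the combine above.
  const unsigned PtrOff = EltBits * Elt / 8;   // 8 bytes

  uint32_t Loaded;
  std::memcpy(&Loaded, reinterpret_cast<const char *>(Mem) + PtrOff, sizeof(Loaded));

  // The narrow load observes exactly the requested lane.
  assert(Loaded == 33);
}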
+ AddToWorkList(EVE); + ++OpsNarrowed; + return SDValue(EVE, 0); +} + SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); @@ -9766,6 +9911,38 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } } + bool BCNumEltsChanged = false; + EVT ExtVT = VT.getVectorElementType(); + EVT LVT = ExtVT; + + // If the result of load has to be truncated, then it's not necessarily + // profitable. + if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) + return SDValue(); + + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + + EVT BCVT = InVec.getOperand(0).getValueType(); + if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) + return SDValue(); + if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) + BCNumEltsChanged = true; + InVec = InVec.getOperand(0); + ExtVT = BCVT.getVectorElementType(); + } + + // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) + if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && + ISD::isNormalLoad(InVec.getNode())) { + SDValue Index = N->getOperand(1); + if (LoadSDNode *OrigLoad = dyn_cast(InVec)) + return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, + OrigLoad); + } + // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. if (!LegalOperations) return SDValue(); @@ -9776,30 +9953,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ConstEltNo) { int Elt = cast(EltNo)->getZExtValue(); - bool NewLoad = false; - bool BCNumEltsChanged = false; - EVT ExtVT = VT.getVectorElementType(); - EVT LVT = ExtVT; - - // If the result of load has to be truncated, then it's not necessarily - // profitable. - if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) - return SDValue(); - - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - - EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) - return SDValue(); - if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) - BCNumEltsChanged = true; - InVec = InVec.getOperand(0); - ExtVT = BCVT.getVectorElementType(); - NewLoad = true; - } LoadSDNode *LN0 = nullptr; const ShuffleVectorSDNode *SVN = nullptr; @@ -9842,6 +9995,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; + EltNo = DAG.getConstant(Elt, EltNo.getValueType()); } } @@ -9854,72 +10008,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (Elt == -1) return DAG.getUNDEF(LVT); - unsigned Align = LN0->getAlignment(); - if (NewLoad) { - // Check the resultant load doesn't need a higher alignment than the - // original load. 
- unsigned NewAlign = - TLI.getDataLayout() - ->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT)) - return SDValue(); - - Align = NewAlign; - } - - SDValue NewPtr = LN0->getBasePtr(); - unsigned PtrOff = 0; - - if (Elt) { - PtrOff = LVT.getSizeInBits() * Elt / 8; - EVT PtrType = NewPtr.getValueType(); - if (TLI.isBigEndian()) - PtrOff = VT.getSizeInBits() / 8 - PtrOff; - NewPtr = DAG.getNode(ISD::ADD, SDLoc(N), PtrType, NewPtr, - DAG.getConstant(PtrOff, PtrType)); - } - - // The replacement we need to do here is a little tricky: we need to - // replace an extractelement of a load with a load. - // Use ReplaceAllUsesOfValuesWith to do the replacement. - // Note that this replacement assumes that the extractvalue is the only - // use of the load; that's okay because we don't want to perform this - // transformation in other cases anyway. - SDValue Load; - SDValue Chain; - if (NVT.bitsGT(LVT)) { - // If the result type of vextract is wider than the load, then issue an - // extending load instead. - ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT) - ? ISD::ZEXTLOAD : ISD::EXTLOAD; - Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(), - NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), - LVT, LN0->isVolatile(), LN0->isNonTemporal(), - Align, LN0->getTBAAInfo()); - Chain = Load.getValue(1); - } else { - Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr, - LN0->getPointerInfo().getWithOffset(PtrOff), - LN0->isVolatile(), LN0->isNonTemporal(), - LN0->isInvariant(), Align, LN0->getTBAAInfo()); - Chain = Load.getValue(1); - if (NVT.bitsLT(LVT)) - Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load); - else - Load = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, Load); - } - WorkListRemover DeadNodes(*this); - SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) }; - SDValue To[] = { Load, Chain }; - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); - // Since we're explcitly calling ReplaceAllUses, add the new node to the - // worklist explicitly as well. - AddToWorkList(Load.getNode()); - AddUsersToWorkList(Load.getNode()); // Add users too - // Make sure to revisit this node to clean it up; it will usually be dead. - AddToWorkList(N); - return SDValue(N, 0); + return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); } return SDValue(); @@ -10729,6 +10818,27 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops); } + // Type legalization might introduce new shuffles in the DAG. + // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) + // -> (shuffle (VBinOp (A, B)), Undef, Mask). 
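[Editorial sketch, not part of the patch.] The SimplifyVBinOp fold described just above is sound because shuffling both operands by the same mask and then applying the binop lane-wise gives the same lanes as applying the binop first and shuffling once. Checked on arrays:

// Illustrative only: (binop (shuffle A, M), (shuffle B, M)) == (shuffle (binop A, B), M).
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<int, 4> Mask = {3, 1, 2, 0};          // identical mask on both sides

  // Two shuffles, then the binop.
  std::array<int, 4> ShufThenAdd;
  for (int i = 0; i < 4; ++i)
    ShufThenAdd[i] = A[Mask[i]] + B[Mask[i]];

  // The binop first, then a single shuffle.
  std::array<int, 4> Sum, AddThenShuf;
  for (int i = 0; i < 4; ++i)
    Sum[i] = A[i] + B[i];
  for (int i = 0; i < 4; ++i)
    AddThenShuf[i] = Sum[Mask[i]];

  assert(ShufThenAdd == AddThenShuf);
}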
+ if (LegalTypes && isa(LHS) && + isa(RHS) && LHS.hasOneUse() && RHS.hasOneUse() && + LHS.getOperand(1).getOpcode() == ISD::UNDEF && + RHS.getOperand(1).getOpcode() == ISD::UNDEF) { + ShuffleVectorSDNode *SVN0 = cast(LHS); + ShuffleVectorSDNode *SVN1 = cast(RHS); + + if (SVN0->getMask().equals(SVN1->getMask())) { + EVT VT = N->getValueType(0); + SDValue UndefVector = LHS.getOperand(1); + SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, + LHS.getOperand(0), RHS.getOperand(0)); + AddUsersToWorkList(N); + return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, + &SVN0->getMask()[0]); + } + } + return SDValue(); } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 99931c14ebcb..f7da4d546d87 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -42,12 +42,15 @@ #include "llvm/CodeGen/FastISel.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" @@ -558,6 +561,36 @@ bool FastISel::SelectGetElementPtr(const User *I) { return true; } +/// \brief Add a stack map intrinsic call's live variable operands to a stackmap +/// or patchpoint machine instruction. +/// +bool FastISel::addStackMapLiveVars(SmallVectorImpl &Ops, + const CallInst *CI, unsigned StartIdx) { + for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) { + Value *Val = CI->getArgOperand(i); + if (auto *C = dyn_cast(Val)) { + Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp)); + Ops.push_back(MachineOperand::CreateImm(C->getSExtValue())); + } else if (isa(Val)) { + Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp)); + Ops.push_back(MachineOperand::CreateImm(0)); + } else if (auto *AI = dyn_cast(Val)) { + auto SI = FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) + Ops.push_back(MachineOperand::CreateFI(SI->second)); + else + return false; + } else { + unsigned Reg = getRegForValue(Val); + if (Reg == 0) + return false; + Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false)); + } + } + + return true; +} + bool FastISel::SelectCall(const User *I) { const CallInst *Call = cast(I); @@ -713,6 +746,76 @@ bool FastISel::SelectCall(const User *I) { UpdateValueMap(Call, ResultReg); return true; } + case Intrinsic::experimental_stackmap: { + // void @llvm.experimental.stackmap(i64 , i32 , + // [live variables...]) + + assert(Call->getCalledFunction()->getReturnType()->isVoidTy() && + "Stackmap cannot return a value."); + + // The stackmap intrinsic only records the live variables (the arguments + // passed to it) and emits NOPS (if requested). Unlike the patchpoint + // intrinsic, this won't be lowered to a function call. This means we don't + // have to worry about calling conventions and target-specific lowering + // code. Instead we perform the call lowering right here. + // + // CALLSEQ_START(0) + // STACKMAP(id, nbytes, ...) + // CALLSEQ_END(0, 0) + // + + SmallVector Ops; + + // Add the and constants. 
+ assert(isa(Call->getOperand(PatchPointOpers::IDPos)) && + "Expected a constant integer."); + auto IDVal = cast(Call->getOperand(PatchPointOpers::IDPos)); + Ops.push_back(MachineOperand::CreateImm(IDVal->getZExtValue())); + + assert(isa(Call->getOperand(PatchPointOpers::NBytesPos)) && + "Expected a constant integer."); + auto NBytesVal = + cast(Call->getOperand(PatchPointOpers::NBytesPos)); + Ops.push_back(MachineOperand::CreateImm(NBytesVal->getZExtValue())); + + // Push live variables for the stack map. + if (!addStackMapLiveVars(Ops, Call, 2)) + return false; + + // We are not adding any register mask info here, because the stackmap + // doesn't clobber anything. + + // Add scratch registers as implicit def and early clobber. + CallingConv::ID CC = Call->getCallingConv(); + const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC); + for (unsigned i = 0; ScratchRegs[i]; ++i) + Ops.push_back(MachineOperand::CreateReg( + ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, + /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(0); + + // Issue STACKMAP. + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::STACKMAP)); + + for (auto const &MO : Ops) + MIB.addOperand(MO); + + // Issue CALLSEQ_END + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) + .addImm(0).addImm(0); + + // Inform the Frame Information that we have a stackmap in this function. + FuncInfo.MF->getFrameInfo()->setHasStackMap(); + + return true; + } } // Usually, it does not make sense to initialize a value, @@ -879,7 +982,6 @@ FastISel::SelectInstruction(const Instruction *I) { /// the CFG. void FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { - if (FuncInfo.MBB->getBasicBlock()->size() > 1 && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only instruction @@ -890,7 +992,11 @@ FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { TII.InsertBranch(*FuncInfo.MBB, MSucc, nullptr, SmallVector(), DbgLoc); } - FuncInfo.MBB->addSuccessor(MSucc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(FuncInfo.MBB->getBasicBlock(), + MSucc->getBasicBlock()); + FuncInfo.MBB->addSuccessor(MSucc, BranchWeight); } /// SelectFNeg - Emit an FNeg operation. 
@@ -1635,3 +1741,47 @@ bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) { return isa(cast(Add)->getOperand(1)); } +MachineMemOperand * +FastISel::createMachineMemOperandFor(const Instruction *I) const { + const Value *Ptr; + Type *ValTy; + unsigned Alignment; + unsigned Flags; + bool IsVolatile; + + if (const auto *LI = dyn_cast(I)) { + Alignment = LI->getAlignment(); + IsVolatile = LI->isVolatile(); + Flags = MachineMemOperand::MOLoad; + Ptr = LI->getPointerOperand(); + ValTy = LI->getType(); + } else if (const auto *SI = dyn_cast(I)) { + Alignment = SI->getAlignment(); + IsVolatile = SI->isVolatile(); + Flags = MachineMemOperand::MOStore; + Ptr = SI->getPointerOperand(); + ValTy = SI->getValueOperand()->getType(); + } else { + return nullptr; + } + + bool IsNonTemporal = I->getMetadata("nontemporal") != nullptr; + bool IsInvariant = I->getMetadata("invariant.load") != nullptr; + const MDNode *TBAAInfo = I->getMetadata(LLVMContext::MD_tbaa); + const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range); + + if (Alignment == 0) // Ensure that codegen never sees alignment 0. + Alignment = DL.getABITypeAlignment(ValTy); + + unsigned Size = TM.getDataLayout()->getTypeStoreSize(ValTy); + + if (IsVolatile) + Flags |= MachineMemOperand::MOVolatile; + if (IsNonTemporal) + Flags |= MachineMemOperand::MONonTemporal; + if (IsInvariant) + Flags |= MachineMemOperand::MOInvariant; + + return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size, + Alignment, TBAAInfo, Ranges); +} diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index a59e8954eb7c..a4245a6016c2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2650,12 +2650,15 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1); assert(NewOutTy.isInteger() && "Ran out of possibilities!"); + // A larger signed type can hold all unsigned values of the requested type, + // so using FP_TO_SINT is valid if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) { OpToUse = ISD::FP_TO_SINT; break; } - if (TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) { + // However, if the value may be < 0.0, we *must* use some FP_TO_SINT. + if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) { OpToUse = ISD::FP_TO_UINT; break; } @@ -3007,14 +3010,14 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::ATOMIC_LOAD: { // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP. 
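The comment above is the whole trick: a compare-and-swap whose expected and new values are both zero never changes memory, yet still hands back the value it observed. A sketch of the same idea with std::atomic (illustration only, not the emitted libcall):

#include <atomic>
#include <cstdint>

// cmpxchg(addr, 0, 0): if *addr is 0 it is "replaced" by 0, otherwise the
// swap fails; either way the value observed at *addr comes back in Expected.
int64_t atomicLoadViaCAS(std::atomic<int64_t> &Addr) {
  int64_t Expected = 0;
  Addr.compare_exchange_strong(Expected, 0);
  return Expected;
}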
SDValue Zero = DAG.getConstant(0, Node->getValueType(0)); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, - cast(Node)->getMemoryVT(), - Node->getOperand(0), - Node->getOperand(1), Zero, Zero, - cast(Node)->getMemOperand(), - cast(Node)->getOrdering(), - cast(Node)->getOrdering(), - cast(Node)->getSynchScope()); + SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other); + SDValue Swap = DAG.getAtomicCmpSwap( + ISD::ATOMIC_CMP_SWAP, dl, cast(Node)->getMemoryVT(), VTs, + Node->getOperand(0), Node->getOperand(1), Zero, Zero, + cast(Node)->getMemOperand(), + cast(Node)->getOrdering(), + cast(Node)->getOrdering(), + cast(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); Results.push_back(Swap.getValue(1)); break; @@ -3051,6 +3054,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp.second); break; } + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { + // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and + // splits out the success value as a comparison. Expanding the resulting + // ATOMIC_CMP_SWAP will produce a libcall. + SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other); + SDValue Res = DAG.getAtomicCmpSwap( + ISD::ATOMIC_CMP_SWAP, dl, cast(Node)->getMemoryVT(), VTs, + Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), + Node->getOperand(3), cast(Node)->getMemOperand(), + cast(Node)->getSuccessOrdering(), + cast(Node)->getFailureOrdering(), + cast(Node)->getSynchScope()); + + SDValue Success = DAG.getSetCC(SDLoc(Node), Node->getValueType(1), + Res, Node->getOperand(2), ISD::SETEQ); + + Results.push_back(Res.getValue(0)); + Results.push_back(Success); + Results.push_back(Res.getValue(1)); + break; + } case ISD::DYNAMIC_STACKALLOC: ExpandDYNAMIC_STACKALLOC(Node, Results); break; @@ -3128,6 +3152,65 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Node->getOperand(0), Node->getValueType(0), dl); Results.push_back(Tmp1); break; + case ISD::FP_TO_SINT: { + EVT VT = Node->getOperand(0).getValueType(); + EVT NVT = Node->getValueType(0); + + // FIXME: Only f32 to i64 conversions are supported. 
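The expansion added below is a DAG transcription of compiler-rt's fixsfdi. As a plain scalar model of what the emitted nodes compute (assuming round-toward-zero; NaN and out-of-range inputs are undefined here, as they are for the libcall):

#include <cstdint>
#include <cstring>

// f32 -> i64 by bit manipulation: unpack sign, exponent and mantissa, restore
// the implicit leading one, shift by (exponent - 23), then conditionally
// negate using the all-ones/zero sign word.
int64_t fptosi_f32_i64(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  int32_t Exponent = (int32_t)((Bits & 0x7F800000u) >> 23) - 127; // unbias
  int64_t Sign = (Bits & 0x80000000u) ? -1 : 0; // the DAG gets this via SRA
  int64_t R = (Bits & 0x007FFFFFu) | 0x00800000u; // mantissa with hidden bit

  if (Exponent < 0)                 // |F| < 1.0 truncates to 0
    return 0;
  if (Exponent > 23)                // shift the mantissa into position
    R <<= (Exponent - 23);
  else
    R >>= (23 - Exponent);
  return (R ^ Sign) - Sign;         // negate when the sign bit was set
}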
+ if (VT != MVT::f32 || NVT != MVT::i64) + break; + + // Expand f32 -> i64 conversion + // This algorithm comes from compiler-rt's implementation of fixsfdi: + // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), + VT.getSizeInBits()); + SDValue ExponentMask = DAG.getConstant(0x7F800000, IntVT); + SDValue ExponentLoBit = DAG.getConstant(23, IntVT); + SDValue Bias = DAG.getConstant(127, IntVT); + SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()), + IntVT); + SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, IntVT); + SDValue MantissaMask = DAG.getConstant(0x007FFFFF, IntVT); + + SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0)); + + SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT, + DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask), + DAG.getZExtOrTrunc(ExponentLoBit, dl, TLI.getShiftAmountTy(IntVT))); + SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias); + + SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT, + DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask), + DAG.getZExtOrTrunc(SignLowBit, dl, TLI.getShiftAmountTy(IntVT))); + Sign = DAG.getSExtOrTrunc(Sign, dl, NVT); + + SDValue R = DAG.getNode(ISD::OR, dl, IntVT, + DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask), + DAG.getConstant(0x00800000, IntVT)); + + R = DAG.getZExtOrTrunc(R, dl, NVT); + + + R = DAG.getSelectCC(dl, Exponent, ExponentLoBit, + DAG.getNode(ISD::SHL, dl, NVT, R, + DAG.getZExtOrTrunc( + DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit), + dl, TLI.getShiftAmountTy(IntVT))), + DAG.getNode(ISD::SRL, dl, NVT, R, + DAG.getZExtOrTrunc( + DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent), + dl, TLI.getShiftAmountTy(IntVT))), + ISD::SETGT); + + SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT, + DAG.getNode(ISD::XOR, dl, NVT, R, Sign), + Sign); + + Results.push_back(DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, IntVT), + DAG.getConstant(0, NVT), Ret, ISD::SETLT)); + break; + } case ISD::FP_TO_UINT: { SDValue True, False; EVT VT = Node->getOperand(0).getValueType(); @@ -3653,7 +3736,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); Results.push_back(Sum); - EVT OType = Node->getValueType(1); + EVT ResultType = Node->getValueType(1); + EVT OType = getSetCCResultType(Node->getValueType(0)); SDValue Zero = DAG.getConstant(0, LHS.getValueType()); @@ -3676,7 +3760,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); - Results.push_back(Cmp); + Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType)); break; } case ISD::UADDO: @@ -3687,9 +3771,14 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); Results.push_back(Sum); - Results.push_back(DAG.getSetCC(dl, Node->getValueType(1), Sum, LHS, - Node->getOpcode () == ISD::UADDO ? - ISD::SETULT : ISD::SETUGT)); + + EVT ResultType = Node->getValueType(1); + EVT SetCCType = getSetCCResultType(Node->getValueType(0)); + ISD::CondCode CC + = Node->getOpcode() == ISD::UADDO ? 
ISD::SETULT : ISD::SETUGT; + SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC); + + Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType)); break; } case ISD::UMULO: @@ -3899,13 +3988,29 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp2 = Node->getOperand(1); // RHS Tmp3 = Node->getOperand(2); // True Tmp4 = Node->getOperand(3); // False + EVT VT = Node->getValueType(0); SDValue CC = Node->getOperand(4); + ISD::CondCode CCOp = cast(CC)->get(); + + if (TLI.isCondCodeLegal(CCOp, Tmp1.getSimpleValueType())) { + // If the condition code is legal, then we need to expand this + // node using SETCC and SELECT. + EVT CmpVT = Tmp1.getValueType(); + assert(!TLI.isOperationExpand(ISD::SELECT, VT) && + "Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be " + "expanded."); + EVT CCVT = TLI.getSetCCResultType(*DAG.getContext(), CmpVT); + SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC); + Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4)); + break; + } + // SELECT_CC is legal, so the condition code must not be. bool Legalized = false; // Try to legalize by inverting the condition. This is for targets that // might support an ordered version of a condition, but not the unordered // version (or vice versa). - ISD::CondCode InvCC = ISD::getSetCCInverse(cast(CC)->get(), + ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp, Tmp1.getValueType().isInteger()); if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) { // Use the new condition code and swap true and false diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 2483184deedc..a8603423e32a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -138,7 +138,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_Atomic1(cast(N)); break; case ISD::ATOMIC_CMP_SWAP: - Res = PromoteIntRes_Atomic2(cast(N)); break; + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); + break; } // If the result is null then the sub-method took care of registering it. @@ -192,16 +194,41 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { return Res; } -SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, + unsigned ResNo) { + if (ResNo == 1) { + assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + EVT SVT = getSetCCResultType(N->getOperand(2).getValueType()); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); + + // Only use the result of getSetCCResultType if it is legal, + // otherwise just use the promoted result type (NVT). 
+ if (!TLI.isTypeLegal(SVT)) + SVT = NVT; + + SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other); + SDValue Res = DAG.getAtomicCmpSwap( + ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs, + N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3), + N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(), + N->getSynchScope()); + ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); + ReplaceValueWith(SDValue(N, 2), Res.getValue(2)); + return Res.getValue(1); + } + SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Op3 = GetPromotedInteger(N->getOperand(3)); - SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), - N->getChain(), N->getBasePtr(), Op2, Op3, - N->getMemOperand(), N->getSuccessOrdering(), - N->getFailureOrdering(), N->getSynchScope()); + SDVTList VTs = + DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other); + SDValue Res = DAG.getAtomicCmpSwap( + N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), + N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(), + N->getFailureOrdering(), N->getSynchScope()); // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + unsigned ChainOp = N->getNumValues() - 1; + ReplaceValueWith(SDValue(N, ChainOp), Res.getValue(ChainOp)); return Res; } @@ -1143,6 +1170,26 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { ReplaceValueWith(SDValue(N, 1), Tmp.second); break; } + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { + AtomicSDNode *AN = cast(N); + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other); + SDValue Tmp = DAG.getAtomicCmpSwap( + ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs, + N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), + AN->getMemOperand(), AN->getSuccessOrdering(), AN->getFailureOrdering(), + AN->getSynchScope()); + + // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine + // success simply by comparing the loaded value against the ingoing + // comparison. 
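In source terms, the comparison that comment describes looks like this; std::atomic's strong CAS stands in for the value-returning ATOMIC_CMP_SWAP, and the helper names are illustrative:

#include <atomic>
#include <cstdint>

// A value-returning CAS: the value that was actually in memory comes back,
// whether or not the swap happened.
int64_t casValue(std::atomic<int64_t> &Addr, int64_t Expected, int64_t Desired) {
  Addr.compare_exchange_strong(Expected, Desired);
  return Expected;
}

// The success flag is recovered afterwards with a single equality test, the
// SETEQ the expansion inserts. A strong CAS cannot fail spuriously, so
// "loaded == expected" is exactly "the swap happened".
bool casWithSuccess(std::atomic<int64_t> &Addr, int64_t Expected,
                    int64_t Desired, int64_t &Loaded) {
  Loaded = casValue(Addr, Expected, Desired);
  return Loaded == Expected;
}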
+ SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp, + N->getOperand(2), ISD::SETEQ); + + SplitInteger(Tmp, Lo, Hi); + ReplaceValueWith(SDValue(N, 1), Success); + ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1)); + break; + } case ISD::AND: case ISD::OR: @@ -2388,16 +2435,18 @@ void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = cast(N)->getMemoryVT(); + SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); SDValue Zero = DAG.getConstant(0, VT); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, - N->getOperand(0), - N->getOperand(1), Zero, Zero, - cast(N)->getMemOperand(), - cast(N)->getOrdering(), - cast(N)->getOrdering(), - cast(N)->getSynchScope()); + SDValue Swap = DAG.getAtomicCmpSwap( + ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, + cast(N)->getMemoryVT(), VTs, N->getOperand(0), + N->getOperand(1), Zero, Zero, cast(N)->getMemOperand(), + cast(N)->getOrdering(), + cast(N)->getOrdering(), + cast(N)->getSynchScope()); + ReplaceValueWith(SDValue(N, 0), Swap.getValue(0)); - ReplaceValueWith(SDValue(N, 1), Swap.getValue(1)); + ReplaceValueWith(SDValue(N, 1), Swap.getValue(2)); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index e4bbc78fa71a..04c200c9528c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -220,7 +220,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_AssertZext(SDNode *N); SDValue PromoteIntRes_Atomic0(AtomicSDNode *N); SDValue PromoteIntRes_Atomic1(AtomicSDNode *N); - SDValue PromoteIntRes_Atomic2(AtomicSDNode *N); + SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo); SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N); SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 78ec4df95f54..13cfae7515b8 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -170,7 +170,8 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes { if (DisableSchedCycles || !NeedLatency) HazardRec = new ScheduleHazardRecognizer(); else - HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(&tm, this); + HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer( + tm.getSubtargetImpl(), this); } ~ScheduleDAGRRList() { diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index 51c51d6925be..4589b0c35dc3 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -73,7 +73,8 @@ class ScheduleDAGVLIW : public ScheduleDAGSDNodes { : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) { const TargetMachine &tm = mf.getTarget(); - HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(&tm, this); + HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer( + tm.getSubtargetImpl(), this); } ~ScheduleDAGVLIW() { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b1b8035a7d9a..639eb462e7ff 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -48,6 +48,7 @@ #include "llvm/Target/TargetSelectionDAGInfo.h" #include #include + using namespace llvm; /// makeVTList - 
Return an instance of the SDVTList struct initialized with the @@ -381,6 +382,20 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID, } } +static void AddBinaryNodeIDCustom(FoldingSetNodeID &ID, bool nuw, bool nsw, + bool exact) { + ID.AddBoolean(nuw); + ID.AddBoolean(nsw); + ID.AddBoolean(exact); +} + +/// AddBinaryNodeIDCustom - Add BinarySDNodes special infos +static void AddBinaryNodeIDCustom(FoldingSetNodeID &ID, unsigned Opcode, + bool nuw, bool nsw, bool exact) { + if (isBinOpWithFlags(Opcode)) + AddBinaryNodeIDCustom(ID, nuw, nsw, exact); +} + static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, SDVTList VTList, ArrayRef OpList) { AddNodeIDOpcode(ID, OpC); @@ -473,7 +488,21 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(ST->getPointerInfo().getAddrSpace()); break; } + case ISD::SDIV: + case ISD::UDIV: + case ISD::SRA: + case ISD::SRL: + case ISD::MUL: + case ISD::ADD: + case ISD::SUB: + case ISD::SHL: { + const BinaryWithFlagsSDNode *BinNode = cast(N); + AddBinaryNodeIDCustom(ID, N->getOpcode(), BinNode->hasNoUnsignedWrap(), + BinNode->hasNoSignedWrap(), BinNode->isExact()); + break; + } case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: @@ -926,6 +955,25 @@ void SelectionDAG::allnodes_clear() { DeallocateNode(AllNodes.begin()); } +BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL, + SDVTList VTs, SDValue N1, + SDValue N2, bool nuw, bool nsw, + bool exact) { + if (isBinOpWithFlags(Opcode)) { + BinaryWithFlagsSDNode *FN = new (NodeAllocator) BinaryWithFlagsSDNode( + Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2); + FN->setHasNoUnsignedWrap(nuw); + FN->setHasNoSignedWrap(nsw); + FN->setIsExact(exact); + + return FN; + } + + BinarySDNode *N = new (NodeAllocator) + BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2); + return N; +} + void SelectionDAG::clear() { allnodes_clear(); OperandAllocator.Reset(); @@ -1190,15 +1238,8 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, if (BitWidth < 64) Offset = SignExtend64(Offset, BitWidth); - const GlobalVariable *GVar = dyn_cast(GV); - if (!GVar) { - // If GV is an alias then use the aliasee for determining thread-localness. - if (const GlobalAlias *GA = dyn_cast(GV)) - GVar = dyn_cast_or_null(GA->getAliasee()); - } - unsigned Opc; - if (GVar && GVar->isThreadLocal()) + if (GV->isThreadLocal()) Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress; else Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress; @@ -2192,8 +2233,11 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - KnownZero |= ~LowBits; - computeKnownBits(Op.getOperand(0), KnownZero, KnownOne,Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1); + + // The upper bits are all zero, the lower ones are unchanged. 
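The rule in the comment above, restated on plain masks: x urem 2^k is x & (2^k - 1), so every bit at or above position k becomes known zero and the low k bits inherit whatever was already known about x. A small model (the Known struct is illustrative, not an LLVM type):

#include <cassert>
#include <cstdint>

struct Known {       // a set bit means that bit of the value is known 0 / 1
  uint64_t Zero;
  uint64_t One;
};

// x % RA with RA a power of two is x & (RA - 1): everything above the mask is
// known zero, everything inside it is inherited from x.
Known uremByPow2(Known X, uint64_t RA) {
  uint64_t LowBits = RA - 1;
  return {X.Zero | ~LowBits, X.One & LowBits};
}

int main() {
  Known X{0x9, 0x6};              // low four bits known to be binary 0110
  Known R = uremByPow2(X, 8);     // x % 8
  assert(R.One == 0x6 && R.Zero == (0x9ULL | ~0x7ULL));
  return 0;
}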
+ KnownZero = KnownZero2 | ~LowBits; + KnownOne = KnownOne2 & LowBits; break; } } @@ -2940,7 +2984,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT, } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, - SDValue N2) { + SDValue N2, bool nuw, bool nsw, bool exact) { ConstantSDNode *N1C = dyn_cast(N1.getNode()); ConstantSDNode *N2C = dyn_cast(N2.getNode()); switch (Opcode) { @@ -3380,22 +3424,25 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } // Memoize this node if possible. - SDNode *N; + BinarySDNode *N; SDVTList VTs = getVTList(VT); + const bool BinOpHasFlags = isBinOpWithFlags(Opcode); if (VT != MVT::Glue) { - SDValue Ops[] = { N1, N2 }; + SDValue Ops[] = {N1, N2}; FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); + if (BinOpHasFlags) + AddBinaryNodeIDCustom(ID, Opcode, nuw, nsw, exact); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); - N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), - DL.getDebugLoc(), VTs, N1, N2); + N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact); + CSEMap.InsertNode(N, IP); } else { - N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), - DL.getDebugLoc(), VTs, N1, N2); + + N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact); } AllNodes.push_back(N); @@ -4281,51 +4328,47 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, Ordering, SynchScope); } -SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, - SDValue Chain, SDValue Ptr, SDValue Cmp, - SDValue Swp, MachinePointerInfo PtrInfo, - unsigned Alignment, - AtomicOrdering SuccessOrdering, - AtomicOrdering FailureOrdering, - SynchronizationScope SynchScope) { +SDValue SelectionDAG::getAtomicCmpSwap( + unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain, + SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo, + unsigned Alignment, AtomicOrdering SuccessOrdering, + AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { + assert(Opcode == ISD::ATOMIC_CMP_SWAP || + Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = getEVTAlignment(MemVT); MachineFunction &MF = getMachineFunction(); - // All atomics are load and store, except for ATMOIC_LOAD and ATOMIC_STORE. - // For now, atomics are considered to be volatile always. // FIXME: Volatile isn't really correct; we should keep track of atomic // orderings in the memoperand. 
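Stepping back to the flag plumbing earlier in this hunk: once nuw, nsw and exact are stored on binary nodes, they also have to be part of the CSE identity (which is what AddBinaryNodeIDCustom feeds into the FoldingSetNodeID), or an add nsw could be unified with a plain add of the same operands. A minimal sketch of a flag-aware key, using illustrative types rather than the FoldingSet API:

#include <cassert>
#include <functional>
#include <unordered_set>

struct BinKey {
  unsigned Opcode;
  const void *LHS, *RHS;
  bool NUW, NSW, Exact;            // the flags are part of the identity
  bool operator==(const BinKey &O) const {
    return Opcode == O.Opcode && LHS == O.LHS && RHS == O.RHS &&
           NUW == O.NUW && NSW == O.NSW && Exact == O.Exact;
  }
};

struct BinKeyHash {
  size_t operator()(const BinKey &K) const {
    size_t H = std::hash<const void *>()(K.LHS) ^
               (std::hash<const void *>()(K.RHS) << 1) ^ K.Opcode;
    return H ^ (K.NUW << 8) ^ (K.NSW << 9) ^ (K.Exact << 10);
  }
};

int main() {
  int A, B;                        // stand-ins for operand nodes
  std::unordered_set<BinKey, BinKeyHash> CSE;
  CSE.insert({/*opcode*/ 1, &A, &B, false, true, false});  // "add nsw"
  // A plain add of the same operands must be a different node.
  assert(!CSE.count({1, &A, &B, false, false, false}));
  return 0;
}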
unsigned Flags = MachineMemOperand::MOVolatile; - if (Opcode != ISD::ATOMIC_STORE) - Flags |= MachineMemOperand::MOLoad; - if (Opcode != ISD::ATOMIC_LOAD) - Flags |= MachineMemOperand::MOStore; + Flags |= MachineMemOperand::MOLoad; + Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment); - return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Cmp, Swp, MMO, - SuccessOrdering, FailureOrdering, SynchScope); + return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO, + SuccessOrdering, FailureOrdering, SynchScope); } -SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, - SDValue Chain, - SDValue Ptr, SDValue Cmp, - SDValue Swp, MachineMemOperand *MMO, - AtomicOrdering SuccessOrdering, - AtomicOrdering FailureOrdering, - SynchronizationScope SynchScope) { - assert(Opcode == ISD::ATOMIC_CMP_SWAP && "Invalid Atomic Op"); +SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, + SDVTList VTs, SDValue Chain, SDValue Ptr, + SDValue Cmp, SDValue Swp, + MachineMemOperand *MMO, + AtomicOrdering SuccessOrdering, + AtomicOrdering FailureOrdering, + SynchronizationScope SynchScope) { + assert(Opcode == ISD::ATOMIC_CMP_SWAP || + Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types"); - EVT VT = Cmp.getValueType(); - - SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr, Cmp, Swp}; - return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, SuccessOrdering, - FailureOrdering, SynchScope); + return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, + SuccessOrdering, FailureOrdering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, @@ -5610,10 +5653,13 @@ SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT, /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, - ArrayRef Ops) { - if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { + ArrayRef Ops, bool nuw, bool nsw, + bool exact) { + if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops); + if (isBinOpWithFlags(Opcode)) + AddBinaryNodeIDCustom(ID, nuw, nsw, exact); void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return E; @@ -5960,7 +6006,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() { // count of outstanding operands. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) { SDNode *N = I++; - checkForCycles(N); + checkForCycles(N, this); unsigned Degree = N->getNumOperands(); if (Degree == 0) { // A node with no uses, add it to the result array immediately. @@ -5980,7 +6026,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() { // such that by the time the end is reached all nodes will be sorted. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) { SDNode *N = I; - checkForCycles(N); + checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. 
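The comment above summarizes how AssignTopologicalOrder works: operand counts act as in-degrees, and placing a node leaves each of its users with one fewer unsorted operand. The same scheme on a plain adjacency structure (illustrative, no SelectionDAG types):

#include <cstddef>
#include <vector>

// Nodes are 0..N-1; Operands[U] lists the nodes U depends on. Nodes with no
// pending operands are appended to the order; placing a node decrements the
// pending count of each of its users. A result shorter than N means a cycle.
std::vector<int> topologicalOrder(const std::vector<std::vector<int>> &Operands) {
  size_t N = Operands.size();
  std::vector<int> Pending(N);
  std::vector<std::vector<int>> Users(N);
  std::vector<int> Order;

  for (size_t U = 0; U < N; ++U) {
    Pending[U] = (int)Operands[U].size();
    for (int Op : Operands[U])
      Users[Op].push_back((int)U);
    if (Operands[U].empty())
      Order.push_back((int)U);     // no operands: already in position
  }
  for (size_t I = 0; I < Order.size(); ++I)
    for (int U : Users[Order[I]])
      if (--Pending[U] == 0)
        Order.push_back(U);        // all of U's operands are placed
  return Order;
}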
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); @@ -6005,7 +6051,9 @@ unsigned SelectionDAG::AssignTopologicalOrder() { #ifndef NDEBUG SDNode *S = ++I; dbgs() << "Overran sorted position:\n"; - S->dumprFull(); + S->dumprFull(this); dbgs() << "\n"; + dbgs() << "Checking if this is due to cycles\n"; + checkForCycles(this, true); #endif llvm_unreachable(nullptr); } @@ -6591,10 +6639,11 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { return true; } -#ifdef XDEBUG +#ifndef NDEBUG static void checkForCyclesHelper(const SDNode *N, SmallPtrSet &Visited, - SmallPtrSet &Checked) { + SmallPtrSet &Checked, + const llvm::SelectionDAG *DAG) { // If this node has already been checked, don't check it again. if (Checked.count(N)) return; @@ -6602,29 +6651,37 @@ static void checkForCyclesHelper(const SDNode *N, // If a node has already been visited on this depth-first walk, reject it as // a cycle. if (!Visited.insert(N)) { - dbgs() << "Offending node:\n"; - N->dumprFull(); errs() << "Detected cycle in SelectionDAG\n"; + dbgs() << "Offending node:\n"; + N->dumprFull(DAG); dbgs() << "\n"; abort(); } for(unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked); + checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked, DAG); Checked.insert(N); Visited.erase(N); } #endif -void llvm::checkForCycles(const llvm::SDNode *N) { +void llvm::checkForCycles(const llvm::SDNode *N, + const llvm::SelectionDAG *DAG, + bool force) { +#ifndef NDEBUG + bool check = force; #ifdef XDEBUG - assert(N && "Checking nonexistent SDNode"); - SmallPtrSet visited; - SmallPtrSet checked; - checkForCyclesHelper(N, visited, checked); -#endif + check = true; +#endif // XDEBUG + if (check) { + assert(N && "Checking nonexistent SDNode"); + SmallPtrSet visited; + SmallPtrSet checked; + checkForCyclesHelper(N, visited, checked, DAG); + } +#endif // !NDEBUG } -void llvm::checkForCycles(const llvm::SelectionDAG *DAG) { - checkForCycles(DAG->getRoot().getNode()); +void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) { + checkForCycles(DAG->getRoot().getNode(), DAG, force); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c181046ba235..e6dc27219787 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2784,8 +2784,22 @@ void SelectionDAGBuilder::visitFSub(const User &I) { void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); - setValue(&I, DAG.getNode(OpCode, getCurSDLoc(), - Op1.getValueType(), Op1, Op2)); + + bool nuw = false; + bool nsw = false; + bool exact = false; + if (const OverflowingBinaryOperator *OFBinOp = + dyn_cast(&I)) { + nuw = OFBinOp->hasNoUnsignedWrap(); + nsw = OFBinOp->hasNoSignedWrap(); + } + if (const PossiblyExactOperator *ExactOp = + dyn_cast(&I)) + exact = ExactOp->isExact(); + + SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), + Op1, Op2, nuw, nsw, exact); + setValue(&I, BinNodeValue); } void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { @@ -2816,8 +2830,25 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32); } - setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), - Op1.getValueType(), Op1, Op2)); + bool nuw = false; + bool nsw = false; + bool exact = 
false; + + if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) { + + if (const OverflowingBinaryOperator *OFBinOp = + dyn_cast(&I)) { + nuw = OFBinOp->hasNoUnsignedWrap(); + nsw = OFBinOp->hasNoSignedWrap(); + } + if (const PossiblyExactOperator *ExactOp = + dyn_cast(&I)) + exact = ExactOp->isExact(); + } + + SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, + nuw, nsw, exact); + setValue(&I, Res); } void SelectionDAGBuilder::visitSDiv(const User &I) { @@ -3570,12 +3601,12 @@ static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order, if (Before) { if (Order == AcquireRelease || Order == SequentiallyConsistent) Order = Release; - else if (Order == Acquire || Order == Monotonic) + else if (Order == Acquire || Order == Monotonic || Order == Unordered) return Chain; } else { if (Order == AcquireRelease) Order = Acquire; - else if (Order == Release || Order == Monotonic) + else if (Order == Release || Order == Monotonic || Order == Unordered) return Chain; } SDValue Ops[3]; @@ -3598,19 +3629,17 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { InChain = InsertFenceForAtomic(InChain, SuccessOrder, Scope, true, dl, DAG, *TLI); - SDValue L = - DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, - getValue(I.getCompareOperand()).getSimpleValueType(), - InChain, - getValue(I.getPointerOperand()), - getValue(I.getCompareOperand()), - getValue(I.getNewValOperand()), - MachinePointerInfo(I.getPointerOperand()), 0 /* Alignment */, - TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder, - TLI->getInsertFencesForAtomic() ? Monotonic : FailureOrder, - Scope); + MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType(); + SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other); + SDValue L = DAG.getAtomicCmpSwap( + ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain, + getValue(I.getPointerOperand()), getValue(I.getCompareOperand()), + getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()), + 0 /* Alignment */, + TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder, + TLI->getInsertFencesForAtomic() ? Monotonic : FailureOrder, Scope); - SDValue OutChain = L.getValue(1); + SDValue OutChain = L.getValue(2); if (TLI->getInsertFencesForAtomic()) OutChain = InsertFenceForAtomic(OutChain, SuccessOrder, Scope, false, dl, @@ -5410,6 +5439,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool isTailCall, MachineBasicBlock *LandingPad) { + const TargetLowering *TLI = TM.getTargetLowering(); PointerType *PT = cast(CS.getCalledValue()->getType()); FunctionType *FTy = cast(PT->getElementType()); Type *RetTy = FTy->getReturnType(); @@ -5420,45 +5450,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, TargetLowering::ArgListEntry Entry; Args.reserve(CS.arg_size()); - // Check whether the function can return without sret-demotion. 
- SmallVector Outs; - const TargetLowering *TLI = TM.getTargetLowering(); - GetReturnInfo(RetTy, CS.getAttributes(), Outs, *TLI); - - bool CanLowerReturn = TLI->CanLowerReturn(CS.getCallingConv(), - DAG.getMachineFunction(), - FTy->isVarArg(), Outs, - FTy->getContext()); - - SDValue DemoteStackSlot; - int DemoteStackIdx = -100; - - if (!CanLowerReturn) { - assert(!CS.hasInAllocaArgument() && - "sret demotion is incompatible with inalloca"); - uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize( - FTy->getReturnType()); - unsigned Align = TLI->getDataLayout()->getPrefTypeAlignment( - FTy->getReturnType()); - MachineFunction &MF = DAG.getMachineFunction(); - DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); - Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType()); - - DemoteStackSlot = DAG.getFrameIndex(DemoteStackIdx, TLI->getPointerTy()); - Entry.Node = DemoteStackSlot; - Entry.Ty = StackSlotPtrType; - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isInReg = false; - Entry.isSRet = true; - Entry.isNest = false; - Entry.isByVal = false; - Entry.isReturned = false; - Entry.Alignment = Align; - Args.push_back(Entry); - RetTy = Type::getVoidTy(FTy->getContext()); - } - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { const Value *V = *i; @@ -5499,7 +5490,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // Check if target-independent constraints permit a tail call here. // Target-dependent constraints are checked within TLI->LowerCallTo. - if (isTailCall && !isInTailCallPosition(CS, *TLI)) + if (isTailCall && !isInTailCallPosition(CS, DAG)) isTailCall = false; TargetLowering::CallLoweringInfo CLI(DAG); @@ -5511,46 +5502,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, "Non-null chain expected with non-tail call!"); assert((Result.second.getNode() || !Result.first.getNode()) && "Null value expected with tail call!"); - if (Result.first.getNode()) { + if (Result.first.getNode()) setValue(CS.getInstruction(), Result.first); - } else if (!CanLowerReturn && Result.second.getNode()) { - // The instruction result is the result of loading from the - // hidden sret parameter. 
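At the C++ level, the sret demotion referred to here amounts to returning through a caller-allocated slot. The function names below are hypothetical, chosen only to show the shape of the transformation:

#include <cassert>

struct Big { long Vals[8]; };      // too large to come back in registers

// Hypothetical lowered form: the return value travels through a hidden
// pointer argument (the sret parameter) instead of registers.
static void bigProducerSret(Big *Result) {
  for (int I = 0; I < 8; ++I)
    Result->Vals[I] = I;
}

// What the caller-side lowering does: create a stack slot, pass its address
// as an extra leading argument, then load the value back out of the slot.
static Big callBigProducer() {
  Big Slot;                        // the DemoteStackIdx stack object
  bigProducerSret(&Slot);          // hidden sret argument
  return Slot;                     // loaded back from the slot after the call
}

int main() {
  assert(callBigProducer().Vals[7] == 7);
  return 0;
}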
- SmallVector PVTs; - Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType()); - - ComputeValueVTs(*TLI, PtrRetTy, PVTs); - assert(PVTs.size() == 1 && "Pointers should fit in one register"); - EVT PtrVT = PVTs[0]; - - SmallVector RetTys; - SmallVector Offsets; - RetTy = FTy->getReturnType(); - ComputeValueVTs(*TLI, RetTy, RetTys, &Offsets); - - unsigned NumValues = RetTys.size(); - SmallVector Values(NumValues); - SmallVector Chains(NumValues); - - for (unsigned i = 0; i < NumValues; ++i) { - SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), PtrVT, - DemoteStackSlot, - DAG.getConstant(Offsets[i], PtrVT)); - SDValue L = DAG.getLoad(RetTys[i], getCurSDLoc(), Result.second, Add, - MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), - false, false, false, 1); - Values[i] = L; - Chains[i] = L.getValue(1); - } - - SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, Chains); - PendingLoads.push_back(Chain); - - setValue(CS.getInstruction(), - DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(RetTys), Values)); - } if (!Result.second.getNode()) { // As a special case, a null chain means that a tail call has been emitted @@ -7092,6 +7045,21 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { FuncInfo.MF->getFrameInfo()->setHasPatchPoint(); } +/// Returns an AttributeSet representing the attributes applied to the return +/// value of the given call. +static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { + SmallVector Attrs; + if (CLI.RetSExt) + Attrs.push_back(Attribute::SExt); + if (CLI.RetZExt) + Attrs.push_back(Attribute::ZExt); + if (CLI.IsInReg) + Attrs.push_back(Attribute::InReg); + + return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex, + Attrs); +} + /// TargetLowering::LowerCallTo - This is the default LowerCallTo /// implementation, which just calls LowerCall. /// FIXME: When all targets are @@ -7100,24 +7068,62 @@ std::pair TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Handle the incoming return values from the call. CLI.Ins.clear(); + Type *OrigRetTy = CLI.RetTy; SmallVector RetTys; - ComputeValueVTs(*this, CLI.RetTy, RetTys); - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { - EVT VT = RetTys[I]; - MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); - for (unsigned i = 0; i != NumRegs; ++i) { - ISD::InputArg MyFlags; - MyFlags.VT = RegisterVT; - MyFlags.ArgVT = VT; - MyFlags.Used = CLI.IsReturnValueUsed; - if (CLI.RetSExt) - MyFlags.Flags.setSExt(); - if (CLI.RetZExt) - MyFlags.Flags.setZExt(); - if (CLI.IsInReg) - MyFlags.Flags.setInReg(); - CLI.Ins.push_back(MyFlags); + SmallVector Offsets; + ComputeValueVTs(*this, CLI.RetTy, RetTys, &Offsets); + + SmallVector Outs; + GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this); + + bool CanLowerReturn = + this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(), + CLI.IsVarArg, Outs, CLI.RetTy->getContext()); + + SDValue DemoteStackSlot; + int DemoteStackIdx = -100; + if (!CanLowerReturn) { + // FIXME: equivalent assert? 
+ // assert(!CS.hasInAllocaArgument() && + // "sret demotion is incompatible with inalloca"); + uint64_t TySize = getDataLayout()->getTypeAllocSize(CLI.RetTy); + unsigned Align = getDataLayout()->getPrefTypeAlignment(CLI.RetTy); + MachineFunction &MF = CLI.DAG.getMachineFunction(); + DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); + Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy); + + DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy()); + ArgListEntry Entry; + Entry.Node = DemoteStackSlot; + Entry.Ty = StackSlotPtrType; + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isInReg = false; + Entry.isSRet = true; + Entry.isNest = false; + Entry.isByVal = false; + Entry.isReturned = false; + Entry.Alignment = Align; + CLI.getArgs().insert(CLI.getArgs().begin(), Entry); + CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext()); + } else { + for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { + EVT VT = RetTys[I]; + MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); + for (unsigned i = 0; i != NumRegs; ++i) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT; + MyFlags.ArgVT = VT; + MyFlags.Used = CLI.IsReturnValueUsed; + if (CLI.RetSExt) + MyFlags.Flags.setSExt(); + if (CLI.RetZExt) + MyFlags.Flags.setZExt(); + if (CLI.IsInReg) + MyFlags.Flags.setInReg(); + CLI.Ins.push_back(MyFlags); + } } } @@ -7176,11 +7182,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } if (Args[i].isNest) Flags.setNest(); - if (NeedsRegBlock) { + if (NeedsRegBlock) Flags.setInConsecutiveRegs(); - if (Value == NumValues - 1) - Flags.setInConsecutiveRegsLast(); - } Flags.setOrigAlign(OriginalAlignment); MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); @@ -7226,6 +7229,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { else if (j != 0) MyFlags.Flags.setOrigAlign(1); + // Only mark the end at the last register of the last value. + if (NeedsRegBlock && Value == NumValues - 1 && j == NumParts - 1) + MyFlags.Flags.setInConsecutiveRegsLast(); + CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); } @@ -7259,31 +7266,59 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { "LowerCall emitted a value with the wrong type!"); }); - // Collect the legal value parts into potentially illegal values - // that correspond to the original function's return values. - ISD::NodeType AssertOp = ISD::DELETED_NODE; - if (CLI.RetSExt) - AssertOp = ISD::AssertSext; - else if (CLI.RetZExt) - AssertOp = ISD::AssertZext; SmallVector ReturnValues; - unsigned CurReg = 0; - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { - EVT VT = RetTys[I]; - MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); - unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); + if (!CanLowerReturn) { + // The instruction result is the result of loading from the + // hidden sret parameter. + SmallVector PVTs; + Type *PtrRetTy = PointerType::getUnqual(OrigRetTy); - ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], - NumRegs, RegisterVT, VT, nullptr, - AssertOp)); - CurReg += NumRegs; - } + ComputeValueVTs(*this, PtrRetTy, PVTs); + assert(PVTs.size() == 1 && "Pointers should fit in one register"); + EVT PtrVT = PVTs[0]; - // For a function returning void, there is no return value. We can't create - // such a node, so we just return a null return value in that case. 
In - // that case, nothing will actually look at the value. - if (ReturnValues.empty()) - return std::make_pair(SDValue(), CLI.Chain); + unsigned NumValues = RetTys.size(); + ReturnValues.resize(NumValues); + SmallVector Chains(NumValues); + + for (unsigned i = 0; i < NumValues; ++i) { + SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, + CLI.DAG.getConstant(Offsets[i], PtrVT)); + SDValue L = CLI.DAG.getLoad( + RetTys[i], CLI.DL, CLI.Chain, Add, + MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false, + false, false, 1); + ReturnValues[i] = L; + Chains[i] = L.getValue(1); + } + + CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains); + } else { + // Collect the legal value parts into potentially illegal values + // that correspond to the original function's return values. + ISD::NodeType AssertOp = ISD::DELETED_NODE; + if (CLI.RetSExt) + AssertOp = ISD::AssertSext; + else if (CLI.RetZExt) + AssertOp = ISD::AssertZext; + unsigned CurReg = 0; + for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { + EVT VT = RetTys[I]; + MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT); + unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); + + ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], + NumRegs, RegisterVT, VT, nullptr, + AssertOp)); + CurReg += NumRegs; + } + + // For a function returning void, there is no return value. We can't create + // such a node, so we just return a null return value in that case. In + // that case, nothing will actually look at the value. + if (ReturnValues.empty()) + return std::make_pair(SDValue(), CLI.Chain); + } SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, CLI.DAG.getVTList(RetTys), ReturnValues); @@ -7412,11 +7447,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { } if (F.getAttributes().hasAttribute(Idx, Attribute::Nest)) Flags.setNest(); - if (NeedsRegBlock) { + if (NeedsRegBlock) Flags.setInConsecutiveRegs(); - if (Value == NumValues - 1) - Flags.setInConsecutiveRegsLast(); - } Flags.setOrigAlign(OriginalAlignment); MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT); @@ -7429,6 +7461,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // if it isn't first piece, alignment must be 1 else if (i > 0) MyFlags.Flags.setOrigAlign(1); + + // Only mark the end at the last register of the last value. 
+ if (NeedsRegBlock && Value == NumValues - 1 && i == NumRegs - 1) + MyFlags.Flags.setInConsecutiveRegsLast(); + Ins.push_back(MyFlags); } PartBase += VT.getStoreSize(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index d6b525500a8a..c92fb2453c2a 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -55,6 +55,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::PREFETCH: return "Prefetch"; case ISD::ATOMIC_FENCE: return "AtomicFence"; case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap"; + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return "AtomicCmpSwapWithSuccess"; case ISD::ATOMIC_SWAP: return "AtomicSwap"; case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd"; case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 472fc9c808f1..57e22e21c371 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -141,6 +141,25 @@ STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector"); STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue"); STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue"); STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad"); + +// Intrinsic instructions... +STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call"); +STATISTIC(NumFastIselFailSAddWithOverflow, + "Fast isel fails on sadd.with.overflow"); +STATISTIC(NumFastIselFailUAddWithOverflow, + "Fast isel fails on uadd.with.overflow"); +STATISTIC(NumFastIselFailSSubWithOverflow, + "Fast isel fails on ssub.with.overflow"); +STATISTIC(NumFastIselFailUSubWithOverflow, + "Fast isel fails on usub.with.overflow"); +STATISTIC(NumFastIselFailSMulWithOverflow, + "Fast isel fails on smul.with.overflow"); +STATISTIC(NumFastIselFailUMulWithOverflow, + "Fast isel fails on umul.with.overflow"); +STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress"); +STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call"); +STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call"); +STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call"); #endif static cl::opt @@ -974,7 +993,37 @@ static void collectFailStats(const Instruction *I) { case Instruction::FCmp: NumFastIselFailFCmp++; return; case Instruction::PHI: NumFastIselFailPHI++; return; case Instruction::Select: NumFastIselFailSelect++; return; - case Instruction::Call: NumFastIselFailCall++; return; + case Instruction::Call: { + if (auto const *Intrinsic = dyn_cast(I)) { + switch (Intrinsic->getIntrinsicID()) { + default: + NumFastIselFailIntrinsicCall++; return; + case Intrinsic::sadd_with_overflow: + NumFastIselFailSAddWithOverflow++; return; + case Intrinsic::uadd_with_overflow: + NumFastIselFailUAddWithOverflow++; return; + case Intrinsic::ssub_with_overflow: + NumFastIselFailSSubWithOverflow++; return; + case Intrinsic::usub_with_overflow: + NumFastIselFailUSubWithOverflow++; return; + case Intrinsic::smul_with_overflow: + NumFastIselFailSMulWithOverflow++; return; + case Intrinsic::umul_with_overflow: + NumFastIselFailUMulWithOverflow++; return; + case Intrinsic::frameaddress: + NumFastIselFailFrameaddress++; return; + case Intrinsic::sqrt: + NumFastIselFailSqrt++; return; + case Intrinsic::experimental_stackmap: + NumFastIselFailStackMap++; 
return; + case Intrinsic::experimental_patchpoint_void: // fall-through + case Intrinsic::experimental_patchpoint_i64: + NumFastIselFailPatchPoint++; return; + } + } + NumFastIselFailCall++; + return; + } case Instruction::Shl: NumFastIselFailShl++; return; case Instruction::LShr: NumFastIselFailLShr++; return; case Instruction::AShr: NumFastIselFailAShr++; return; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b75d80541ea2..75bbbe749e58 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -327,6 +327,10 @@ TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, assert(Op.getNode()->getNumValues() == 1 && "ShrinkDemandedOp only supports nodes with one result!"); + // Early return, as this function cannot handle vector types. + if (Op.getValueType().isVector()) + return false; + // Don't do this if the node has another user, which may require the // full value. if (!Op.getNode()->hasOneUse()) @@ -2613,7 +2617,8 @@ SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, SDLoc dl, if (ShAmt) { // TODO: For UDIV use SRL instead of SRA. SDValue Amt = DAG.getConstant(ShAmt, getShiftAmountTy(Op1.getValueType())); - Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt); + Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, false, false, + true); d = d.ashr(ShAmt); } diff --git a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp index 1120be8ed2ab..0e89bad5f26f 100644 --- a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp +++ b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp @@ -15,8 +15,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; -TargetSelectionDAGInfo::TargetSelectionDAGInfo(const TargetMachine &TM) - : DL(TM.getDataLayout()) { +TargetSelectionDAGInfo::TargetSelectionDAGInfo(const DataLayout *DL) + : DL(DL) { } TargetSelectionDAGInfo::~TargetSelectionDAGInfo() { diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index d2f395594860..10e67b38c0c2 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -248,19 +248,27 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) { for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) { Type *Ty = AI->getType(); + StructType *ST = dyn_cast(Ty); + ArrayType *AT = dyn_cast(Ty); // Aggregate types can't be cast, but are legal argument types, so we have // to handle them differently. We use an extract/insert pair as a // lightweight method to achieve the same goal. - if (isa(Ty) || isa(Ty)) { - Instruction *EI = ExtractValueInst::Create(AI, 0, "", AfterAllocaInsPt); - Instruction *NI = InsertValueInst::Create(AI, EI, 0); - NI->insertAfter(EI); - AI->replaceAllUsesWith(NI); - - // Set the operand of the instructions back to the AllocaInst. - EI->setOperand(0, AI); - NI->setOperand(0, AI); + // There is one more special case though: aggregate type with no data + // for example, a struct with no fields which is used extensively in Rust + // In this case there is no need to transfer anything + if (ST || AT) { + if ((ST && ST->getNumElements() != 0) + || (AT && AT->getNumElements() != 0)) { + Instruction *EI = ExtractValueInst::Create(AI, 0, "", AfterAllocaInsPt); + Instruction *NI = InsertValueInst::Create(AI, EI, 0); + NI->insertAfter(EI); + AI->replaceAllUsesWith(NI); + + // Set the operand of the instructions back to the AllocaInst. 
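Returning to the BuildExactSDIV change above: the hunk only shows the arithmetic shift that removes the power-of-two factor; the surrounding, unchanged code then multiplies by the multiplicative inverse of the remaining odd divisor. A scalar sketch of the whole scheme, valid only under the stated exactness precondition and not taken from the LLVM sources:

#include <cassert>
#include <cstdint>

// Inverse of an odd D modulo 2^32 by Newton iteration: D itself is correct
// to 3 bits (D*D == 1 mod 8) and each step doubles the correct bit count.
static uint32_t inverseMod32(uint32_t D) {
  uint32_t X = D;
  for (int I = 0; I < 4; ++I)      // 3 -> 6 -> 12 -> 24 -> 48 bits
    X *= 2 - D * X;
  return X;
}

// Exact signed division by a constant: shift out the power-of-two factor
// (SRA is safe because the division has no remainder), then multiply by the
// inverse of the remaining odd factor. Precondition: N % D == 0 and D != 0.
int32_t exactSDiv(int32_t N, int32_t D) {
  int Shift = 0;
  while (((uint32_t)D & (1u << Shift)) == 0)
    ++Shift;
  int32_t ShiftedN = N >> Shift;   // arithmetic shift assumed, as in the hunk
  int32_t OddD = D >> Shift;       // d.ashr(ShAmt)
  return (int32_t)((uint32_t)ShiftedN * inverseMod32((uint32_t)OddD));
}

int main() {
  assert(exactSDiv(-84, -6) == 14);
  assert(exactSDiv(96, 12) == 8);
  return 0;
}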
+ EI->setOperand(0, AI); + NI->setOperand(0, AI); + } } else { // This is always a no-op cast because we're casting AI to AI->getType() // so src and destination types are identical. BitCast is the only diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index c3f84c64d7fb..83966bd0c208 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -671,7 +671,7 @@ bool TargetInstrInfo::usePreRAHazardRecognizer() const { // Default implementation of CreateTargetRAHazardRecognizer. ScheduleHazardRecognizer *TargetInstrInfo:: -CreateTargetHazardRecognizer(const TargetMachine *TM, +CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, const ScheduleDAG *DAG) const { // Dummy hazard recognizer allows all instructions to issue. return new ScheduleHazardRecognizer(); diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 978432411ef8..f53048554322 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -39,7 +39,7 @@ using namespace llvm; /// InitLibcallNames - Set default libcall names. /// -static void InitLibcallNames(const char **Names, const TargetMachine &TM) { +static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::SHL_I16] = "__ashlhi3"; Names[RTLIB::SHL_I32] = "__ashlsi3"; Names[RTLIB::SHL_I64] = "__ashldi3"; @@ -384,7 +384,7 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) { Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8"; Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16"; - if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) { + if (TT.getEnvironment() == Triple::GNU) { Names[RTLIB::SINCOS_F32] = "sincosf"; Names[RTLIB::SINCOS_F64] = "sincos"; Names[RTLIB::SINCOS_F80] = "sincosl"; @@ -399,7 +399,7 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) { Names[RTLIB::SINCOS_PPCF128] = nullptr; } - if (Triple(TM.getTargetTriple()).getOS() != Triple::OpenBSD) { + if (TT.getOS() != Triple::OpenBSD) { Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail"; } else { // These are generally not available. @@ -702,7 +702,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, SupportJumpTables = true; MinimumJumpTableEntries = 4; - InitLibcallNames(LibcallRoutineNames, TM); + InitLibcallNames(LibcallRoutineNames, Triple(TM.getTargetTriple())); InitCmpLibcallCCs(CmpLibcallCCs); InitLibcallCallingConvs(LibcallCallingConvs); } @@ -730,6 +730,10 @@ void TargetLoweringBase::initActions() { setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand); } + // Most backends expect to see the node which just returns the value loaded. + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, + (MVT::SimpleValueType)VT, Expand); + // These operations default to expand. setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand); @@ -915,7 +919,6 @@ bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const { MachineBasicBlock* TargetLoweringBase::emitPatchPoint(MachineInstr *MI, MachineBasicBlock *MBB) const { - const TargetMachine &TM = getTargetMachine(); MachineFunction &MF = *MI->getParent()->getParent(); // MI changes inside this loop as we grow operands. 
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index dda22599d252..02abc282e6d6 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -48,16 +48,12 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol( const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI) const { unsigned Encoding = getPersonalityEncoding(); - switch (Encoding & 0x70) { - default: - report_fatal_error("We do not support this DWARF encoding yet!"); - case dwarf::DW_EH_PE_absptr: - return TM.getSymbol(GV, Mang); - case dwarf::DW_EH_PE_pcrel: { + if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect) return getContext().GetOrCreateSymbol(StringRef("DW.ref.") + TM.getSymbol(GV, Mang)->getName()); - } - } + if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr) + return TM.getSymbol(GV, Mang); + report_fatal_error("We do not support this DWARF encoding yet!"); } void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, @@ -340,7 +336,7 @@ getSectionForConstant(SectionKind Kind) const { } const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection( - unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const { + unsigned Priority, const MCSymbol *KeySym) const { // The default scheme is .ctor / .dtor, so we have to invert the priority // numbering. if (Priority == 65535) @@ -360,7 +356,7 @@ const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection( } const MCSection *TargetLoweringObjectFileELF::getStaticDtorSection( - unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const { + unsigned Priority, const MCSymbol *KeySym) const { // The default scheme is .ctor / .dtor, so we have to invert the priority // numbering. if (Priority == 65535) @@ -868,8 +864,7 @@ emitModuleFlags(MCStreamer &Streamer, static const MCSection *getAssociativeCOFFSection(MCContext &Ctx, const MCSection *Sec, - const MCSymbol *KeySym, - const MCSection *KeySec) { + const MCSymbol *KeySym) { // Return the normal section if we don't have to be associative. if (!KeySym) return Sec; @@ -877,20 +872,19 @@ static const MCSection *getAssociativeCOFFSection(MCContext &Ctx, // Make an associative section with the same name and kind as the normal // section. 
const MCSectionCOFF *SecCOFF = cast(Sec); - const MCSectionCOFF *KeySecCOFF = cast(KeySec); unsigned Characteristics = SecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT; return Ctx.getCOFFSection(SecCOFF->getSectionName(), Characteristics, SecCOFF->getKind(), KeySym->getName(), - COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, KeySecCOFF); + COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE); } const MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection( - unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const { - return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym, KeySec); + unsigned Priority, const MCSymbol *KeySym) const { + return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym); } const MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( - unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const { - return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym, KeySec); + unsigned Priority, const MCSymbol *KeySym) const { + return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym); } diff --git a/lib/CodeGen/module.modulemap b/lib/CodeGen/module.modulemap new file mode 100644 index 000000000000..d4f68bcc6eed --- /dev/null +++ b/lib/CodeGen/module.modulemap @@ -0,0 +1 @@ +module CodeGen { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp index e52e8afab84d..3961905f6582 100644 --- a/lib/DebugInfo/DWARFContext.cpp +++ b/lib/DebugInfo/DWARFContext.cpp @@ -734,7 +734,7 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) object::RelocToApply R(V.visit(Type, Reloc, 0, SymAddr)); if (V.error()) { SmallString<32> Name; - error_code ec(Reloc.getTypeName(Name)); + std::error_code ec(Reloc.getTypeName(Name)); if (ec) { errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n"; } diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp index 2524adc25c15..fe7e46d63ba1 100644 --- a/lib/DebugInfo/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARFDebugAranges.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include using namespace llvm; void DWARFDebugAranges::extract(DataExtractor DebugArangesData) { @@ -30,6 +31,7 @@ void DWARFDebugAranges::extract(DataExtractor DebugArangesData) { uint64_t HighPC = Desc.getEndAddress(); appendRange(CUOffset, LowPC, HighPC); } + ParsedCUOffsets.insert(CUOffset); } } @@ -56,69 +58,55 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { } } - sortAndMinimize(); + construct(); } void DWARFDebugAranges::clear() { + Endpoints.clear(); Aranges.clear(); ParsedCUOffsets.clear(); } void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC) { - if (!Aranges.empty()) { - if (Aranges.back().CUOffset == CUOffset && - Aranges.back().HighPC() == LowPC) { - Aranges.back().setHighPC(HighPC); - return; - } - } - Aranges.push_back(Range(LowPC, HighPC, CUOffset)); -} - -void DWARFDebugAranges::sortAndMinimize() { - const size_t orig_arange_size = Aranges.size(); - // Size of one? If so, no sorting is needed - if (orig_arange_size <= 1) + if (LowPC >= HighPC) return; - // Sort our address range entries - std::stable_sort(Aranges.begin(), Aranges.end()); - - // Most address ranges are contiguous from function to function - // so our new ranges will likely be smaller. 
We calculate the size - // of the new ranges since although std::vector objects can be resized, - // the will never reduce their allocated block size and free any excesss - // memory, so we might as well start a brand new collection so it is as - // small as possible. - - // First calculate the size of the new minimal arange vector - // so we don't have to do a bunch of re-allocations as we - // copy the new minimal stuff over to the new collection. - size_t minimal_size = 1; - for (size_t i = 1; i < orig_arange_size; ++i) { - if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i])) - ++minimal_size; - } + Endpoints.emplace_back(LowPC, CUOffset, true); + Endpoints.emplace_back(HighPC, CUOffset, false); +} - // Else, make a new RangeColl that _only_ contains what we need. - RangeColl minimal_aranges; - minimal_aranges.resize(minimal_size); - uint32_t j = 0; - minimal_aranges[j] = Aranges[0]; - for (size_t i = 1; i < orig_arange_size; ++i) { - if (Range::SortedOverlapCheck(minimal_aranges[j], Aranges[i])) { - minimal_aranges[j].setHighPC(Aranges[i].HighPC()); +void DWARFDebugAranges::construct() { + std::multiset ValidCUs; // Maintain the set of CUs describing + // a current address range. + std::sort(Endpoints.begin(), Endpoints.end()); + uint64_t PrevAddress = -1ULL; + for (const auto &E : Endpoints) { + if (PrevAddress < E.Address && ValidCUs.size() > 0) { + // If the address range between two endpoints is described by some + // CU, first try to extend the last range in Aranges. If we can't + // do it, start a new range. + if (!Aranges.empty() && Aranges.back().HighPC() == PrevAddress && + ValidCUs.find(Aranges.back().CUOffset) != ValidCUs.end()) { + Aranges.back().setHighPC(E.Address); + } else { + Aranges.emplace_back(PrevAddress, E.Address, *ValidCUs.begin()); + } + } + // Update the set of valid CUs. + if (E.IsRangeStart) { + ValidCUs.insert(E.CUOffset); } else { - // Only increment j if we aren't merging - minimal_aranges[++j] = Aranges[i]; + auto CUPos = ValidCUs.find(E.CUOffset); + assert(CUPos != ValidCUs.end()); + ValidCUs.erase(CUPos); } + PrevAddress = E.Address; } - assert(j+1 == minimal_size); + assert(ValidCUs.empty()); - // Now swap our new minimal aranges into place. The local - // minimal_aranges will then contian the old big collection - // which will get freed. - minimal_aranges.swap(Aranges); + // Endpoints are not needed now. + std::vector EmptyEndpoints; + EmptyEndpoints.swap(Endpoints); } uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const { diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h index de96d7fb8198..a9f37fe772c7 100644 --- a/lib/DebugInfo/DWARFDebugAranges.h +++ b/lib/DebugInfo/DWARFDebugAranges.h @@ -27,9 +27,9 @@ class DWARFDebugAranges { void clear(); void extract(DataExtractor DebugArangesData); - // Use appendRange multiple times and then call sortAndMinimize. + // Call appendRange multiple times and then call construct. 
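The construct() routine added above replaces sortAndMinimize() with an endpoint sweep: every [LowPC, HighPC) range contributes a start and an end event, events are sorted by address, and a multiset of "live" CU offsets decides whether to extend the previous output range or start a new one. A standalone sketch of the same technique with hypothetical plain-STL types:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Hypothetical stand-ins for the RangeEndpoint/Range types above.
struct Endpoint {
  uint64_t Address;
  uint32_t CUOffset;
  bool IsStart;
  bool operator<(const Endpoint &O) const { return Address < O.Address; }
};

struct Range {
  uint64_t LowPC, HighPC;
  uint32_t CUOffset;
};

// Sweep the sorted endpoints, tracking which CUs currently cover the address;
// emit or extend an output range whenever at least one CU is live.
static std::vector<Range> coalesce(std::vector<Endpoint> Endpoints) {
  std::vector<Range> Out;
  std::multiset<uint32_t> Active;
  std::sort(Endpoints.begin(), Endpoints.end());
  uint64_t Prev = ~0ULL;
  for (const Endpoint &E : Endpoints) {
    if (Prev < E.Address && !Active.empty()) {
      if (!Out.empty() && Out.back().HighPC == Prev &&
          Active.count(Out.back().CUOffset))
        Out.back().HighPC = E.Address;      // contiguous: extend last range
      else
        Out.push_back({Prev, E.Address, *Active.begin()});
    }
    if (E.IsStart)
      Active.insert(E.CUOffset);
    else
      Active.erase(Active.find(E.CUOffset));
    Prev = E.Address;
  }
  assert(Active.empty() && "unbalanced endpoints");
  return Out;
}
```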
void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC); - void sortAndMinimize(); + void construct(); struct Range { explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL, @@ -47,31 +47,39 @@ class DWARFDebugAranges { return LowPC + Length; return -1ULL; } + bool containsAddress(uint64_t Address) const { return LowPC <= Address && Address < HighPC(); } - - bool operator <(const Range &other) const { + bool operator<(const Range &other) const { return LowPC < other.LowPC; } - static bool SortedOverlapCheck(const Range &Left, const Range &Right) { - if (Left.CUOffset != Right.CUOffset) - return false; - return Left.HighPC() >= Right.LowPC; - } - uint64_t LowPC; // Start of address range. uint32_t Length; // End of address range (not including this address). uint32_t CUOffset; // Offset of the compile unit or die. }; + struct RangeEndpoint { + uint64_t Address; + uint32_t CUOffset; + bool IsRangeStart; + + RangeEndpoint(uint64_t Address, uint32_t CUOffset, bool IsRangeStart) + : Address(Address), CUOffset(CUOffset), IsRangeStart(IsRangeStart) {} + + bool operator<(const RangeEndpoint &Other) const { + return Address < Other.Address; + } + }; + + typedef std::vector RangeColl; typedef RangeColl::const_iterator RangeCollIterator; - typedef DenseSet ParsedCUOffsetColl; + std::vector Endpoints; RangeColl Aranges; - ParsedCUOffsetColl ParsedCUOffsets; + DenseSet ParsedCUOffsets; }; } diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp index b811ed70644f..2e7a54aeb858 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp @@ -210,6 +210,16 @@ uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset( return Result.hasValue() ? Result.getValue() : FailValue; } +uint64_t +DWARFDebugInfoEntryMinimal::getRangesBaseAttribute(const DWARFUnit *U, + uint64_t FailValue) const { + uint64_t Result = + getAttributeValueAsSectionOffset(U, DW_AT_ranges_base, -1ULL); + if (Result != -1ULL) + return Result; + return getAttributeValueAsSectionOffset(U, DW_AT_GNU_ranges_base, FailValue); +} + bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC, uint64_t &HighPC) const { diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h index 916e1ed340eb..cc58eb652adc 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.h +++ b/lib/DebugInfo/DWARFDebugInfoEntry.h @@ -106,6 +106,8 @@ class DWARFDebugInfoEntryMinimal { const uint16_t Attr, uint64_t FailValue) const; + uint64_t getRangesBaseAttribute(const DWARFUnit *U, uint64_t FailValue) const; + /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU. /// Returns true if both attributes are present. bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC, diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp index f5f5072b9d3d..39d0a0ff5a4c 100644 --- a/lib/DebugInfo/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARFUnit.cpp @@ -226,7 +226,9 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset( this, DW_AT_GNU_addr_base, 0); RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset( - this, DW_AT_GNU_ranges_base, 0); + this, DW_AT_ranges_base, 0); + // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for + // skeleton CU DIE, so that DWARF users not aware of it are not broken. 
} setDIERelations(); @@ -272,7 +274,8 @@ bool DWARFUnit::parseDWO() { } // Share .debug_addr and .debug_ranges section with compile unit in .dwo DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase); - DWOCU->setRangesSection(RangeSection, RangeSectionBase); + uint32_t DWORangesBase = DieArray[0].getRangesBaseAttribute(this, 0); + DWOCU->setRangesSection(RangeSection, DWORangesBase); return true; } diff --git a/lib/DebugInfo/module.modulemap b/lib/DebugInfo/module.modulemap new file mode 100644 index 000000000000..1fe5ab130fd7 --- /dev/null +++ b/lib/DebugInfo/module.modulemap @@ -0,0 +1 @@ +module DebugInfo { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 6766ef1512a0..9154fe2f5ff4 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -148,8 +148,7 @@ Function *ExecutionEngine::FindFunctionNamed(const char *FnName) { } -void *ExecutionEngineState::RemoveMapping(const MutexGuard &, - const GlobalValue *ToUnmap) { +void *ExecutionEngineState::RemoveMapping(const GlobalValue *ToUnmap) { GlobalAddressMapTy::iterator I = GlobalAddressMap.find(ToUnmap); void *OldVal; @@ -171,14 +170,14 @@ void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) { DEBUG(dbgs() << "JIT: Map \'" << GV->getName() << "\' to [" << Addr << "]\n";); - void *&CurVal = EEState.getGlobalAddressMap(locked)[GV]; + void *&CurVal = EEState.getGlobalAddressMap()[GV]; assert((!CurVal || !Addr) && "GlobalMapping already established!"); CurVal = Addr; // If we are using the reverse mapping, add it too. - if (!EEState.getGlobalAddressReverseMap(locked).empty()) { + if (!EEState.getGlobalAddressReverseMap().empty()) { AssertingVH &V = - EEState.getGlobalAddressReverseMap(locked)[Addr]; + EEState.getGlobalAddressReverseMap()[Addr]; assert((!V || !GV) && "GlobalMapping already established!"); V = GV; } @@ -187,41 +186,41 @@ void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) { void ExecutionEngine::clearAllGlobalMappings() { MutexGuard locked(lock); - EEState.getGlobalAddressMap(locked).clear(); - EEState.getGlobalAddressReverseMap(locked).clear(); + EEState.getGlobalAddressMap().clear(); + EEState.getGlobalAddressReverseMap().clear(); } void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) { MutexGuard locked(lock); for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) - EEState.RemoveMapping(locked, FI); + EEState.RemoveMapping(FI); for (Module::global_iterator GI = M->global_begin(), GE = M->global_end(); GI != GE; ++GI) - EEState.RemoveMapping(locked, GI); + EEState.RemoveMapping(GI); } void *ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) { MutexGuard locked(lock); ExecutionEngineState::GlobalAddressMapTy &Map = - EEState.getGlobalAddressMap(locked); + EEState.getGlobalAddressMap(); // Deleting from the mapping? if (!Addr) - return EEState.RemoveMapping(locked, GV); + return EEState.RemoveMapping(GV); void *&CurVal = Map[GV]; void *OldVal = CurVal; - if (CurVal && !EEState.getGlobalAddressReverseMap(locked).empty()) - EEState.getGlobalAddressReverseMap(locked).erase(CurVal); + if (CurVal && !EEState.getGlobalAddressReverseMap().empty()) + EEState.getGlobalAddressReverseMap().erase(CurVal); CurVal = Addr; // If we are using the reverse mapping, add it too. 
- if (!EEState.getGlobalAddressReverseMap(locked).empty()) { + if (!EEState.getGlobalAddressReverseMap().empty()) { AssertingVH &V = - EEState.getGlobalAddressReverseMap(locked)[Addr]; + EEState.getGlobalAddressReverseMap()[Addr]; assert((!V || !GV) && "GlobalMapping already established!"); V = GV; } @@ -232,25 +231,25 @@ void *ExecutionEngine::getPointerToGlobalIfAvailable(const GlobalValue *GV) { MutexGuard locked(lock); ExecutionEngineState::GlobalAddressMapTy::iterator I = - EEState.getGlobalAddressMap(locked).find(GV); - return I != EEState.getGlobalAddressMap(locked).end() ? I->second : nullptr; + EEState.getGlobalAddressMap().find(GV); + return I != EEState.getGlobalAddressMap().end() ? I->second : nullptr; } const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) { MutexGuard locked(lock); // If we haven't computed the reverse mapping yet, do so first. - if (EEState.getGlobalAddressReverseMap(locked).empty()) { + if (EEState.getGlobalAddressReverseMap().empty()) { for (ExecutionEngineState::GlobalAddressMapTy::iterator - I = EEState.getGlobalAddressMap(locked).begin(), - E = EEState.getGlobalAddressMap(locked).end(); I != E; ++I) - EEState.getGlobalAddressReverseMap(locked).insert(std::make_pair( + I = EEState.getGlobalAddressMap().begin(), + E = EEState.getGlobalAddressMap().end(); I != E; ++I) + EEState.getGlobalAddressReverseMap().insert(std::make_pair( I->second, I->first)); } std::map >::iterator I = - EEState.getGlobalAddressReverseMap(locked).find(Addr); - return I != EEState.getGlobalAddressReverseMap(locked).end() ? I->second : nullptr; + EEState.getGlobalAddressReverseMap().find(Addr); + return I != EEState.getGlobalAddressReverseMap().end() ? I->second : nullptr; } namespace { @@ -457,6 +456,27 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M, return ExecutionEngine::JITCtor(M, ErrorStr, JMM, GVsWithCode, TM); } +void EngineBuilder::InitEngine() { + WhichEngine = EngineKind::Either; + ErrorStr = nullptr; + OptLevel = CodeGenOpt::Default; + MCJMM = nullptr; + JMM = nullptr; + Options = TargetOptions(); + AllocateGVsWithCode = false; + RelocModel = Reloc::Default; + CMModel = CodeModel::JITDefault; + UseMCJIT = false; + +// IR module verification is enabled by default in debug builds, and disabled +// by default in release builds. +#ifndef NDEBUG + VerifyModules = true; +#else + VerifyModules = false; +#endif +} + ExecutionEngine *EngineBuilder::create(TargetMachine *TM) { std::unique_ptr TheTM(TM); // Take ownership. @@ -536,7 +556,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) { return getPointerToFunction(F); MutexGuard locked(lock); - if (void *P = EEState.getGlobalAddressMap(locked)[GV]) + if (void *P = EEState.getGlobalAddressMap()[GV]) return P; // Global variable might have been added since interpreter started. 
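The ExecutionEngine mapping code above (addGlobalMapping, updateGlobalMapping, getGlobalValueAtAddress) treats the forward GlobalValue-to-address map as authoritative and only materializes the reverse map on the first reverse lookup, keeping it in sync from then on. A toy illustration of that scheme with hypothetical names:

```cpp
#include <map>
#include <string>

// Hypothetical container mirroring ExecutionEngineState's two maps.
class AddressBook {
  std::map<std::string, void *> Forward;  // authoritative: name -> address
  std::map<void *, std::string> Reverse;  // built on first reverse lookup
public:
  void add(const std::string &Name, void *Addr) {
    Forward[Name] = Addr;
    if (!Reverse.empty())                 // keep it in sync once it exists
      Reverse[Addr] = Name;
  }
  const std::string *lookup(void *Addr) {
    if (Reverse.empty())                  // lazily invert the forward map
      for (const auto &KV : Forward)
        Reverse[KV.second] = KV.first;
    auto It = Reverse.find(Addr);
    return It == Reverse.end() ? nullptr : &It->second;
  }
};
```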
@@ -546,7 +566,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) { else llvm_unreachable("Global hasn't had an address allocated yet!"); - return EEState.getGlobalAddressMap(locked)[GV]; + return EEState.getGlobalAddressMap()[GV]; } /// \brief Converts a Constant* into a GenericValue, including handling of diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp index 9a65fa09b231..4e22a8b3ea0d 100644 --- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -86,7 +86,7 @@ static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress, LineNumberInfo Result; Result.Offset = Address - StartAddress; - Result.LineNumber = Line.getLine(); + Result.LineNumber = Line.Line; return Result; } @@ -233,7 +233,7 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { FunctionMessage.line_number_size = 0; FunctionMessage.line_number_table = 0; } else { - SourceFileName = Lines.front().second.getFileName(); + SourceFileName = Lines.front().second.FileName; FunctionMessage.source_file_name = const_cast(SourceFileName.c_str()); FunctionMessage.line_number_size = LineInfo.size(); FunctionMessage.line_number_table = &*LineInfo.begin(); diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/lib/ExecutionEngine/Interpreter/Interpreter.cpp index c589457b670d..814efcc27fcb 100644 --- a/lib/ExecutionEngine/Interpreter/Interpreter.cpp +++ b/lib/ExecutionEngine/Interpreter/Interpreter.cpp @@ -34,7 +34,7 @@ extern "C" void LLVMLinkInInterpreter() { } /// ExecutionEngine *Interpreter::create(Module *M, std::string* ErrStr) { // Tell this Module to materialize everything and release the GVMaterializer. 
- if (error_code EC = M->materializeAllPermanently()) { + if (std::error_code EC = M->materializeAllPermanently()) { if (ErrStr) *ErrStr = EC.message(); // We got an error, just return 0 diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index f8b28279456a..83ec9784b98e 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -151,7 +151,7 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji, // Add target data MutexGuard locked(lock); - FunctionPassManager &PM = jitstate->getPM(locked); + FunctionPassManager &PM = jitstate->getPM(); M->setDataLayout(TM.getDataLayout()); PM.add(new DataLayoutPass(M)); @@ -184,7 +184,7 @@ void JIT::addModule(Module *M) { jitstate = new JITState(M); - FunctionPassManager &PM = jitstate->getPM(locked); + FunctionPassManager &PM = jitstate->getPM(); M->setDataLayout(TM.getDataLayout()); PM.add(new DataLayoutPass(M)); @@ -216,7 +216,7 @@ bool JIT::removeModule(Module *M) { if (!jitstate && !Modules.empty()) { jitstate = new JITState(Modules[0]); - FunctionPassManager &PM = jitstate->getPM(locked); + FunctionPassManager &PM = jitstate->getPM(); M->setDataLayout(TM.getDataLayout()); PM.add(new DataLayoutPass(M)); @@ -460,41 +460,41 @@ void JIT::runJITOnFunction(Function *F, MachineCodeInfo *MCI) { if (MCI) RegisterJITEventListener(&MCIL); - runJITOnFunctionUnlocked(F, locked); + runJITOnFunctionUnlocked(F); if (MCI) UnregisterJITEventListener(&MCIL); } -void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) { +void JIT::runJITOnFunctionUnlocked(Function *F) { assert(!isAlreadyCodeGenerating && "Error: Recursive compilation detected!"); - jitTheFunction(F, locked); + jitTheFunctionUnlocked(F); // If the function referred to another function that had not yet been // read from bitcode, and we are jitting non-lazily, emit it now. - while (!jitstate->getPendingFunctions(locked).empty()) { - Function *PF = jitstate->getPendingFunctions(locked).back(); - jitstate->getPendingFunctions(locked).pop_back(); + while (!jitstate->getPendingFunctions().empty()) { + Function *PF = jitstate->getPendingFunctions().back(); + jitstate->getPendingFunctions().pop_back(); assert(!PF->hasAvailableExternallyLinkage() && "Externally-defined function should not be in pending list."); - jitTheFunction(PF, locked); + jitTheFunctionUnlocked(PF); // Now that the function has been jitted, ask the JITEmitter to rewrite // the stub with real address of the function. 
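runJITOnFunctionUnlocked above compiles the requested function and then drains the pending-function list, patching each newly emitted function's stub. The control flow, reduced to a hypothetical worklist sketch:

```cpp
#include <string>
#include <vector>

// Hypothetical stand-ins for getPendingFunctions() and the jit/update calls.
struct MiniJIT {
  std::vector<std::string> PendingFunctions;

  void jitTheFunction(const std::string &F) {
    // Codegen for F goes here; it may append referenced-but-unemitted
    // functions to PendingFunctions.
  }
  void updateFunctionStub(const std::string &F) {
    // Rewrite F's lazy stub to jump to the freshly emitted code.
  }

  void runJITOnFunction(const std::string &F) {
    jitTheFunction(F);
    while (!PendingFunctions.empty()) {
      std::string P = PendingFunctions.back();
      PendingFunctions.pop_back();
      jitTheFunction(P);
      updateFunctionStub(P);
    }
  }
};
```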
- updateFunctionStub(PF); + updateFunctionStubUnlocked(PF); } } -void JIT::jitTheFunction(Function *F, const MutexGuard &locked) { +void JIT::jitTheFunctionUnlocked(Function *F) { isAlreadyCodeGenerating = true; - jitstate->getPM(locked).run(*F); + jitstate->getPM().run(*F); isAlreadyCodeGenerating = false; // clear basic block addresses after this function is done - getBasicBlockAddressMap(locked).clear(); + getBasicBlockAddressMap().clear(); } /// getPointerToFunction - This method is used to get the address of the @@ -526,7 +526,7 @@ void *JIT::getPointerToFunction(Function *F) { return Addr; } - runJITOnFunctionUnlocked(F, locked); + runJITOnFunctionUnlocked(F); void *Addr = getPointerToGlobalIfAvailable(F); assert(Addr && "Code generation didn't add function to GlobalAddress table!"); @@ -537,9 +537,9 @@ void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) { MutexGuard locked(lock); BasicBlockAddressMapTy::iterator I = - getBasicBlockAddressMap(locked).find(BB); - if (I == getBasicBlockAddressMap(locked).end()) { - getBasicBlockAddressMap(locked)[BB] = Addr; + getBasicBlockAddressMap().find(BB); + if (I == getBasicBlockAddressMap().end()) { + getBasicBlockAddressMap()[BB] = Addr; } else { // ignore repeats: some BBs can be split into few MBBs? } @@ -547,7 +547,7 @@ void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) { void JIT::clearPointerToBasicBlock(const BasicBlock *BB) { MutexGuard locked(lock); - getBasicBlockAddressMap(locked).erase(BB); + getBasicBlockAddressMap().erase(BB); } void *JIT::getPointerToBasicBlock(BasicBlock *BB) { @@ -558,8 +558,8 @@ void *JIT::getPointerToBasicBlock(BasicBlock *BB) { MutexGuard locked(lock); BasicBlockAddressMapTy::iterator I = - getBasicBlockAddressMap(locked).find(BB); - if (I != getBasicBlockAddressMap(locked).end()) { + getBasicBlockAddressMap().find(BB); + if (I != getBasicBlockAddressMap().end()) { return I->second; } else { llvm_unreachable("JIT does not have BB address for address-of-label, was" @@ -688,7 +688,7 @@ char* JIT::getMemoryForGV(const GlobalVariable* GV) { void JIT::addPendingFunction(Function *F) { MutexGuard locked(lock); - jitstate->getPendingFunctions(locked).push_back(F); + jitstate->getPendingFunctions().push_back(F); } diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index d2bd5089cb29..69a7c3670a87 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -39,12 +39,12 @@ class JITState { public: explicit JITState(Module *M) : PM(M), M(M) {} - FunctionPassManager &getPM(const MutexGuard &L) { + FunctionPassManager &getPM() { return PM; } Module *getModule() const { return M; } - std::vector > &getPendingFunctions(const MutexGuard &L){ + std::vector > &getPendingFunctions() { return PendingFunctions; } }; @@ -205,7 +205,7 @@ class JIT : public ExecutionEngine { void NotifyFreeingMachineCode(void *OldPtr); BasicBlockAddressMapTy & - getBasicBlockAddressMap(const MutexGuard &) { + getBasicBlockAddressMap() { return BasicBlockAddressMap; } @@ -213,9 +213,9 @@ class JIT : public ExecutionEngine { private: static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM, TargetMachine &tm); - void runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked); - void updateFunctionStub(Function *F); - void jitTheFunction(Function *F, const MutexGuard &locked); + void runJITOnFunctionUnlocked(Function *F); + void updateFunctionStubUnlocked(Function *F); + void jitTheFunctionUnlocked(Function *F); protected: diff --git 
a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index cd7a500f5115..50b8c10b638b 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Support/Debug.h" @@ -120,21 +121,16 @@ namespace { #endif } - FunctionToLazyStubMapTy& getFunctionToLazyStubMap( - const MutexGuard& locked) { - assert(locked.holds(TheJIT->lock)); + FunctionToLazyStubMapTy& getFunctionToLazyStubMap() { return FunctionToLazyStubMap; } - GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap(const MutexGuard& lck) { - assert(lck.holds(TheJIT->lock)); + GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap() { return GlobalToIndirectSymMap; } std::pair LookupFunctionFromCallSite( - const MutexGuard &locked, void *CallSite) const { - assert(locked.holds(TheJIT->lock)); - + void *CallSite) const { // The address given to us for the stub may not be exactly right, it // might be a little bit after the stub. As such, use upper_bound to // find it. @@ -146,9 +142,7 @@ namespace { return *I; } - void AddCallSite(const MutexGuard &locked, void *CallSite, Function *F) { - assert(locked.holds(TheJIT->lock)); - + void AddCallSite(void *CallSite, Function *F) { bool Inserted = CallSiteToFunctionMap.insert( std::make_pair(CallSite, F)).second; (void)Inserted; @@ -503,7 +497,7 @@ void *JITResolver::getLazyFunctionStubIfAvailable(Function *F) { MutexGuard locked(TheJIT->lock); // If we already have a stub for this function, recycle it. - return state.getFunctionToLazyStubMap(locked).lookup(F); + return state.getFunctionToLazyStubMap().lookup(F); } /// getFunctionStub - This returns a pointer to a function stub, creating @@ -512,7 +506,7 @@ void *JITResolver::getLazyFunctionStub(Function *F) { MutexGuard locked(TheJIT->lock); // If we already have a lazy stub for this function, recycle it. - void *&Stub = state.getFunctionToLazyStubMap(locked)[F]; + void *&Stub = state.getFunctionToLazyStubMap()[F]; if (Stub) return Stub; // Call the lazy resolver function if we are JIT'ing lazily. Otherwise we @@ -554,7 +548,7 @@ void *JITResolver::getLazyFunctionStub(Function *F) { // Finally, keep track of the stub-to-Function mapping so that the // JITCompilerFn knows which function to compile! - state.AddCallSite(locked, Stub, F); + state.AddCallSite(Stub, F); } else if (!Actual) { // If we are JIT'ing non-lazily but need to call a function that does not // exist yet, add it to the JIT's work list so that we can fill in the @@ -573,7 +567,7 @@ void *JITResolver::getGlobalValueIndirectSym(GlobalValue *GV, void *GVAddress) { MutexGuard locked(TheJIT->lock); // If we already have a stub for this global variable, recycle it. - void *&IndirectSym = state.getGlobalToIndirectSymMap(locked)[GV]; + void *&IndirectSym = state.getGlobalToIndirectSymMap()[GV]; if (IndirectSym) return IndirectSym; // Otherwise, codegen a new indirect symbol. @@ -633,7 +627,7 @@ void *JITResolver::JITCompilerFn(void *Stub) { // The address given to us for the stub may not be exactly right, it might // be a little bit after the stub. As such, use upper_bound to find it. 
std::pair I = - JR->state.LookupFunctionFromCallSite(locked, Stub); + JR->state.LookupFunctionFromCallSite(Stub); F = I.second; ActualPtr = I.first; } @@ -684,13 +678,23 @@ void *JITResolver::JITCompilerFn(void *Stub) { //===----------------------------------------------------------------------===// // JITEmitter code. // + +static GlobalObject *getSimpleAliasee(Constant *C) { + C = C->stripPointerCasts(); + return dyn_cast(C); +} + void *JITEmitter::getPointerToGlobal(GlobalValue *V, void *Reference, bool MayNeedFarStub) { if (GlobalVariable *GV = dyn_cast(V)) return TheJIT->getOrEmitGlobalVariable(GV); - if (GlobalAlias *GA = dyn_cast(V)) - return TheJIT->getPointerToGlobal(GA->getAliasee()); + if (GlobalAlias *GA = dyn_cast(V)) { + // We can only handle simple cases. + if (GlobalValue *GV = getSimpleAliasee(GA->getAliasee())) + return TheJIT->getPointerToGlobal(GV); + return nullptr; + } // If we have already compiled the function, return a pointer to its body. Function *F = cast(V); @@ -1225,7 +1229,7 @@ void *JIT::getPointerToFunctionOrStub(Function *F) { return JE->getJITResolver().getLazyFunctionStub(F); } -void JIT::updateFunctionStub(Function *F) { +void JIT::updateFunctionStubUnlocked(Function *F) { // Get the empty stub we generated earlier. JITEmitter *JE = static_cast(getCodeEmitter()); void *Stub = JE->getJITResolver().getLazyFunctionStub(F); diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 42cb4ea6d841..e9ba96a6496f 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -305,9 +305,13 @@ uint64_t MCJIT::getSymbolAddress(const std::string &Name, // Look for our symbols in each Archive object::Archive::child_iterator ChildIt = A->findSym(Name); if (ChildIt != A->child_end()) { - std::unique_ptr ChildBin; // FIXME: Support nested archives? - if (!ChildIt->getAsBinary(ChildBin) && ChildBin->isObject()) { + ErrorOr> ChildBinOrErr = + ChildIt->getAsBinary(); + if (ChildBinOrErr.getError()) + continue; + std::unique_ptr ChildBin = std::move(ChildBinOrErr.get()); + if (ChildBin->isObject()) { std::unique_ptr OF( static_cast(ChildBin.release())); // This causes the object file to be loaded. diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp index 9ceaa90b0dc3..59860844e939 100644 --- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp +++ b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp @@ -71,7 +71,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // // FIXME: Initialize the Near member for each memory group to avoid // interleaving. - error_code ec; + std::error_code ec; sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(RequiredSize, &MemGroup.Near, sys::Memory::MF_READ | @@ -105,7 +105,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) { // FIXME: Should in-progress permissions be reverted if an error occurs? - error_code ec; + std::error_code ec; // Don't allow free memory blocks to be used after setting protection flags. 
CodeMem.FreeMem.clear(); @@ -143,19 +143,20 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) return false; } -error_code SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup, - unsigned Permissions) { +std::error_code +SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup, + unsigned Permissions) { for (int i = 0, e = MemGroup.AllocatedMem.size(); i != e; ++i) { - error_code ec; - ec = sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i], - Permissions); - if (ec) { - return ec; - } + std::error_code ec; + ec = + sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i], Permissions); + if (ec) { + return ec; + } } - return error_code::success(); + return std::error_code(); } void SectionMemoryManager::invalidateInstructionCache() { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index c1eb0fd31f36..9dfd1678de8b 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -73,9 +73,9 @@ void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress, llvm_unreachable("Attempting to remap address of unknown section!"); } -static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) { +static std::error_code getOffset(const SymbolRef &Sym, uint64_t &Result) { uint64_t Address; - if (error_code EC = Sym.getAddress(Address)) + if (std::error_code EC = Sym.getAddress(Address)) return EC; if (Address == UnknownAddressOrSize) { @@ -85,7 +85,7 @@ static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) { const ObjectFile *Obj = Sym.getObject(); section_iterator SecI(Obj->section_begin()); - if (error_code EC = Sym.getSection(SecI)) + if (std::error_code EC = Sym.getSection(SecI)) return EC; if (SecI == Obj->section_end()) { @@ -94,7 +94,7 @@ static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) { } uint64_t SectionAddress; - if (error_code EC = SecI->getAddress(SectionAddress)) + if (std::error_code EC = SecI->getAddress(SectionAddress)) return EC; Result = Address - SectionAddress; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 6ba24b9b5971..56471f43b2e2 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -32,7 +32,7 @@ using namespace llvm::object; namespace { -static inline error_code check(error_code Err) { +static inline std::error_code check(std::error_code Err) { if (Err) { report_fatal_error(Err.message()); } @@ -55,9 +55,9 @@ template class DyldELFObject : public ELFObjectFile { public: DyldELFObject(std::unique_ptr UnderlyingFile, - MemoryBuffer *Wrapper, error_code &ec); + MemoryBuffer *Wrapper, std::error_code &ec); - DyldELFObject(MemoryBuffer *Wrapper, error_code &ec); + DyldELFObject(MemoryBuffer *Wrapper, std::error_code &ec); void updateSectionAddress(const SectionRef &Sec, uint64_t Addr); void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr); @@ -109,14 +109,14 @@ template class ELFObjectImage : public ObjectImageCommon { // actual memory. Ultimately, the Binary parent class will take ownership of // this MemoryBuffer object but not the underlying memory. 
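Several hunks in this region migrate from llvm::error_code to std::error_code; the one visible change in convention is that success is now a default-constructed std::error_code (replacing error_code::success()), which still tests as false. A small illustration (the function below is hypothetical):

```cpp
#include <cstdio>
#include <system_error>

// Hypothetical operation that reports failure through std::error_code.
static std::error_code protectPages(bool SimulateFailure) {
  if (SimulateFailure)
    return std::make_error_code(std::errc::permission_denied);
  return std::error_code();  // success; replaces error_code::success()
}

int main() {
  if (std::error_code EC = protectPages(false))
    std::printf("error: %s\n", EC.message().c_str());
  else
    std::printf("ok\n");
  return 0;
}
```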
template -DyldELFObject::DyldELFObject(MemoryBuffer *Wrapper, error_code &ec) +DyldELFObject::DyldELFObject(MemoryBuffer *Wrapper, std::error_code &ec) : ELFObjectFile(Wrapper, ec) { this->isDyldELFObject = true; } template DyldELFObject::DyldELFObject(std::unique_ptr UnderlyingFile, - MemoryBuffer *Wrapper, error_code &ec) + MemoryBuffer *Wrapper, std::error_code &ec) : ELFObjectFile(Wrapper, ec), UnderlyingFile(std::move(UnderlyingFile)) { this->isDyldELFObject = true; @@ -182,7 +182,7 @@ RuntimeDyldELF::createObjectImageFromFile(std::unique_ptr Ob if (!ObjFile) return nullptr; - error_code ec; + std::error_code ec; MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false); @@ -218,7 +218,7 @@ ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) { std::pair Ident = std::make_pair((uint8_t)Buffer->getBufferStart()[ELF::EI_CLASS], (uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]); - error_code ec; + std::error_code ec; if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) { auto Obj = diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index a526073bc046..b84883310b4f 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -20,10 +20,9 @@ using namespace llvm; namespace llvm { - namespace { // Helper for extensive error checking in debug builds. -error_code Check(error_code Err) { +std::error_code Check(std::error_code Err) { if (Err) { report_fatal_error(Err.message()); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 412cf20a5c7b..11cc3b246aca 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -28,8 +28,8 @@ #include "llvm/Support/Mutex.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include +#include using namespace llvm; using namespace llvm::object; @@ -245,14 +245,14 @@ class RuntimeDyldImpl { void writeInt16BE(uint8_t *Addr, uint16_t Value) { if (IsTargetLittleEndian) - Value = sys::SwapByteOrder(Value); + sys::swapByteOrder(Value); *Addr = (Value >> 8) & 0xFF; *(Addr + 1) = Value & 0xFF; } void writeInt32BE(uint8_t *Addr, uint32_t Value) { if (IsTargetLittleEndian) - Value = sys::SwapByteOrder(Value); + sys::swapByteOrder(Value); *Addr = (Value >> 24) & 0xFF; *(Addr + 1) = (Value >> 16) & 0xFF; *(Addr + 2) = (Value >> 8) & 0xFF; @@ -261,7 +261,7 @@ class RuntimeDyldImpl { void writeInt64BE(uint8_t *Addr, uint64_t Value) { if (IsTargetLittleEndian) - Value = sys::SwapByteOrder(Value); + sys::swapByteOrder(Value); *Addr = (Value >> 56) & 0xFF; *(Addr + 1) = (Value >> 48) & 0xFF; *(Addr + 2) = (Value >> 40) & 0xFF; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index 30529808d0d5..2b425fbdd339 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -168,8 +168,9 @@ void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE, case Triple::thumb: resolveARMRelocation(RE, Value); break; + case Triple::aarch64: case Triple::arm64: - resolveARM64Relocation(RE, Value); + resolveAArch64Relocation(RE, Value); break; } } @@ -289,8 +290,8 @@ bool RuntimeDyldMachO::resolveARMRelocation(const RelocationEntry &RE, return false; } -bool 
RuntimeDyldMachO::resolveARM64Relocation(const RelocationEntry &RE, - uint64_t Value) { +bool RuntimeDyldMachO::resolveAArch64Relocation(const RelocationEntry &RE, + uint64_t Value) { const SectionEntry &Section = Sections[RE.SectionID]; uint8_t* LocalAddress = Section.Address + RE.Offset; @@ -460,7 +461,7 @@ relocation_iterator RuntimeDyldMachO::processSECTDIFFRelocation( uint32_t AddrB = MachO->getScatteredRelocationValue(RE2); section_iterator SBI = getSectionByAddress(*MachO, AddrB); - assert(SBI != MachO->section_end() && "Can't find seciton for address B"); + assert(SBI != MachO->section_end() && "Can't find section for address B"); uint64_t SectionBBase; SBI->getAddress(SectionBBase); uint64_t SectionBOffset = AddrB - SectionBBase; @@ -483,7 +484,48 @@ relocation_iterator RuntimeDyldMachO::processSECTDIFFRelocation( addRelocationForSection(R, SectionAID); addRelocationForSection(R, SectionBID); - return RelI; + return ++RelI; +} + +relocation_iterator RuntimeDyldMachO::processI386ScatteredVANILLA( + unsigned SectionID, + relocation_iterator RelI, + ObjectImage &Obj, + ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile *MachO = + static_cast(Obj.getObjectFile()); + MachO::any_relocation_info RE = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = MachO->getAnyRelocationType(RE); + bool IsPCRel = MachO->getAnyRelocationPCRel(RE); + unsigned Size = MachO->getAnyRelocationLength(RE); + uint64_t Offset; + RelI->getOffset(Offset); + uint8_t *LocalAddress = Section.Address + Offset; + unsigned NumBytes = 1 << Size; + int64_t Addend = 0; + memcpy(&Addend, LocalAddress, NumBytes); + + unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE); + section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr); + assert(TargetSI != MachO->section_end() && "Can't find section for symbol"); + uint64_t SectionBaseAddr; + TargetSI->getAddress(SectionBaseAddr); + SectionRef TargetSection = *TargetSI; + bool IsCode; + TargetSection.isText(IsCode); + uint32_t TargetSectionID = findOrEmitSection(Obj, TargetSection, IsCode, + ObjSectionToID); + + Addend -= SectionBaseAddr; + RelocationEntry R(SectionID, Offset, RelocType, Addend, + IsPCRel, Size); + + addRelocationForSection(R, TargetSectionID); + + return ++RelI; } relocation_iterator RuntimeDyldMachO::processRelocationRef( @@ -498,17 +540,22 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef( uint32_t RelType = MachO->getAnyRelocationType(RE); // FIXME: Properly handle scattered relocations. - // For now, optimistically skip these: they can often be ignored, as - // the static linker will already have applied the relocation, and it - // only needs to be reapplied if symbols move relative to one another. - // Note: This will fail horribly where the relocations *do* need to be - // applied, but that was already the case. + // Special case the couple of scattered relocations that we know how + // to handle: SECTDIFF relocations, and scattered VANILLA relocations + // on I386. + // For all other scattered relocations, just bail out and hope for the + // best, since the offsets computed by scattered relocations have often + // been optimisticaly filled in by the compiler. This will fail + // horribly where the relocations *do* need to be applied, but that was + // already the case. 
if (MachO->isRelocationScattered(RE)) { if (RelType == MachO::GENERIC_RELOC_SECTDIFF || RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) return processSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); - - return ++RelI; + else if (Arch == Triple::x86 && RelType == MachO::GENERIC_RELOC_VANILLA) + return processI386ScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); + else + return ++RelI; } RelocationValueRef Value; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 138c59b95cea..060eb8c29a2b 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -28,23 +28,20 @@ class RuntimeDyldMachO : public RuntimeDyldImpl { private: /// Write the least significant 'Size' bytes in 'Value' out at the address - /// pointed to by Addr. Check for overflow. + /// pointed to by Addr. bool applyRelocationValue(uint8_t *Addr, uint64_t Value, unsigned Size) { for (unsigned i = 0; i < Size; ++i) { *Addr++ = (uint8_t)Value; Value >>= 8; } - if (Value) // Catch overflow - return Error("Relocation out of range."); - return false; } bool resolveI386Relocation(const RelocationEntry &RE, uint64_t Value); bool resolveX86_64Relocation(const RelocationEntry &RE, uint64_t Value); bool resolveARMRelocation(const RelocationEntry &RE, uint64_t Value); - bool resolveARM64Relocation(const RelocationEntry &RE, uint64_t Value); + bool resolveAArch64Relocation(const RelocationEntry &RE, uint64_t Value); // Populate stubs in __jump_table section. void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection, @@ -71,6 +68,12 @@ class RuntimeDyldMachO : public RuntimeDyldImpl { ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID); + relocation_iterator processI386ScatteredVANILLA( + unsigned SectionID, + relocation_iterator RelI, + ObjectImage &ObjImg, + ObjSectionToIDMap &ObjSectionToID); + struct EHFrameRelatedSections { EHFrameRelatedSections() : EHFrameSID(RTDYLD_INVALID_SECTION_ID), diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 0fef0d0a188f..398e3d5f2a1b 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -1451,10 +1451,11 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { PrintVisibility(GV->getVisibility(), Out); PrintDLLStorageClass(GV->getDLLStorageClass(), Out); PrintThreadLocalModel(GV->getThreadLocalMode(), Out); + if (GV->hasUnnamedAddr()) + Out << "unnamed_addr "; if (unsigned AddressSpace = GV->getType()->getAddressSpace()) Out << "addrspace(" << AddressSpace << ") "; - if (GV->hasUnnamedAddr()) Out << "unnamed_addr "; if (GV->isExternallyInitialized()) Out << "externally_initialized "; Out << (GV->isConstant() ? 
"constant " : "global "); TypePrinter.print(GV->getType()->getElementType(), Out); @@ -1488,21 +1489,18 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) { } PrintVisibility(GA->getVisibility(), Out); PrintDLLStorageClass(GA->getDLLStorageClass(), Out); + PrintThreadLocalModel(GA->getThreadLocalMode(), Out); + if (GA->hasUnnamedAddr()) + Out << "unnamed_addr "; Out << "alias "; PrintLinkage(GA->getLinkage(), Out); - PointerType *Ty = GA->getType(); const Constant *Aliasee = GA->getAliasee(); - if (!Aliasee || Ty != Aliasee->getType()) { - if (unsigned AddressSpace = Ty->getAddressSpace()) - Out << "addrspace(" << AddressSpace << ") "; - TypePrinter.print(Ty->getElementType(), Out); - Out << ", "; - } if (!Aliasee) { + TypePrinter.print(GA->getType(), Out); Out << " <>"; } else { writeOperand(Aliasee, !isa(Aliasee)); @@ -1788,6 +1786,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { (isa(I) && cast(I).isAtomic())) Out << " atomic"; + if (isa(I) && cast(I).isWeak()) + Out << " weak"; + // If this is a volatile operation, print out the volatile marker. if ((isa(I) && cast(I).isVolatile()) || (isa(I) && cast(I).isVolatile()) || @@ -2158,10 +2159,6 @@ void NamedMDNode::print(raw_ostream &ROS) const { } void Type::print(raw_ostream &OS) const { - if (!this) { - OS << ""; - return; - } TypePrinting TP; TP.print(const_cast(this), OS); @@ -2174,10 +2171,6 @@ void Type::print(raw_ostream &OS) const { } void Value::print(raw_ostream &ROS) const { - if (!this) { - ROS << "printing a value\n"; - return; - } formatted_raw_ostream OS(ROS); if (const Instruction *I = dyn_cast(this)) { const Function *F = I->getParent() ? I->getParent()->getParent() : nullptr; diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index a9074bb294d9..58475e2d3cba 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -173,6 +173,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const { return "inlinehint"; if (hasAttribute(Attribute::InReg)) return "inreg"; + if (hasAttribute(Attribute::JumpTable)) + return "jumptable"; if (hasAttribute(Attribute::MinSize)) return "minsize"; if (hasAttribute(Attribute::Naked)) @@ -395,6 +397,7 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { case Attribute::OptimizeNone: return 1ULL << 42; case Attribute::InAlloca: return 1ULL << 43; case Attribute::NonNull: return 1ULL << 44; + case Attribute::JumpTable: return 1ULL << 45; } llvm_unreachable("Unsupported attribute type"); } diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index e255113aab7e..05b3745ab0f9 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -114,6 +114,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "x86.avx.movnt.pd.256" || Name == "x86.avx.movnt.ps.256" || Name == "x86.sse42.crc32.64.8" || + Name == "x86.avx.vbroadcast.ss" || + Name == "x86.avx.vbroadcast.ss.256" || + Name == "x86.avx.vbroadcast.sd.256" || (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { NewFn = nullptr; return true; @@ -335,6 +338,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); Rep = Builder.CreateCall2(CRC32, Trunc0, CI->getArgOperand(1)); Rep = Builder.CreateZExt(Rep, CI->getType(), ""); + } else if (Name.startswith("llvm.x86.avx.vbroadcast")) { + // Replace broadcasts with a series of insertelements. 
+ Type *VecTy = CI->getType(); + Type *EltTy = VecTy->getVectorElementType(); + unsigned EltNum = VecTy->getVectorNumElements(); + Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0), + EltTy->getPointerTo()); + Value *Load = Builder.CreateLoad(Cast); + Type *I32Ty = Type::getInt32Ty(C); + Rep = UndefValue::get(VecTy); + for (unsigned I = 0; I < EltNum; ++I) + Rep = Builder.CreateInsertElement(Rep, Load, + ConstantInt::get(I32Ty, I)); } else { bool PD128 = false, PD256 = false, PS128 = false, PS256 = false; if (Name == "llvm.x86.avx.vpermil.pd.256") diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 706e66fb422e..c23ab71eaf3a 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -529,7 +529,10 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, // Try hard to fold cast of cast because they are often eliminable. if (unsigned newOpc = foldConstantCastPair(opc, CE, DestTy)) return ConstantExpr::getCast(newOpc, CE->getOperand(0), DestTy); - } else if (CE->getOpcode() == Instruction::GetElementPtr) { + } else if (CE->getOpcode() == Instruction::GetElementPtr && + // Do not fold addrspacecast (gep 0, .., 0). It might make the + // addrspacecast uncanonicalized. + opc != Instruction::AddrSpaceCast) { // If all of the indexes in the GEP are null values, there is no pointer // adjustment going on. We might as well cast the source pointer. bool isAllNull = true; diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index bb8d60b234f3..aa26cff6a7b6 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -1698,6 +1698,19 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy) { assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) && "Invalid constantexpr addrspacecast!"); + // Canonicalize addrspacecasts between different pointer types by first + // bitcasting the pointer type and then converting the address space. + PointerType *SrcScalarTy = cast(C->getType()->getScalarType()); + PointerType *DstScalarTy = cast(DstTy->getScalarType()); + Type *DstElemTy = DstScalarTy->getElementType(); + if (SrcScalarTy->getElementType() != DstElemTy) { + Type *MidTy = PointerType::get(DstElemTy, SrcScalarTy->getAddressSpace()); + if (VectorType *VT = dyn_cast(DstTy)) { + // Handle vectors of pointers. 
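Back in the AutoUpgrade hunk above, the retired x86.avx.vbroadcast intrinsics are rewritten as one scalar load followed by an insertelement per lane. The value computed is simply a splat; a plain C++ sketch of the equivalent (types and driver are illustrative):

```cpp
#include <array>
#include <cstdio>

// Load one scalar through the pointer and replicate it into every lane,
// which is what the load + insertelement sequence above computes.
template <typename T, std::size_t N>
static std::array<T, N> broadcast(const T *Ptr) {
  std::array<T, N> V{};
  T Scalar = *Ptr;                 // the single scalar load
  for (std::size_t I = 0; I < N; ++I)
    V[I] = Scalar;                 // one "insertelement" per lane
  return V;
}

int main() {
  float X = 2.5f;
  auto V = broadcast<float, 8>(&X);  // shape of x86.avx.vbroadcast.ss.256
  std::printf("%f %f\n", V[0], V[7]);
  return 0;
}
```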
+ MidTy = VectorType::get(MidTy, VT->getNumElements()); + } + C = getBitCast(C, MidTy); + } return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy); } diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 27ce503e1c34..197b6cb9054e 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -17,9 +17,9 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" @@ -35,10 +35,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include #include #include +#include using namespace llvm; @@ -281,6 +281,7 @@ char *LLVMPrintTypeToString(LLVMTypeRef Ty) { std::string buf; raw_string_ostream os(buf); + assert(unwrap(Ty) != nullptr && "Expecting non-null Type"); unwrap(Ty)->print(os); os.flush(); @@ -531,6 +532,7 @@ char* LLVMPrintValueToString(LLVMValueRef Val) { std::string buf; raw_string_ostream os(buf); + assert(unwrap(Val) != nullptr && "Expecting non-null Value"); unwrap(Val)->print(os); os.flush(); @@ -1286,7 +1288,7 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) { } const char *LLVMGetSection(LLVMValueRef Global) { - return unwrap(Global)->getSection().c_str(); + return unwrap(Global)->getSection(); } void LLVMSetSection(LLVMValueRef Global, const char *Section) { @@ -2599,7 +2601,7 @@ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile( char **OutMessage) { std::unique_ptr MB; - error_code ec; + std::error_code ec; if (!(ec = MemoryBuffer::getFile(Path, MB))) { *OutMemBuf = wrap(MB.release()); return 0; @@ -2612,7 +2614,7 @@ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile( LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf, char **OutMessage) { std::unique_ptr MB; - error_code ec; + std::error_code ec; if (!(ec = MemoryBuffer::getSTDIN(MB))) { *OutMemBuf = wrap(MB.release()); return 0; diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index 43360d38662c..e8bdccebae96 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -76,7 +76,7 @@ MDNode *DebugLoc::getScopeNode(const LLVMContext &Ctx) const { return getScope(Ctx); } -DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) { +DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) const { const MDNode *Scope = getScopeNode(Ctx); DISubprogram SP = getDISubprogram(Scope); if (SP.isSubprogram()) { diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 3f8100f985d7..27270636004f 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -12,6 +12,7 @@ // Diagnostics reporting is still done as part of the LLVMContext. //===----------------------------------------------------------------------===// +#include "LLVMContextImpl.h" #include "llvm/ADT/Twine.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -22,10 +23,69 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Support/Atomic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" #include using namespace llvm; +namespace { + +/// \brief Regular expression corresponding to the value given in one of the +/// -pass-remarks* command line flags. 
Passes whose name matches this regexp +/// will emit a diagnostic when calling the associated diagnostic function +/// (emitOptimizationRemark, emitOptimizationRemarkMissed or +/// emitOptimizationRemarkAnalysis). +struct PassRemarksOpt { + std::shared_ptr Pattern; + + void operator=(const std::string &Val) { + // Create a regexp object to match pass names for emitOptimizationRemark. + if (!Val.empty()) { + Pattern = std::make_shared(Val); + std::string RegexError; + if (!Pattern->isValid(RegexError)) + report_fatal_error("Invalid regular expression '" + Val + + "' in -pass-remarks: " + RegexError, + false); + } + }; +}; + +static PassRemarksOpt PassRemarksOptLoc; +static PassRemarksOpt PassRemarksMissedOptLoc; +static PassRemarksOpt PassRemarksAnalysisOptLoc; + +// -pass-remarks +// Command line flag to enable emitOptimizationRemark() +static cl::opt> +PassRemarks("pass-remarks", cl::value_desc("pattern"), + cl::desc("Enable optimization remarks from passes whose name match " + "the given regular expression"), + cl::Hidden, cl::location(PassRemarksOptLoc), cl::ValueRequired, + cl::ZeroOrMore); + +// -pass-remarks-missed +// Command line flag to enable emitOptimizationRemarkMissed() +static cl::opt> PassRemarksMissed( + "pass-remarks-missed", cl::value_desc("pattern"), + cl::desc("Enable missed optimization remarks from passes whose name match " + "the given regular expression"), + cl::Hidden, cl::location(PassRemarksMissedOptLoc), cl::ValueRequired, + cl::ZeroOrMore); + +// -pass-remarks-analysis +// Command line flag to enable emitOptimizationRemarkAnalysis() +static cl::opt> +PassRemarksAnalysis( + "pass-remarks-analysis", cl::value_desc("pattern"), + cl::desc( + "Enable optimization analysis remarks from passes whose name match " + "the given regular expression"), + cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired, + cl::ZeroOrMore); +} + int llvm::getNextAvailablePluginDiagnosticKind() { static sys::cas_flag PluginKindID = DK_FirstPluginKind; return (int)sys::AtomicIncrement(&PluginKindID); @@ -67,20 +127,20 @@ void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const { DP << getMsg(); } -bool DiagnosticInfoOptimizationRemark::isLocationAvailable() const { - return getFunction().getParent()->getNamedMetadata("llvm.dbg.cu") != nullptr; +bool DiagnosticInfoOptimizationRemarkBase::isLocationAvailable() const { + return getDebugLoc().isUnknown() == false; } -void DiagnosticInfoOptimizationRemark::getLocation(StringRef *Filename, - unsigned *Line, - unsigned *Column) const { +void DiagnosticInfoOptimizationRemarkBase::getLocation(StringRef *Filename, + unsigned *Line, + unsigned *Column) const { DILocation DIL(getDebugLoc().getAsMDNode(getFunction().getContext())); *Filename = DIL.getFilename(); *Line = DIL.getLineNumber(); *Column = DIL.getColumnNumber(); } -const std::string DiagnosticInfoOptimizationRemark::getLocationStr() const { +const std::string DiagnosticInfoOptimizationRemarkBase::getLocationStr() const { StringRef Filename(""); unsigned Line = 0; unsigned Column = 0; @@ -89,6 +149,43 @@ const std::string DiagnosticInfoOptimizationRemark::getLocationStr() const { return Twine(Filename + ":" + Twine(Line) + ":" + Twine(Column)).str(); } -void DiagnosticInfoOptimizationRemark::print(DiagnosticPrinter &DP) const { +void DiagnosticInfoOptimizationRemarkBase::print(DiagnosticPrinter &DP) const { DP << getLocationStr() << ": " << getMsg(); } + +bool DiagnosticInfoOptimizationRemark::isEnabled() const { + return PassRemarksOptLoc.Pattern && + 
PassRemarksOptLoc.Pattern->match(getPassName()); +} + +bool DiagnosticInfoOptimizationRemarkMissed::isEnabled() const { + return PassRemarksMissedOptLoc.Pattern && + PassRemarksMissedOptLoc.Pattern->match(getPassName()); +} + +bool DiagnosticInfoOptimizationRemarkAnalysis::isEnabled() const { + return PassRemarksAnalysisOptLoc.Pattern && + PassRemarksAnalysisOptLoc.Pattern->match(getPassName()); +} + +void llvm::emitOptimizationRemark(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemark(PassName, Fn, DLoc, Msg)); +} + +void llvm::emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemarkMissed(PassName, Fn, DLoc, Msg)); +} + +void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx, + const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose( + DiagnosticInfoOptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg)); +} diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp index f2099d6baf13..b4c7977d52e5 100644 --- a/lib/IR/GCOV.cpp +++ b/lib/IR/GCOV.cpp @@ -19,8 +19,8 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/Path.h" -#include "llvm/Support/system_error.h" #include +#include using namespace llvm; //===----------------------------------------------------------------------===// @@ -438,7 +438,7 @@ class LineConsumer { StringRef Remaining; public: LineConsumer(StringRef Filename) { - if (error_code EC = MemoryBuffer::getFileOrSTDIN(Filename, Buffer)) { + if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Filename, Buffer)) { errs() << Filename << ": " << EC.message() << "\n"; Remaining = ""; } else diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index c905cfe31e2d..5410cc031dad 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LeakDetector.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -58,10 +59,22 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { setDLLStorageClass(Src->getDLLStorageClass()); } -unsigned GlobalValue::getAlignment() const { - if (auto *GA = dyn_cast(this)) - return GA->getAliasee()->getAlignment(); +static const GlobalObject *getBaseObject(const Constant &C) { + // FIXME: We should probably return a base + offset pair for non-zero GEPs. + return dyn_cast(C.stripPointerCasts()); +} +unsigned GlobalValue::getAlignment() const { + if (auto *GA = dyn_cast(this)) { + // In general we cannot compute this at the IR level, but we try. + if (const GlobalObject *GO = getBaseObject(*GA->getAliasee())) + return GO->getAlignment(); + + // FIXME: we should also be able to handle: + // Alias = Global + Offset + // Alias = Absolute + return 0; + } return cast(this)->getAlignment(); } @@ -80,9 +93,13 @@ void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { setSection(GV->getSection()); } -const std::string &GlobalValue::getSection() const { - if (auto *GA = dyn_cast(this)) - return GA->getAliasee()->getSection(); +const char *GlobalValue::getSection() const { + if (auto *GA = dyn_cast(this)) { + // In general we cannot compute this at the IR level, but we try. 
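The DiagnosticInfo changes above add emitOptimizationRemark / emitOptimizationRemarkMissed / emitOptimizationRemarkAnalysis and gate them on the new -pass-remarks* regex flags. A sketch of how a pass might call the first one, using the signature from the hunk (the pass name, message, and helper below are hypothetical):

```cpp
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Emits a remark that is only printed when the user passes something like
// -pass-remarks=my-unroll (the flag matches the pass name by regex).
static void reportUnrolled(Function &F, const Instruction &Loc,
                           unsigned Factor) {
  emitOptimizationRemark(F.getContext(), "my-unroll", F, Loc.getDebugLoc(),
                         "unrolled loop by a factor of " + Twine(Factor));
}
```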
+ if (const GlobalObject *GO = getBaseObject(*GA->getAliasee())) + return GO->getSection(); + return ""; + } return cast(this)->getSection(); } @@ -113,8 +130,9 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits::op_begin(this), InitVal != nullptr, Link, Name), - isConstantGlobal(constant), threadLocalMode(TLMode), + isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { + setThreadLocalMode(TLMode); if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); @@ -132,8 +150,9 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits::op_begin(this), InitVal != nullptr, Link, Name), - isConstantGlobal(constant), threadLocalMode(TLMode), + isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { + setThreadLocalMode(TLMode); if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); @@ -214,7 +233,7 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { //===----------------------------------------------------------------------===// GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, - const Twine &Name, GlobalObject *Aliasee, + const Twine &Name, Constant *Aliasee, Module *ParentModule) : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) { @@ -227,7 +246,7 @@ GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, LinkageTypes Link, const Twine &Name, - GlobalObject *Aliasee, Module *ParentModule) { + Constant *Aliasee, Module *ParentModule) { return new GlobalAlias(Ty, AddressSpace, Link, Name, Aliasee, ParentModule); } @@ -239,18 +258,18 @@ GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, - GlobalObject *Aliasee) { + GlobalValue *Aliasee) { return create(Ty, AddressSpace, Linkage, Name, Aliasee, Aliasee->getParent()); } GlobalAlias *GlobalAlias::create(LinkageTypes Link, const Twine &Name, - GlobalObject *Aliasee) { + GlobalValue *Aliasee) { PointerType *PTy = Aliasee->getType(); return create(PTy->getElementType(), PTy->getAddressSpace(), Link, Name, Aliasee); } -GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalObject *Aliasee) { +GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalValue *Aliasee) { return create(Aliasee->getLinkage(), Name, Aliasee); } @@ -270,4 +289,8 @@ void GlobalAlias::eraseFromParent() { getParent()->getAliasList().erase(this); } -void GlobalAlias::setAliasee(GlobalObject *Aliasee) { setOperand(0, Aliasee); } +void GlobalAlias::setAliasee(Constant *Aliasee) { + assert((!Aliasee || Aliasee->getType() == getType()) && + "Alias and aliasee types should match!"); + setOperand(0, Aliasee); +} diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp index 099c27cfa5c0..c8a17479d8a0 100644 --- a/lib/IR/IRPrintingPasses.cpp +++ b/lib/IR/IRPrintingPasses.cpp @@ -94,7 +94,7 @@ class PrintBasicBlockPass : public BasicBlockPass { return false; } - void getAnalysisUsage(AnalysisUsage &AU) const override{ + void getAnalysisUsage(AnalysisUsage &AU) const override { 
    AU.setPreservesAll();
  }
 };
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index e638540b2c90..86421c4ae9ff 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -145,31 +145,31 @@ void Instruction::setFastMathFlags(FastMathFlags FMF) {
 /// Determine whether the unsafe-algebra flag is set.
 bool Instruction::hasUnsafeAlgebra() const {
-  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
 }

 /// Determine whether the no-NaNs flag is set.
 bool Instruction::hasNoNaNs() const {
-  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->hasNoNaNs();
 }

 /// Determine whether the no-infs flag is set.
 bool Instruction::hasNoInfs() const {
-  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->hasNoInfs();
 }

 /// Determine whether the no-signed-zeros flag is set.
 bool Instruction::hasNoSignedZeros() const {
-  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->hasNoSignedZeros();
 }

 /// Determine whether the allow-reciprocal flag is set.
 bool Instruction::hasAllowReciprocal() const {
-  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->hasAllowReciprocal();
 }

@@ -177,7 +177,7 @@ bool Instruction::hasAllowReciprocal() const {
 /// operator which supports these flags. See LangRef.html for the meaning of
 /// these flats.
 FastMathFlags Instruction::getFastMathFlags() const {
-  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->getFastMathFlags();
 }

@@ -262,6 +262,59 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
   }
 }

+/// Return true if both instructions have the same special state
+/// This must be kept in sync with lib/Transforms/IPO/MergeFunctions.cpp.
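// --- Illustrative usage sketch (not taken from this patch) for the fast-math
// accessors in the Instruction.cpp hunk above: they assert that the
// instruction is an FPMathOperator, so callers are expected to check that
// first. The helper name is hypothetical.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

static bool mayReassociate(const Instruction &I) {
  // Only floating-point operators carry fast-math flags.
  return isa<FPMathOperator>(&I) && I.hasUnsafeAlgebra();
}
// --- end sketch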
+static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2, + bool IgnoreAlignment = false) { + assert(I1->getOpcode() == I2->getOpcode() && + "Can not compare special state of different instructions"); + + if (const LoadInst *LI = dyn_cast(I1)) + return LI->isVolatile() == cast(I2)->isVolatile() && + (LI->getAlignment() == cast(I2)->getAlignment() || + IgnoreAlignment) && + LI->getOrdering() == cast(I2)->getOrdering() && + LI->getSynchScope() == cast(I2)->getSynchScope(); + if (const StoreInst *SI = dyn_cast(I1)) + return SI->isVolatile() == cast(I2)->isVolatile() && + (SI->getAlignment() == cast(I2)->getAlignment() || + IgnoreAlignment) && + SI->getOrdering() == cast(I2)->getOrdering() && + SI->getSynchScope() == cast(I2)->getSynchScope(); + if (const CmpInst *CI = dyn_cast(I1)) + return CI->getPredicate() == cast(I2)->getPredicate(); + if (const CallInst *CI = dyn_cast(I1)) + return CI->isTailCall() == cast(I2)->isTailCall() && + CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == cast(I2)->getAttributes(); + if (const InvokeInst *CI = dyn_cast(I1)) + return CI->getCallingConv() == cast(I2)->getCallingConv() && + CI->getAttributes() == + cast(I2)->getAttributes(); + if (const InsertValueInst *IVI = dyn_cast(I1)) + return IVI->getIndices() == cast(I2)->getIndices(); + if (const ExtractValueInst *EVI = dyn_cast(I1)) + return EVI->getIndices() == cast(I2)->getIndices(); + if (const FenceInst *FI = dyn_cast(I1)) + return FI->getOrdering() == cast(I2)->getOrdering() && + FI->getSynchScope() == cast(I2)->getSynchScope(); + if (const AtomicCmpXchgInst *CXI = dyn_cast(I1)) + return CXI->isVolatile() == cast(I2)->isVolatile() && + CXI->isWeak() == cast(I2)->isWeak() && + CXI->getSuccessOrdering() == + cast(I2)->getSuccessOrdering() && + CXI->getFailureOrdering() == + cast(I2)->getFailureOrdering() && + CXI->getSynchScope() == cast(I2)->getSynchScope(); + if (const AtomicRMWInst *RMWI = dyn_cast(I1)) + return RMWI->getOperation() == cast(I2)->getOperation() && + RMWI->isVolatile() == cast(I2)->isVolatile() && + RMWI->getOrdering() == cast(I2)->getOrdering() && + RMWI->getSynchScope() == cast(I2)->getSynchScope(); + + return true; +} + /// isIdenticalTo - Return true if the specified instruction is exactly /// identical to the current one. This means that all operands match and any /// extra information (e.g. load is volatile) agree. @@ -279,56 +332,22 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const { getType() != I->getType()) return false; + // If both instructions have no operands, they are identical. + if (getNumOperands() == 0 && I->getNumOperands() == 0) + return haveSameSpecialState(this, I); + // We have two instructions of identical opcode and #operands. Check to see // if all operands are the same. if (!std::equal(op_begin(), op_end(), I->op_begin())) return false; - // Check special state that is a part of some instructions. 
- if (const LoadInst *LI = dyn_cast(this)) - return LI->isVolatile() == cast(I)->isVolatile() && - LI->getAlignment() == cast(I)->getAlignment() && - LI->getOrdering() == cast(I)->getOrdering() && - LI->getSynchScope() == cast(I)->getSynchScope(); - if (const StoreInst *SI = dyn_cast(this)) - return SI->isVolatile() == cast(I)->isVolatile() && - SI->getAlignment() == cast(I)->getAlignment() && - SI->getOrdering() == cast(I)->getOrdering() && - SI->getSynchScope() == cast(I)->getSynchScope(); - if (const CmpInst *CI = dyn_cast(this)) - return CI->getPredicate() == cast(I)->getPredicate(); - if (const CallInst *CI = dyn_cast(this)) - return CI->isTailCall() == cast(I)->isTailCall() && - CI->getCallingConv() == cast(I)->getCallingConv() && - CI->getAttributes() == cast(I)->getAttributes(); - if (const InvokeInst *CI = dyn_cast(this)) - return CI->getCallingConv() == cast(I)->getCallingConv() && - CI->getAttributes() == cast(I)->getAttributes(); - if (const InsertValueInst *IVI = dyn_cast(this)) - return IVI->getIndices() == cast(I)->getIndices(); - if (const ExtractValueInst *EVI = dyn_cast(this)) - return EVI->getIndices() == cast(I)->getIndices(); - if (const FenceInst *FI = dyn_cast(this)) - return FI->getOrdering() == cast(FI)->getOrdering() && - FI->getSynchScope() == cast(FI)->getSynchScope(); - if (const AtomicCmpXchgInst *CXI = dyn_cast(this)) - return CXI->isVolatile() == cast(I)->isVolatile() && - CXI->getSuccessOrdering() == - cast(I)->getSuccessOrdering() && - CXI->getFailureOrdering() == - cast(I)->getFailureOrdering() && - CXI->getSynchScope() == cast(I)->getSynchScope(); - if (const AtomicRMWInst *RMWI = dyn_cast(this)) - return RMWI->getOperation() == cast(I)->getOperation() && - RMWI->isVolatile() == cast(I)->isVolatile() && - RMWI->getOrdering() == cast(I)->getOrdering() && - RMWI->getSynchScope() == cast(I)->getSynchScope(); if (const PHINode *thisPHI = dyn_cast(this)) { const PHINode *otherPHI = cast(I); return std::equal(thisPHI->block_begin(), thisPHI->block_end(), otherPHI->block_begin()); } - return true; + + return haveSameSpecialState(this, I); } // isSameOperationAs @@ -355,50 +374,7 @@ bool Instruction::isSameOperationAs(const Instruction *I, getOperand(i)->getType() != I->getOperand(i)->getType()) return false; - // Check special state that is a part of some instructions. 
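// --- Illustrative usage sketch (not taken from this patch) for the predicates
// being refactored here: the per-opcode comparisons now live in
// haveSameSpecialState(), while callers keep using the public entry points.
// The helper below and its name are hypothetical.
#include "llvm/IR/Instruction.h"
using namespace llvm;

static bool canReuseEarlierInst(const Instruction *Earlier,
                                const Instruction *Later) {
  // Identical opcode, operands, type and special state (volatile, atomic
  // ordering, call attributes, ...).
  if (Later->isIdenticalTo(Earlier))
    return true;
  // Same operation and special state, even if the operand values differ.
  return Later->isSameOperationAs(Earlier);
}
// --- end sketch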
- if (const LoadInst *LI = dyn_cast(this)) - return LI->isVolatile() == cast(I)->isVolatile() && - (LI->getAlignment() == cast(I)->getAlignment() || - IgnoreAlignment) && - LI->getOrdering() == cast(I)->getOrdering() && - LI->getSynchScope() == cast(I)->getSynchScope(); - if (const StoreInst *SI = dyn_cast(this)) - return SI->isVolatile() == cast(I)->isVolatile() && - (SI->getAlignment() == cast(I)->getAlignment() || - IgnoreAlignment) && - SI->getOrdering() == cast(I)->getOrdering() && - SI->getSynchScope() == cast(I)->getSynchScope(); - if (const CmpInst *CI = dyn_cast(this)) - return CI->getPredicate() == cast(I)->getPredicate(); - if (const CallInst *CI = dyn_cast(this)) - return CI->isTailCall() == cast(I)->isTailCall() && - CI->getCallingConv() == cast(I)->getCallingConv() && - CI->getAttributes() == cast(I)->getAttributes(); - if (const InvokeInst *CI = dyn_cast(this)) - return CI->getCallingConv() == cast(I)->getCallingConv() && - CI->getAttributes() == - cast(I)->getAttributes(); - if (const InsertValueInst *IVI = dyn_cast(this)) - return IVI->getIndices() == cast(I)->getIndices(); - if (const ExtractValueInst *EVI = dyn_cast(this)) - return EVI->getIndices() == cast(I)->getIndices(); - if (const FenceInst *FI = dyn_cast(this)) - return FI->getOrdering() == cast(I)->getOrdering() && - FI->getSynchScope() == cast(I)->getSynchScope(); - if (const AtomicCmpXchgInst *CXI = dyn_cast(this)) - return CXI->isVolatile() == cast(I)->isVolatile() && - CXI->getSuccessOrdering() == - cast(I)->getSuccessOrdering() && - CXI->getFailureOrdering() == - cast(I)->getFailureOrdering() && - CXI->getSynchScope() == cast(I)->getSynchScope(); - if (const AtomicRMWInst *RMWI = dyn_cast(this)) - return RMWI->getOperation() == cast(I)->getOperation() && - RMWI->isVolatile() == cast(I)->isVolatile() && - RMWI->getOrdering() == cast(I)->getOrdering() && - RMWI->getSynchScope() == cast(I)->getSynchScope(); - - return true; + return haveSameSpecialState(this, I, IgnoreAlignment); } /// isUsedOutsideOfBlock - Return true if there are any uses of I outside of the diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 13c51b817c0b..a5ceacb5637c 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -248,7 +248,7 @@ void LandingPadInst::growOperands(unsigned Size) { Use::zap(OldOps, OldOps + e, true); } -void LandingPadInst::addClause(Value *Val) { +void LandingPadInst::addClause(Constant *Val) { unsigned OpNo = getNumOperands(); growOperands(1); assert(OpNo < ReservedSpace && "Growing didn't work!"); @@ -1251,10 +1251,11 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope, Instruction *InsertBefore) - : Instruction(Cmp->getType(), AtomicCmpXchg, - OperandTraits::op_begin(this), - OperandTraits::operands(this), - InsertBefore) { + : Instruction( + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()), + nullptr), + AtomicCmpXchg, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope); } @@ -1263,13 +1264,14 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope, BasicBlock *InsertAtEnd) - : Instruction(Cmp->getType(), AtomicCmpXchg, - OperandTraits::op_begin(this), - OperandTraits::operands(this), - InsertAtEnd) { + : Instruction( + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()), + 
nullptr), + AtomicCmpXchg, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertAtEnd) { Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope); } - + //===----------------------------------------------------------------------===// // AtomicRMWInst Implementation //===----------------------------------------------------------------------===// @@ -2331,18 +2333,12 @@ unsigned CastInst::isEliminableCastPair( // Allowed, use first cast's opcode return firstOp; case 14: - // FIXME: this state can be merged with (2), but the following assert - // is useful to check the correcteness of the sequence due to semantic - // change of bitcast. - assert( - SrcTy->isPtrOrPtrVectorTy() && - MidTy->isPtrOrPtrVectorTy() && - DstTy->isPtrOrPtrVectorTy() && - SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() && - MidTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() && - "Illegal bitcast, addrspacecast sequence!"); - // Allowed, use second cast's opcode - return secondOp; + // bitcast, addrspacecast -> addrspacecast if the element type of + // bitcast's source is the same as that of addrspacecast's destination. + if (SrcTy->getPointerElementType() == DstTy->getPointerElementType()) + return Instruction::AddrSpaceCast; + return 0; + case 15: // FIXME: this state can be merged with (1), but the following assert // is useful to check the correcteness of the sequence due to semantic @@ -3610,6 +3606,7 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::clone_impl() const { getSuccessOrdering(), getFailureOrdering(), getSynchScope()); Result->setVolatile(isVolatile()); + Result->setWeak(isWeak()); return Result; } diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index 5f94dca1eb96..de825f00b207 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -142,14 +142,26 @@ void LLVMContext::diagnose(const DiagnosticInfo &DI) { return; } - // Optimization remarks are selective. They need to check whether - // the regexp pattern, passed via -pass-remarks, matches the name - // of the pass that is emitting the diagnostic. If there is no match, - // ignore the diagnostic and return. - if (DI.getKind() == llvm::DK_OptimizationRemark && - !pImpl->optimizationRemarksEnabledFor( - cast(DI).getPassName())) - return; + // Optimization remarks are selective. They need to check whether the regexp + // pattern, passed via one of the -pass-remarks* flags, matches the name of + // the pass that is emitting the diagnostic. If there is no match, ignore the + // diagnostic and return. + switch (DI.getKind()) { + case llvm::DK_OptimizationRemark: + if (!cast(DI).isEnabled()) + return; + break; + case llvm::DK_OptimizationRemarkMissed: + if (!cast(DI).isEnabled()) + return; + break; + case llvm::DK_OptimizationRemarkAnalysis: + if (!cast(DI).isEnabled()) + return; + break; + default: + break; + } // Otherwise, print the message with a prefix based on the severity. 
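// --- Illustrative sketch (not taken from this patch) of the cmpxchg change in
// the Instructions.cpp hunk above: the instruction now produces a
// { ValueType, i1 } pair (loaded value plus success bit) and carries a 'weak'
// flag that clone_impl() preserves. The IRBuilder calls below assume the
// interface of this period, which takes both success and failure orderings;
// the helper name is hypothetical.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *emitStrongCmpXchg(IRBuilder<> &B, Value *Ptr, Value *Cmp,
                                Value *New) {
  AtomicCmpXchgInst *CX = B.CreateAtomicCmpXchg(
      Ptr, Cmp, New, SequentiallyConsistent, SequentiallyConsistent);
  CX->setWeak(false); // strong form: may not fail spuriously
  Value *Loaded = B.CreateExtractValue(CX, 0);  // previous memory value
  Value *Success = B.CreateExtractValue(CX, 1); // i1 success flag
  (void)Success;
  return Loaded;
}
// --- end sketch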
std::string MsgStorage; @@ -177,13 +189,6 @@ void LLVMContext::emitError(unsigned LocCookie, const Twine &ErrorStr) { diagnose(DiagnosticInfoInlineAsm(LocCookie, ErrorStr)); } -void LLVMContext::emitOptimizationRemark(const char *PassName, - const Function &Fn, - const DebugLoc &DLoc, - const Twine &Msg) { - diagnose(DiagnosticInfoOptimizationRemark(PassName, Fn, DLoc, Msg)); -} - //===----------------------------------------------------------------------===// // Metadata Kind Uniquing //===----------------------------------------------------------------------===// diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp index 2042374647dd..4c2791f0a8d5 100644 --- a/lib/IR/LLVMContextImpl.cpp +++ b/lib/IR/LLVMContextImpl.cpp @@ -14,9 +14,8 @@ #include "LLVMContextImpl.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Regex.h" #include using namespace llvm; @@ -46,55 +45,11 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) NamedStructTypesUniqueID = 0; } -namespace { - -/// \brief Regular expression corresponding to the value given in the -/// command line flag -pass-remarks. Passes whose name matches this -/// regexp will emit a diagnostic when calling -/// LLVMContext::emitOptimizationRemark. -static Regex *OptimizationRemarkPattern = nullptr; - -struct PassRemarksOpt { - void operator=(const std::string &Val) const { - // Create a regexp object to match pass names for emitOptimizationRemark. - if (!Val.empty()) { - delete OptimizationRemarkPattern; - OptimizationRemarkPattern = new Regex(Val); - std::string RegexError; - if (!OptimizationRemarkPattern->isValid(RegexError)) - report_fatal_error("Invalid regular expression '" + Val + - "' in -pass-remarks: " + RegexError, - false); - } - }; -}; - -static PassRemarksOpt PassRemarksOptLoc; - -// -pass-remarks -// Command line flag to enable LLVMContext::emitOptimizationRemark() -// and LLVMContext::emitOptimizationNote() calls. -static cl::opt> -PassRemarks("pass-remarks", cl::value_desc("pattern"), - cl::desc("Enable optimization remarks from passes whose name match " - "the given regular expression"), - cl::Hidden, cl::location(PassRemarksOptLoc), cl::ValueRequired, - cl::ZeroOrMore); -} - -bool -LLVMContextImpl::optimizationRemarksEnabledFor(const char *PassName) const { - return OptimizationRemarkPattern && - OptimizationRemarkPattern->match(PassName); -} - - namespace { struct DropReferences { // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second' // is a Constant*. - template - void operator()(const PairT &P) { + template void operator()(const PairT &P) { P.second->dropAllReferences(); } }; diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index b1ad9ff4a5a8..808c239bff5e 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -37,6 +37,9 @@ namespace llvm { class ConstantInt; class ConstantFP; +class DiagnosticInfoOptimizationRemark; +class DiagnosticInfoOptimizationRemarkMissed; +class DiagnosticInfoOptimizationRemarkAnalysis; class LLVMContext; class Type; class Value; @@ -371,10 +374,6 @@ class LLVMContextImpl { typedef DenseMap PrefixDataMapTy; PrefixDataMapTy PrefixDataMap; - /// \brief Return true if the given pass name should emit optimization - /// remarks. 
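// --- Illustrative sketch (not taken from this patch) of the call pattern used
// by the llvm::error_code -> std::error_code migration visible across these
// hunks (GCOV.cpp above, Module.cpp, IRReader.cpp and the LTO files below):
// a default-constructed std::error_code replaces error_code::success(). The
// helper name is hypothetical.
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <string>
#include <system_error>
using namespace llvm;

static std::error_code readFileOrReportError(const std::string &Filename,
                                              std::unique_ptr<MemoryBuffer> &Buf) {
  if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Filename, Buf)) {
    errs() << Filename << ": " << EC.message() << "\n";
    return EC;
  }
  return std::error_code(); // success
}
// --- end sketch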
- bool optimizationRemarksEnabledFor(const char *PassName) const; - int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx); int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx); diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index d3f3482dc024..d14f139db185 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -607,8 +607,7 @@ void PMTopLevelManager::schedulePass(Pass *P) { // If P is an analysis pass and it is available then do not // generate the analysis again. Stale analysis info should not be // available at this point. - const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo(P->getPassID()); + const PassInfo *PI = P->getPassInfo(); if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) { delete P; return; @@ -723,8 +722,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { return *I; // If Pass not found then check the interfaces implemented by Immutable Pass - const PassInfo *PassInf = - PassRegistry::getPassRegistry()->getPassInfo(PI); + const PassInfo *PassInf = (*I)->getPassInfo(); assert(PassInf && "Expected all immutable passes to be initialized"); const std::vector &ImmPI = PassInf->getInterfacesImplemented(); @@ -766,8 +764,7 @@ void PMTopLevelManager::dumpArguments() const { dbgs() << "Pass Arguments: "; for (SmallVectorImpl::const_iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) - if (const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) { + if (const PassInfo *PI = (*I)->getPassInfo()) { assert(PI && "Expected all immutable passes to be initialized"); if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); @@ -831,8 +828,8 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) { // This pass is the current implementation of all of the interfaces it // implements as well. - const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI); - if (!PInf) return; + const PassInfo *PInf = P->getPassInfo(); + if (PInf == 0) return; const std::vector &II = PInf->getInterfacesImplemented(); for (unsigned i = 0, e = II.size(); i != e; ++i) AvailableAnalysis[II[i]->getTypeInfo()] = P; @@ -963,10 +960,9 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, P->releaseMemory(); } - AnalysisID PI = P->getPassID(); - if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) { + if (const PassInfo *PInf = P->getPassInfo()) { // Remove the pass itself (if it is not already removed). - AvailableAnalysis.erase(PI); + AvailableAnalysis.erase(P->getPassID()); // Remove all interfaces this pass implements, for which it is also // listed as the available implementation. 
@@ -1148,8 +1144,7 @@ void PMDataManager::dumpPassArguments() const { if (PMDataManager *PMD = (*I)->getAsPMDataManager()) PMD->dumpPassArguments(); else - if (const PassInfo *PI = - PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) + if (const PassInfo *PI = (*I)->getPassInfo()) if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); } diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 5dbed69ca6cf..6a5b386c199f 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -381,7 +381,7 @@ bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) { if (!Materializer) return false; - error_code EC = Materializer->Materialize(GV); + std::error_code EC = Materializer->Materialize(GV); if (!EC) return false; if (ErrInfo) @@ -394,18 +394,18 @@ void Module::Dematerialize(GlobalValue *GV) { return Materializer->Dematerialize(GV); } -error_code Module::materializeAll() { +std::error_code Module::materializeAll() { if (!Materializer) - return error_code::success(); + return std::error_code(); return Materializer->MaterializeModule(this); } -error_code Module::materializeAllPermanently() { - if (error_code EC = materializeAll()) +std::error_code Module::materializeAllPermanently() { + if (std::error_code EC = materializeAll()) return EC; Materializer.reset(); - return error_code::success(); + return std::error_code(); } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp index bb55d2af7cf8..91d86ae40d18 100644 --- a/lib/IR/Pass.cpp +++ b/lib/IR/Pass.cpp @@ -199,14 +199,6 @@ Pass *Pass::createPass(AnalysisID ID) { return PI->createPass(); } -Pass *PassInfo::createPass() const { - assert((!isAnalysisGroup() || NormalCtor) && - "No default implementation found for analysis group!"); - assert(NormalCtor && - "Cannot call createPass on PassInfo without default ctor!"); - return NormalCtor(); -} - //===----------------------------------------------------------------------===// // Analysis Group Implementation Code //===----------------------------------------------------------------------===// @@ -224,17 +216,6 @@ RegisterAGBase::RegisterAGBase(const char *Name, const void *InterfaceID, // PassRegistrationListener implementation // -// PassRegistrationListener ctor - Add the current object to the list of -// PassRegistrationListeners... -PassRegistrationListener::PassRegistrationListener() { - PassRegistry::getPassRegistry()->addRegistrationListener(this); -} - -// dtor - Remove object from list of listeners... -PassRegistrationListener::~PassRegistrationListener() { - PassRegistry::getPassRegistry()->removeRegistrationListener(this); -} - // enumeratePasses - Iterate over the registered passes, calling the // passEnumerate callback on each PassInfo object. // @@ -242,7 +223,16 @@ void PassRegistrationListener::enumeratePasses() { PassRegistry::getPassRegistry()->enumerateWith(this); } -PassNameParser::~PassNameParser() {} +PassNameParser::PassNameParser() + : Opt(nullptr) { + PassRegistry::getPassRegistry()->addRegistrationListener(this); +} + +PassNameParser::~PassNameParser() { + // This only gets called during static destruction, in which case the + // PassRegistry will have already been destroyed by llvm_shutdown(). So + // attempting to remove the registration listener is an error. 
+} //===----------------------------------------------------------------------===// // AnalysisUsage Class Implementation diff --git a/lib/IR/PassRegistry.cpp b/lib/IR/PassRegistry.cpp index 6a5bee2f57b1..91940a9c7f07 100644 --- a/lib/IR/PassRegistry.cpp +++ b/lib/IR/PassRegistry.cpp @@ -13,14 +13,10 @@ //===----------------------------------------------------------------------===// #include "llvm/PassRegistry.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" #include "llvm/IR/Function.h" #include "llvm/PassSupport.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/Mutex.h" #include "llvm/Support/RWMutex.h" #include @@ -36,62 +32,23 @@ PassRegistry *PassRegistry::getPassRegistry() { return &*PassRegistryObj; } -static ManagedStatic > Lock; - -//===----------------------------------------------------------------------===// -// PassRegistryImpl -// - -namespace { -struct PassRegistryImpl { - /// PassInfoMap - Keep track of the PassInfo object for each registered pass. - typedef DenseMap MapType; - MapType PassInfoMap; - - typedef StringMap StringMapType; - StringMapType PassInfoStringMap; - - /// AnalysisGroupInfo - Keep track of information for each analysis group. - struct AnalysisGroupInfo { - SmallPtrSet Implementations; - }; - DenseMap AnalysisGroupInfoMap; - - std::vector> ToFree; - std::vector Listeners; -}; -} // end anonymous namespace - -void *PassRegistry::getImpl() const { - if (!pImpl) - pImpl = new PassRegistryImpl(); - return pImpl; -} - //===----------------------------------------------------------------------===// // Accessors // PassRegistry::~PassRegistry() { - sys::SmartScopedWriter Guard(*Lock); - PassRegistryImpl *Impl = static_cast(pImpl); - delete Impl; - pImpl = nullptr; } const PassInfo *PassRegistry::getPassInfo(const void *TI) const { - sys::SmartScopedReader Guard(*Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); - PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.find(TI); - return I != Impl->PassInfoMap.end() ? I->second : nullptr; + sys::SmartScopedReader Guard(Lock); + MapType::const_iterator I = PassInfoMap.find(TI); + return I != PassInfoMap.end() ? I->second : nullptr; } const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const { - sys::SmartScopedReader Guard(*Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); - PassRegistryImpl::StringMapType::const_iterator - I = Impl->PassInfoStringMap.find(Arg); - return I != Impl->PassInfoStringMap.end() ? I->second : nullptr; + sys::SmartScopedReader Guard(Lock); + StringMapType::const_iterator I = PassInfoStringMap.find(Arg); + return I != PassInfoStringMap.end() ? I->second : nullptr; } //===----------------------------------------------------------------------===// @@ -99,39 +56,34 @@ const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const { // void PassRegistry::registerPass(const PassInfo &PI, bool ShouldFree) { - sys::SmartScopedWriter Guard(*Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); + sys::SmartScopedWriter Guard(Lock); bool Inserted = - Impl->PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second; + PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second; assert(Inserted && "Pass registered multiple times!"); (void)Inserted; - Impl->PassInfoStringMap[PI.getPassArgument()] = &PI; + PassInfoStringMap[PI.getPassArgument()] = &PI; // Notify any listeners. 
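// --- Illustrative sketch (not taken from this patch) of how opt-like drivers
// typically wire up PassNameParser, whose constructor above now registers the
// parser as a PassRegistrationListener (and whose destructor deliberately does
// not unregister it). The header path is assumed for this era.
#include "llvm/IR/LegacyPassNameParser.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Every pass registered with the PassRegistry shows up as a -<pass-arg> option.
static cl::list<const PassInfo *, bool, PassNameParser>
    PassList(cl::desc("Optimizations available:"));
// --- end sketch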
for (std::vector::iterator - I = Impl->Listeners.begin(), E = Impl->Listeners.end(); I != E; ++I) + I = Listeners.begin(), E = Listeners.end(); I != E; ++I) (*I)->passRegistered(&PI); - if (ShouldFree) Impl->ToFree.push_back(std::unique_ptr(&PI)); + if (ShouldFree) ToFree.push_back(std::unique_ptr(&PI)); } void PassRegistry::unregisterPass(const PassInfo &PI) { - sys::SmartScopedWriter Guard(*Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); - PassRegistryImpl::MapType::iterator I = - Impl->PassInfoMap.find(PI.getTypeInfo()); - assert(I != Impl->PassInfoMap.end() && "Pass registered but not in map!"); + sys::SmartScopedWriter Guard(Lock); + MapType::iterator I = PassInfoMap.find(PI.getTypeInfo()); + assert(I != PassInfoMap.end() && "Pass registered but not in map!"); // Remove pass from the map. - Impl->PassInfoMap.erase(I); - Impl->PassInfoStringMap.erase(PI.getPassArgument()); + PassInfoMap.erase(I); + PassInfoStringMap.erase(PI.getPassArgument()); } void PassRegistry::enumerateWith(PassRegistrationListener *L) { - sys::SmartScopedReader Guard(*Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); - for (PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.begin(), - E = Impl->PassInfoMap.end(); I != E; ++I) + sys::SmartScopedReader Guard(Lock); + for (auto I = PassInfoMap.begin(), E = PassInfoMap.end(); I != E; ++I) L->passEnumerate(I->second); } @@ -156,15 +108,13 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID, assert(ImplementationInfo && "Must register pass before adding to AnalysisGroup!"); - sys::SmartScopedWriter Guard(*Lock); + sys::SmartScopedWriter Guard(Lock); // Make sure we keep track of the fact that the implementation implements // the interface. ImplementationInfo->addInterfaceImplemented(InterfaceInfo); - PassRegistryImpl *Impl = static_cast(getImpl()); - PassRegistryImpl::AnalysisGroupInfo &AGI = - Impl->AnalysisGroupInfoMap[InterfaceInfo]; + AnalysisGroupInfo &AGI = AnalysisGroupInfoMap[InterfaceInfo]; assert(AGI.Implementations.count(ImplementationInfo) == 0 && "Cannot add a pass to the same analysis group more than once!"); AGI.Implementations.insert(ImplementationInfo); @@ -179,30 +129,18 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID, } } - PassRegistryImpl *Impl = static_cast(getImpl()); if (ShouldFree) - Impl->ToFree.push_back(std::unique_ptr(&Registeree)); + ToFree.push_back(std::unique_ptr(&Registeree)); } void PassRegistry::addRegistrationListener(PassRegistrationListener *L) { - sys::SmartScopedWriter Guard(*Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); - Impl->Listeners.push_back(L); + sys::SmartScopedWriter Guard(Lock); + Listeners.push_back(L); } void PassRegistry::removeRegistrationListener(PassRegistrationListener *L) { - sys::SmartScopedWriter Guard(*Lock); - - // NOTE: This is necessary, because removeRegistrationListener() can be called - // as part of the llvm_shutdown sequence. Since we have no control over the - // order of that sequence, we need to gracefully handle the case where the - // PassRegistry is destructed before the object that triggers this call. 
- if (!pImpl) return; + sys::SmartScopedWriter Guard(Lock); - PassRegistryImpl *Impl = static_cast(getImpl()); - std::vector::iterator I = - std::find(Impl->Listeners.begin(), Impl->Listeners.end(), L); - assert(I != Impl->Listeners.end() && - "PassRegistrationListener not registered!"); - Impl->Listeners.erase(I); + auto I = std::find(Listeners.begin(), Listeners.end(), L); + Listeners.erase(I); } diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index d734e4ea8183..463024a28c76 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -38,13 +38,12 @@ using namespace llvm; static inline Type *checkType(Type *Ty) { assert(Ty && "Value defined with a null type: Error!"); - return const_cast(Ty); + return Ty; } Value::Value(Type *ty, unsigned scid) - : SubclassID(scid), HasValueHandle(0), - SubclassOptionalData(0), SubclassData(0), VTy((Type*)checkType(ty)), - UseList(nullptr), Name(nullptr) { + : VTy(checkType(ty)), UseList(nullptr), Name(nullptr), SubclassID(scid), + HasValueHandle(0), SubclassOptionalData(0), SubclassData(0) { // FIXME: Why isn't this in the subclass gunk?? // Note, we cannot call isa before the CallInst has been // constructed. @@ -214,7 +213,7 @@ void Value::setName(const Twine &NewName) { // then reallocated. // Create the new name. - Name = ValueName::Create(NameRef.begin(), NameRef.end()); + Name = ValueName::Create(NameRef); Name->setValue(this); return; } @@ -301,27 +300,6 @@ void Value::takeName(Value *V) { ST->reinsertValue(this); } -static GlobalObject &findReplacementForAliasUse(Value &C) { - if (auto *GO = dyn_cast(&C)) - return *GO; - if (auto *GA = dyn_cast(&C)) - return *GA->getAliasee(); - auto *CE = cast(&C); - assert(CE->getOpcode() == Instruction::BitCast || - CE->getOpcode() == Instruction::GetElementPtr || - CE->getOpcode() == Instruction::AddrSpaceCast); - if (CE->getOpcode() == Instruction::GetElementPtr) - assert(cast(CE)->hasAllZeroIndices()); - return findReplacementForAliasUse(*CE->getOperand(0)); -} - -static void replaceAliasUseWith(Use &U, Value *New) { - GlobalObject &Replacement = findReplacementForAliasUse(*New); - assert(&cast(*U) != &Replacement && - "replaceAliasUseWith cannot form an alias cycle"); - U.set(&Replacement); -} - #ifndef NDEBUG static bool contains(SmallPtrSet &Cache, ConstantExpr *Expr, Constant *C) { @@ -373,10 +351,6 @@ void Value::replaceAllUsesWith(Value *New) { // Must handle Constants specially, we cannot call replaceUsesOfWith on a // constant because they are uniqued. 
if (auto *C = dyn_cast(U.getUser())) { - if (isa(C)) { - replaceAliasUseWith(U, New); - continue; - } if (!isa(C)) { C->replaceUsesOfWithOnConstant(this, New, &U); continue; diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 80e731ab152f..d1c7f7d25c39 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -241,6 +241,9 @@ class Verifier : public InstVisitor, VerifierSupport { void visitGlobalValue(const GlobalValue &GV); void visitGlobalVariable(const GlobalVariable &GV); void visitGlobalAlias(const GlobalAlias &GA); + void visitAliaseeSubExpr(const GlobalAlias &A, const Constant &C); + void visitAliaseeSubExpr(SmallPtrSet &Visited, + const GlobalAlias &A, const Constant &C); void visitNamedMDNode(const NamedMDNode &NMD); void visitMDNode(MDNode &MD, Function *F); void visitModuleIdents(const Module &M); @@ -474,43 +477,55 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { visitGlobalValue(GV); } +void Verifier::visitAliaseeSubExpr(const GlobalAlias &GA, const Constant &C) { + SmallPtrSet Visited; + Visited.insert(&GA); + visitAliaseeSubExpr(Visited, GA, C); +} + +void Verifier::visitAliaseeSubExpr(SmallPtrSet &Visited, + const GlobalAlias &GA, const Constant &C) { + if (const auto *GV = dyn_cast(&C)) { + Assert1(!GV->isDeclaration(), "Alias must point to a definition", &GA); + + if (const auto *GA2 = dyn_cast(GV)) { + Assert1(Visited.insert(GA2), "Aliases cannot form a cycle", &GA); + + Assert1(!GA2->mayBeOverridden(), "Alias cannot point to a weak alias", + &GA); + } else { + // Only continue verifying subexpressions of GlobalAliases. + // Do not recurse into global initializers. + return; + } + } + + if (const auto *CE = dyn_cast(&C)) + VerifyConstantExprBitcastType(CE); + + for (const Use &U : C.operands()) { + Value *V = &*U; + if (const auto *GA2 = dyn_cast(V)) + visitAliaseeSubExpr(Visited, GA, *GA2->getAliasee()); + else if (const auto *C2 = dyn_cast(V)) + visitAliaseeSubExpr(Visited, GA, *C2); + } +} + void Verifier::visitGlobalAlias(const GlobalAlias &GA) { Assert1(!GA.getName().empty(), "Alias name cannot be empty!", &GA); Assert1(GlobalAlias::isValidLinkage(GA.getLinkage()), "Alias should have external or external weak linkage!", &GA); - Assert1(GA.getAliasee(), - "Aliasee cannot be NULL!", &GA); - Assert1(!GA.hasUnnamedAddr(), "Alias cannot have unnamed_addr!", &GA); - const Constant *Aliasee = GA.getAliasee(); - const GlobalValue *GV = dyn_cast(Aliasee); + Assert1(Aliasee, "Aliasee cannot be NULL!", &GA); + Assert1(GA.getType() == Aliasee->getType(), + "Alias and aliasee types should match!", &GA); - if (!GV) { - const ConstantExpr *CE = dyn_cast(Aliasee); - if (CE && (CE->getOpcode() == Instruction::BitCast || - CE->getOpcode() == Instruction::AddrSpaceCast || - CE->getOpcode() == Instruction::GetElementPtr)) - GV = dyn_cast(CE->getOperand(0)); + Assert1(isa(Aliasee) || isa(Aliasee), + "Aliasee should be either GlobalValue or ConstantExpr", &GA); - Assert1(GV, "Aliasee should be either GlobalValue, bitcast or " - "addrspacecast of GlobalValue", - &GA); - - if (CE->getOpcode() == Instruction::BitCast) { - unsigned SrcAS = GV->getType()->getPointerAddressSpace(); - unsigned DstAS = CE->getType()->getPointerAddressSpace(); - - Assert1(SrcAS == DstAS, - "Alias bitcasts cannot be between different address spaces", - &GA); - } - } - Assert1(!GV->isDeclaration(), "Alias must point to a definition", &GA); - if (const GlobalAlias *GAAliasee = dyn_cast(GV)) { - Assert1(!GAAliasee->mayBeOverridden(), "Alias cannot point to a weak alias", - &GA); - } + 
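// --- Illustrative sketch (not taken from this patch) of the relaxed alias API
// that the verifier rules below check: an alias may now point at a
// ConstantExpr over a definition (for example a bitcast), as long as the alias
// and aliasee types match, the aliasee is not weak, and no cycle is formed.
// The create() overload is the one declared in the Globals.cpp hunk above;
// the helper name is hypothetical.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static GlobalAlias *makeAliasTo(GlobalVariable *GV, PointerType *AliasTy,
                                StringRef Name) {
  // The alias type must equal the type of the aliasee expression.
  Constant *Aliasee = ConstantExpr::getBitCast(GV, AliasTy);
  return GlobalAlias::create(AliasTy->getElementType(),
                             AliasTy->getAddressSpace(), GV->getLinkage(),
                             Name, Aliasee, GV->getParent());
}
// --- end sketch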
visitAliaseeSubExpr(GA, *Aliasee); visitGlobalValue(GA); } @@ -723,7 +738,8 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx, I->getKindAsEnum() == Attribute::Builtin || I->getKindAsEnum() == Attribute::NoBuiltin || I->getKindAsEnum() == Attribute::Cold || - I->getKindAsEnum() == Attribute::OptimizeNone) { + I->getKindAsEnum() == Attribute::OptimizeNone || + I->getKindAsEnum() == Attribute::JumpTable) { if (!isFunction) { CheckFailed("Attribute '" + I->getAsString() + "' only applies to functions!", V); @@ -897,6 +913,14 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs, Attribute::MinSize), "Attributes 'minsize and optnone' are incompatible!", V); } + + if (Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::JumpTable)) { + const GlobalValue *GV = cast(V); + Assert1(GV->hasUnnamedAddr(), + "Attribute 'jumptable' requires 'unnamed_addr'", V); + + } } void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) { @@ -2065,8 +2089,7 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) { Assert1(isa(PersonalityFn), "Personality function is not constant!", &LPI); for (unsigned i = 0, e = LPI.getNumClauses(); i < e; ++i) { - Value *Clause = LPI.getClause(i); - Assert1(isa(Clause), "Clause is not constant!", &LPI); + Constant *Clause = LPI.getClause(i); if (LPI.isCatch(i)) { Assert1(isa(Clause->getType()), "Catch operand does not have pointer type!", &LPI); diff --git a/lib/IR/module.modulemap b/lib/IR/module.modulemap new file mode 100644 index 000000000000..9698e9178349 --- /dev/null +++ b/lib/IR/module.modulemap @@ -0,0 +1 @@ +module IR { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp index f4ed43769388..01aa074abad6 100644 --- a/lib/IRReader/IRReader.cpp +++ b/lib/IRReader/IRReader.cpp @@ -18,7 +18,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" +#include using namespace llvm; @@ -36,7 +36,7 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, (const unsigned char *)Buffer->getBufferEnd())) { std::string ErrMsg; ErrorOr ModuleOrErr = getLazyBitcodeModule(Buffer, Context); - if (error_code EC = ModuleOrErr.getError()) { + if (std::error_code EC = ModuleOrErr.getError()) { Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error, EC.message()); // ParseBitcodeFile does not take ownership of the Buffer in the @@ -53,7 +53,7 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err, LLVMContext &Context) { std::unique_ptr File; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { + if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + ec.message()); return nullptr; @@ -70,7 +70,7 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, (const unsigned char *)Buffer->getBufferEnd())) { ErrorOr ModuleOrErr = parseBitcodeFile(Buffer, Context); Module *M = nullptr; - if (error_code EC = ModuleOrErr.getError()) + if (std::error_code EC = ModuleOrErr.getError()) Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error, EC.message()); else @@ -86,7 +86,7 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, Module *llvm::ParseIRFile(const std::string &Filename, 
SMDiagnostic &Err, LLVMContext &Context) { std::unique_ptr File; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { + if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + ec.message()); return nullptr; diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 028c19127170..9009958613ed 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -44,7 +44,6 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" @@ -52,6 +51,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/ObjCARC.h" +#include using namespace llvm; const char* LTOCodeGenerator::getVersionString() { @@ -208,7 +208,8 @@ bool LTOCodeGenerator::compile_to_file(const char** name, // make unique temp .o file to put generated object file SmallString<128> Filename; int FD; - error_code EC = sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename); + std::error_code EC = + sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename); if (EC) { errMsg = EC.message(); return false; @@ -252,7 +253,7 @@ const void* LTOCodeGenerator::compile(size_t* length, // read .o file into memory buffer std::unique_ptr BuffPtr; - if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) { + if (std::error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) { errMsg = ec.message(); sys::fs::remove(NativeObjectPath); return nullptr; @@ -312,7 +313,8 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { MCpu = "core2"; else if (Triple.getArch() == llvm::Triple::x86) MCpu = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) MCpu = "cyclone"; } diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index d73a7e342ae7..eac48e16e8a0 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -37,11 +37,11 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Support/system_error.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include using namespace llvm; LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t) @@ -98,7 +98,7 @@ bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) { LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options, std::string &errMsg) { std::unique_ptr buffer; - if (error_code ec = MemoryBuffer::getFile(path, buffer)) { + if (std::error_code ec = MemoryBuffer::getFile(path, buffer)) { errMsg = ec.message(); return nullptr; } @@ -117,7 +117,7 @@ LTOModule *LTOModule::makeLTOModule(int fd, const char *path, TargetOptions options, std::string &errMsg) { std::unique_ptr buffer; - if (error_code ec = + if (std::error_code ec = MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) { errMsg = ec.message(); return nullptr; @@ -140,7 +140,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, // parse bitcode buffer ErrorOr ModuleOrErr = 
getLazyBitcodeModule(buffer, getGlobalContext()); - if (error_code EC = ModuleOrErr.getError()) { + if (std::error_code EC = ModuleOrErr.getError()) { errMsg = EC.message(); delete buffer; return nullptr; @@ -168,7 +168,8 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, CPU = "core2"; else if (Triple.getArch() == llvm::Triple::x86) CPU = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) CPU = "cyclone"; } @@ -333,21 +334,22 @@ void LTOModule::addDefinedDataSymbol(const GlobalValue *v) { // from the ObjC data structures generated by the front end. // special case if this data blob is an ObjC class definition - if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) { + std::string Section = v->getSection(); + if (Section.compare(0, 15, "__OBJC,__class,") == 0) { if (const GlobalVariable *gv = dyn_cast(v)) { addObjCClass(gv); } } // special case if this data blob is an ObjC category definition - else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) { + else if (Section.compare(0, 18, "__OBJC,__category,") == 0) { if (const GlobalVariable *gv = dyn_cast(v)) { addObjCCategory(gv); } } // special case if this data blob is the list of referenced classes - else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) { + else if (Section.compare(0, 18, "__OBJC,__cls_refs,") == 0) { if (const GlobalVariable *gv = dyn_cast(v)) { addObjCClassRef(gv); } @@ -800,14 +802,8 @@ bool LTOModule::parseSymbols(std::string &errMsg) { return true; // add aliases - for (Module::alias_iterator a = _module->alias_begin(), - e = _module->alias_end(); a != e; ++a) { - if (isDeclaration(*a->getAliasee())) - // Is an alias to a declaration. - addPotentialUndefinedSymbol(a, false); - else - addDefinedDataSymbol(a); - } + for (const auto &Alias : _module->aliases()) + addDefinedDataSymbol(&Alias); // make symbols for all undefines for (StringMap::iterator u =_undefines.begin(), diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 45f2d4e03a19..688b13f228d0 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -389,8 +389,6 @@ namespace { /// actually need, but this allows us to reuse the ValueMapper code. ValueToValueMapTy ValueMap; - std::vector> ReplaceWithAlias; - struct AppendingVarInfo { GlobalVariable *NewGV; // New aggregate global in dest module. Constant *DstInit; // Old initializer from dest module. @@ -723,7 +721,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, return emitError( "Appending variables with different unnamed_addr need to be linked!"); - if (DstGV->getSection() != SrcGV->getSection()) + if (StringRef(DstGV->getSection()) != SrcGV->getSection()) return emitError( "Appending variables with different section name need to be linked!"); @@ -895,6 +893,7 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { GlobalValue *DGV = getLinkedToGlobal(SGA); llvm::Optional NewVisibility; + bool HasUnnamedAddr = SGA->hasUnnamedAddr(); if (DGV) { GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage; @@ -903,11 +902,13 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc)) return true; NewVisibility = NV; + HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); if (!LinkFromSrc) { // Set calculated linkage. 
DGV->setLinkage(NewLinkage); DGV->setVisibility(*NewVisibility); + DGV->setUnnamedAddr(HasUnnamedAddr); // Make sure to remember this mapping. ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType())); @@ -928,9 +929,13 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { copyGVAttributes(NewDA, SGA); if (NewVisibility) NewDA->setVisibility(*NewVisibility); + NewDA->setUnnamedAddr(HasUnnamedAddr); - if (DGV) - ReplaceWithAlias.push_back(std::make_pair(DGV, NewDA)); + if (DGV) { + // Any uses of DGV need to change to NewDA, with cast. + DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType())); + DGV->eraseFromParent(); + } ValueMap[SGA] = NewDA; return false; @@ -1016,19 +1021,6 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { } -static GlobalObject &getGlobalObjectInExpr(Constant &C) { - auto *GO = dyn_cast(&C); - if (GO) - return *GO; - auto *GA = dyn_cast(&C); - if (GA) - return *GA->getAliasee(); - auto &CE = cast(C); - assert(CE.getOpcode() == Instruction::BitCast || - CE.getOpcode() == Instruction::AddrSpaceCast); - return getGlobalObjectInExpr(*CE.getOperand(0)); -} - /// linkAliasBodies - Insert all of the aliases in Src into the Dest module. void ModuleLinker::linkAliasBodies() { for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end(); @@ -1039,25 +1031,9 @@ void ModuleLinker::linkAliasBodies() { GlobalAlias *DA = cast(ValueMap[I]); Constant *Val = MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer); - DA->setAliasee(&getGlobalObjectInExpr(*Val)); + DA->setAliasee(Val); } } - - // Any uses of DGV need to change to NewDA, with cast. - for (auto &Pair : ReplaceWithAlias) { - GlobalValue *DGV = Pair.first; - GlobalAlias *NewDA = Pair.second; - - for (auto *User : DGV->users()) { - if (auto *GA = dyn_cast(User)) { - if (GA == NewDA) - report_fatal_error("Linking these modules creates an alias cycle."); - } - } - - DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType())); - DGV->eraseFromParent(); - } } /// linkNamedMDNodes - Insert all of the named MDNodes in Src into the Dest @@ -1165,7 +1141,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // Perform the merge for standard behavior types. switch (SrcBehaviorValue) { case Module::Require: - case Module::Override: assert(0 && "not possible"); break; + case Module::Override: llvm_unreachable("not possible"); case Module::Error: { // Emit an error if the values differ. if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index 6a384c1a8e1c..78bd8c4ba144 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMMC + ConstantPools.cpp ELFObjectWriter.cpp MCAsmBackend.cpp MCAsmInfo.cpp diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp new file mode 100644 index 000000000000..f979dad47da5 --- /dev/null +++ b/lib/MC/ConstantPools.cpp @@ -0,0 +1,95 @@ +//===- ConstantPools.cpp - ConstantPool class --*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ConstantPool and AssemblerConstantPools classes. 
+// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/MapVector.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/ConstantPools.h" + +using namespace llvm; +// +// ConstantPool implementation +// +// Emit the contents of the constant pool using the provided streamer. +void ConstantPool::emitEntries(MCStreamer &Streamer) { + if (Entries.empty()) + return; + Streamer.EmitCodeAlignment(4); // align to 4-byte address + Streamer.EmitDataRegion(MCDR_DataRegion); + for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end(); + I != E; ++I) { + Streamer.EmitLabel(I->first); + Streamer.EmitValue(I->second, 4); + } + Streamer.EmitDataRegion(MCDR_DataRegionEnd); + Entries.clear(); +} + +const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) { + MCSymbol *CPEntryLabel = Context.CreateTempSymbol(); + + Entries.push_back(std::make_pair(CPEntryLabel, Value)); + return MCSymbolRefExpr::Create(CPEntryLabel, Context); +} + +bool ConstantPool::empty() { return Entries.empty(); } + +// +// AssemblerConstantPools implementation +// +ConstantPool * +AssemblerConstantPools::getConstantPool(const MCSection *Section) { + ConstantPoolMapTy::iterator CP = ConstantPools.find(Section); + if (CP == ConstantPools.end()) + return nullptr; + + return &CP->second; +} + +ConstantPool & +AssemblerConstantPools::getOrCreateConstantPool(const MCSection *Section) { + return ConstantPools[Section]; +} + +static void emitConstantPool(MCStreamer &Streamer, const MCSection *Section, + ConstantPool &CP) { + if (!CP.empty()) { + Streamer.SwitchSection(Section); + CP.emitEntries(Streamer); + } +} + +void AssemblerConstantPools::emitAll(MCStreamer &Streamer) { + // Dump contents of assembler constant pools. 
+ for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(), + CPE = ConstantPools.end(); + CPI != CPE; ++CPI) { + const MCSection *Section = CPI->first; + ConstantPool &CP = CPI->second; + + emitConstantPool(Streamer, Section, CP); + } +} + +void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) { + const MCSection *Section = Streamer.getCurrentSection().first; + if (ConstantPool *CP = getConstantPool(Section)) { + emitConstantPool(Streamer, Section, *CP); + } +} + +const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer, + const MCExpr *Expr) { + const MCSection *Section = Streamer.getCurrentSection().first; + return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext()); +} diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 0a54627f8cd0..a98a13e0cd42 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -1179,7 +1179,7 @@ prependCompressionHeader(uint64_t Size, if (Size <= Magic.size() + sizeof(Size) + CompressedContents.size()) return false; if (sys::IsLittleEndianHost) - Size = sys::SwapByteOrder(Size); + sys::swapByteOrder(Size); CompressedContents.insert(CompressedContents.begin(), Magic.size() + sizeof(Size), 0); std::copy(Magic.begin(), Magic.end(), CompressedContents.begin()); diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index c0777a6e98cc..29105834499d 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -82,6 +82,7 @@ MCAsmInfo::MCAsmInfo() { HasLEB128 = false; SupportsDebugInformation = false; ExceptionsType = ExceptionHandling::None; + WinEHEncodingType = WinEH::EncodingType::ET_Invalid; DwarfUsesRelocationsAcrossSections = true; DwarfFDESymbolsUseAbsDiff = false; DwarfRegNumForCFI = false; @@ -99,7 +100,7 @@ MCAsmInfo::MCAsmInfo() { // - MCAsmInfoDarwin is handling this case // - Generic_GCC toolchains enable the integrated assembler on a per // architecture basis. 
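// --- Illustrative sketch (not taken from this patch) of how a target assembly
// streamer is expected to drive the AssemblerConstantPools helper added in
// lib/MC/ConstantPools.cpp above: addEntry() hands back a label reference to
// substitute for the literal, and ".ltorg" / end of assembly flush the pools.
// The surrounding streamer type is hypothetical; the ConstantPools calls use
// the signatures defined above.
#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
using namespace llvm;

struct HypotheticalTargetStreamer {
  AssemblerConstantPools ConstantPools;

  // For "ldr rX, =expr": stash the expression, load from the pool slot label.
  const MCExpr *addConstantPoolEntry(MCStreamer &Streamer, const MCExpr *Expr) {
    return ConstantPools.addEntry(Streamer, Expr);
  }

  // For ".ltorg": dump the pool belonging to the section being assembled now.
  void emitCurrentConstantPool(MCStreamer &Streamer) {
    ConstantPools.emitForCurrentSection(Streamer);
  }

  // At the end of assembly: dump whatever is still pending, per section.
  void finish(MCStreamer &Streamer) { ConstantPools.emitAll(Streamer); }
};
// --- end sketch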
- // - The target subclasses for AArch64, ARM, and X86 handle these cases + // - The target subclasses for AArch64, ARM, and X86 handle these cases UseIntegratedAssembler = false; CompressDebugSections = false; diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 7f8ae54949e0..bccf5b9cd287 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -1134,17 +1134,14 @@ void MCAsmStreamer::EmitWin64EHHandlerData() { void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) { MCStreamer::EmitWin64EHPushReg(Register); - OS << "\t.seh_pushreg "; - EmitRegisterName(Register); + OS << "\t.seh_pushreg " << Register; EmitEOL(); } void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) { MCStreamer::EmitWin64EHSetFrame(Register, Offset); - OS << "\t.seh_setframe "; - EmitRegisterName(Register); - OS << ", " << Offset; + OS << "\t.seh_setframe " << Register << ", " << Offset; EmitEOL(); } @@ -1158,18 +1155,14 @@ void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) { void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) { MCStreamer::EmitWin64EHSaveReg(Register, Offset); - OS << "\t.seh_savereg "; - EmitRegisterName(Register); - OS << ", " << Offset; + OS << "\t.seh_savereg " << Register << ", " << Offset; EmitEOL(); } void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) { MCStreamer::EmitWin64EHSaveXMM(Register, Offset); - OS << "\t.seh_savexmm "; - EmitRegisterName(Register); - OS << ", " << Offset; + OS << "\t.seh_savexmm " << Register << ", " << Offset; EmitEOL(); } diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index c16326829d8a..cc98be6ea63f 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -39,7 +39,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, AllowTemporaryLabels(true), DwarfCompileUnitID(0), AutoReset(DoAutoReset) { - error_code EC = llvm::sys::fs::current_path(CompilationDir); + std::error_code EC = llvm::sys::fs::current_path(CompilationDir); if (EC) CompilationDir.clear(); @@ -277,10 +277,11 @@ const MCSectionELF *MCContext::CreateELFGroupSection() { return Result; } -const MCSectionCOFF * -MCContext::getCOFFSection(StringRef Section, unsigned Characteristics, - SectionKind Kind, StringRef COMDATSymName, - int Selection, const MCSectionCOFF *Assoc) { +const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, + unsigned Characteristics, + SectionKind Kind, + StringRef COMDATSymName, + int Selection) { // Do the lookup, if we have a hit, return it. 
SectionGroupPair P(Section, COMDATSymName); @@ -294,8 +295,8 @@ MCContext::getCOFFSection(StringRef Section, unsigned Characteristics, COMDATSymbol = GetOrCreateSymbol(COMDATSymName); StringRef CachedName = Iter->first.first; - MCSectionCOFF *Result = new (*this) MCSectionCOFF( - CachedName, Characteristics, COMDATSymbol, Selection, Assoc, Kind); + MCSectionCOFF *Result = new (*this) + MCSectionCOFF(CachedName, Characteristics, COMDATSymbol, Selection, Kind); Iter->second = Result; return Result; diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index ff5009f0a203..f724716eacd4 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -271,6 +271,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_Mips_GOT_LO16: return "GOT_LO16"; case VK_Mips_CALL_HI16: return "CALL_HI16"; case VK_Mips_CALL_LO16: return "CALL_LO16"; + case VK_Mips_PCREL_HI16: return "PCREL_HI16"; + case VK_Mips_PCREL_LO16: return "PCREL_LO16"; case VK_COFF_IMGREL32: return "IMGREL32"; } llvm_unreachable("Invalid variant kind"); diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCModuleYAML.cpp index f81cb149d83a..f6b7431eb3bf 100644 --- a/lib/MC/MCModuleYAML.cpp +++ b/lib/MC/MCModuleYAML.cpp @@ -453,7 +453,7 @@ StringRef yaml2mcmodule(std::unique_ptr &MCM, StringRef YamlContent, InstrRegInfoHolder IRI(MII, MRI); yaml::Input YIn(YamlContent, (void *)&IRI); YIn >> YAMLModule; - if (error_code ec = YIn.error()) + if (std::error_code ec = YIn.error()) return ec.message(); StringRef err = Parser.parse(YAMLModule); if (!err.empty()) diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index e4bcfa257f69..d12f60c09920 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -23,7 +23,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { IsFunctionEHFrameSymbolPrivate = false; SupportsWeakOmittedEHFrame = false; - if (T.isOSDarwin() && T.getArch() == Triple::arm64) + if (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)) SupportsCompactUnwindWithoutEHFrame = true; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel @@ -151,7 +152,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { COFFDebugSymbolsSection = nullptr; if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) || - (T.isOSDarwin() && T.getArch() == Triple::arm64)) { + (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))) { CompactUnwindSection = Ctx->getMachOSection("__LD", "__compact_unwind", MachO::S_ATTR_DEBUG, @@ -159,7 +161,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; - else if (T.getArch() == Triple::arm64) + else if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64) CompactUnwindDwarfEHFrameOnly = 0x03000000; } @@ -319,6 +321,13 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { TTypeEncoding = dwarf::DW_EH_PE_absptr; } break; + case Triple::mips: + case Triple::mipsel: + // MIPS uses indirect pointer to refer personality functions, so that the + // eh_frame section can be read-only. DW.ref.personality will be generated + // for relocation. 
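Returning to the MCContext::getCOFFSection change above: with the associated-section pointer dropped from the signature, an associative COMDAT section is requested by name, characteristics, COMDAT symbol name and selection kind alone, and the association is resolved later through that symbol. A hedged sketch of the new call shape; the section and symbol names are made up for illustration.

    #include "llvm/MC/MCContext.h"
    #include "llvm/MC/MCSectionCOFF.h"
    #include "llvm/MC/SectionKind.h"
    #include "llvm/Support/COFF.h"

    using namespace llvm;

    // Request ".xdata$foo" as a COMDAT section associated with the COMDAT
    // symbol "foo"; no MCSectionCOFF* for the associated section is passed.
    static const MCSectionCOFF *makeAssociativeXData(MCContext &Ctx) {
      return Ctx.getCOFFSection(".xdata$foo",
                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                                    COFF::IMAGE_SCN_MEM_READ,
                                SectionKind::getReadOnly(), "foo",
                                COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE);
    }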
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect; + break; case Triple::ppc64: case Triple::ppc64le: PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | @@ -537,9 +546,6 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { DwarfInfoDWOSection = Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0, SectionKind::getMetadata()); - DwarfTypesDWOSection = - Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0, - SectionKind::getMetadata()); DwarfAbbrevDWOSection = Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0, SectionKind::getMetadata()); @@ -563,6 +569,8 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { + bool IsWoA = T.getArch() == Triple::arm || T.getArch() == Triple::thumb; + // The object file format cannot represent common symbols with explicit // alignments. CommDirectiveSupportsAlignment = false; @@ -576,6 +584,8 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { SectionKind::getBSS()); TextSection = Ctx->getCOFFSection(".text", + (IsWoA ? COFF::IMAGE_SCN_MEM_16BIT + : (COFF::SectionCharacteristics)0) | COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE | COFF::IMAGE_SCN_MEM_READ, @@ -591,12 +601,18 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); - if (T.isKnownWindowsMSVCEnvironment()) { + + if (T.isKnownWindowsMSVCEnvironment() || T.isWindowsItaniumEnvironment()) { StaticCtorSection = Ctx->getCOFFSection(".CRT$XCU", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); + StaticDtorSection = + Ctx->getCOFFSection(".CRT$XTX", + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getReadOnly()); } else { StaticCtorSection = Ctx->getCOFFSection(".ctors", @@ -604,16 +620,6 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); - } - - - if (T.isKnownWindowsMSVCEnvironment()) { - StaticDtorSection = - Ctx->getCOFFSection(".CRT$XTX", - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - COFF::IMAGE_SCN_MEM_READ, - SectionKind::getReadOnly()); - } else { StaticDtorSection = Ctx->getCOFFSection(".dtors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -706,40 +712,46 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfInfoDWOSection = - Ctx->getCOFFSection(".debug_info.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - COFF::IMAGE_SCN_MEM_READ, - SectionKind::getMetadata()); - DwarfTypesDWOSection = - Ctx->getCOFFSection(".debug_types.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - COFF::IMAGE_SCN_MEM_READ, + Ctx->getCOFFSection(".debug_info.dwo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfAbbrevDWOSection = - Ctx->getCOFFSection(".debug_abbrev.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - COFF::IMAGE_SCN_MEM_READ, + Ctx->getCOFFSection(".debug_abbrev.dwo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfStrDWOSection = - Ctx->getCOFFSection(".debug_str.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - COFF::IMAGE_SCN_MEM_READ, + Ctx->getCOFFSection(".debug_str.dwo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfLineDWOSection = - Ctx->getCOFFSection(".debug_line.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - 
COFF::IMAGE_SCN_MEM_READ, + Ctx->getCOFFSection(".debug_line.dwo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfLocDWOSection = - Ctx->getCOFFSection(".debug_loc.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - COFF::IMAGE_SCN_MEM_READ, + Ctx->getCOFFSection(".debug_loc.dwo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfStrOffDWOSection = - Ctx->getCOFFSection(".debug_str_offsets.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | - COFF::IMAGE_SCN_MEM_READ, + Ctx->getCOFFSection(".debug_str_offsets.dwo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); - DwarfAddrSection = Ctx->getCOFFSection( - ".debug_addr", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getMetadata()); + + DwarfAddrSection = + Ctx->getCOFFSection(".debug_addr", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DrectveSection = Ctx->getCOFFSection(".drectve", - COFF::IMAGE_SCN_LNK_INFO, + COFF::IMAGE_SCN_LNK_INFO | + COFF::IMAGE_SCN_LNK_REMOVE, SectionKind::getMetadata()); PDataSection = @@ -753,6 +765,7 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getDataRel()); + TLSDataSection = Ctx->getCOFFSection(".tls$", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -792,7 +805,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, // cellspu-apple-darwin. Perhaps we should fix in Triple? if ((Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || Arch == Triple::thumb || - Arch == Triple::arm64 || + Arch == Triple::arm64 || Arch == Triple::aarch64 || Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::UnknownArch) && (T.isOSDarwin() || T.isOSBinFormatMachO())) { @@ -814,6 +827,13 @@ const MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const { SectionKind::getMetadata(), 0, utostr(Hash)); } +const MCSection * +MCObjectFileInfo::getDwarfTypesDWOSection(uint64_t Hash) const { + return Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, + ELF::SHF_GROUP, SectionKind::getMetadata(), 0, + utostr(Hash)); +} + void MCObjectFileInfo::InitEHFrameSection() { if (Env == IsMachO) EHFrameSection = diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 633d10124816..cbff7beccae0 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -102,7 +102,7 @@ struct MacroInstantiation { struct ParseStatementInfo { /// \brief The parsed operands from the last parsed statement. - SmallVector ParsedOperands; + SmallVector, 8> ParsedOperands; /// \brief The opcode from the last parsed instruction. unsigned Opcode; @@ -115,13 +115,6 @@ struct ParseStatementInfo { ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(nullptr) {} ParseStatementInfo(SmallVectorImpl *rewrites) : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {} - - ~ParseStatementInfo() { - // Free any parsed operands. - for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i) - delete ParsedOperands[i]; - ParsedOperands.clear(); - } }; /// \brief The concrete assembly parser instance. 
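The ParseStatementInfo change above swaps raw operand pointers for owning ones, which is why the hand-written destructor that deleted each operand can go away. A standalone illustration of the same ownership pattern; OperandSketch and StatementInfoSketch are invented stand-ins.

    #include "llvm/ADT/SmallVector.h"
    #include <memory>

    namespace {
    struct OperandSketch {}; // stand-in for a parsed operand

    struct StatementInfoSketch {
      // Owning container: each element is destroyed with the vector, so no
      // explicit destructor walking the list is required.
      llvm::SmallVector<std::unique_ptr<OperandSketch>, 8> ParsedOperands;
    };
    } // end anonymous namespace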
@@ -498,8 +491,8 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, const MCAsmInfo &_MAI) : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM), PlatformParser(nullptr), CurBuffer(0), MacrosEnabledFlag(true), - CppHashLineNumber(0), AssemblerDialect(~0U), IsDarwin(false), - ParsingInlineAsm(false) { + HadError(false), CppHashLineNumber(0), AssemblerDialect(~0U), + IsDarwin(false), ParsingInlineAsm(false) { // Save the old handler. SavedDiagHandler = SrcMgr.getDiagHandler(); SavedDiagContext = SrcMgr.getDiagContext(); @@ -528,7 +521,8 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, } AsmParser::~AsmParser() { - assert(ActiveMacros.empty() && "Unexpected active macro instantiation!"); + assert((HadError || ActiveMacros.empty()) && + "Unexpected active macro instantiation!"); // Destroy any macros. for (StringMap::iterator it = MacroMap.begin(), @@ -4464,27 +4458,27 @@ bool AsmParser::parseMSInlineAsm( // Build the list of clobbers, outputs and inputs. for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) { - MCParsedAsmOperand *Operand = Info.ParsedOperands[i]; + MCParsedAsmOperand &Operand = *Info.ParsedOperands[i]; // Immediate. - if (Operand->isImm()) + if (Operand.isImm()) continue; // Register operand. - if (Operand->isReg() && !Operand->needAddressOf()) { + if (Operand.isReg() && !Operand.needAddressOf()) { unsigned NumDefs = Desc.getNumDefs(); // Clobber. - if (NumDefs && Operand->getMCOperandNum() < NumDefs) - ClobberRegs.push_back(Operand->getReg()); + if (NumDefs && Operand.getMCOperandNum() < NumDefs) + ClobberRegs.push_back(Operand.getReg()); continue; } // Expr/Input or Output. - StringRef SymName = Operand->getSymName(); + StringRef SymName = Operand.getSymName(); if (SymName.empty()) continue; - void *OpDecl = Operand->getOpDecl(); + void *OpDecl = Operand.getOpDecl(); if (!OpDecl) continue; @@ -4493,13 +4487,13 @@ bool AsmParser::parseMSInlineAsm( if (isOutput) { ++InputIdx; OutputDecls.push_back(OpDecl); - OutputDeclsAddressOf.push_back(Operand->needAddressOf()); - OutputConstraints.push_back('=' + Operand->getConstraint().str()); + OutputDeclsAddressOf.push_back(Operand.needAddressOf()); + OutputConstraints.push_back('=' + Operand.getConstraint().str()); AsmStrRewrites.push_back(AsmRewrite(AOK_Output, Start, SymName.size())); } else { InputDecls.push_back(OpDecl); - InputDeclsAddressOf.push_back(Operand->needAddressOf()); - InputConstraints.push_back(Operand->getConstraint().str()); + InputDeclsAddressOf.push_back(Operand.needAddressOf()); + InputConstraints.push_back(Operand.getConstraint().str()); AsmStrRewrites.push_back(AsmRewrite(AOK_Input, Start, SymName.size())); } } diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp index decf01c46c46..8e8447f7b89b 100644 --- a/lib/MC/MCParser/COFFAsmParser.cpp +++ b/lib/MC/MCParser/COFFAsmParser.cpp @@ -37,7 +37,7 @@ class COFFAsmParser : public MCAsmParserExtension { bool ParseSectionSwitch(StringRef Section, unsigned Characteristics, SectionKind Kind, StringRef COMDATSymName, - COFF::COMDATType Type, const MCSectionCOFF *Assoc); + COFF::COMDATType Type); bool ParseSectionName(StringRef &SectionName); bool ParseSectionFlags(StringRef FlagsString, unsigned* Flags); @@ -117,8 +117,7 @@ class COFFAsmParser : public MCAsmParserExtension { bool ParseDirectiveEndef(StringRef, SMLoc); bool ParseDirectiveSecRel32(StringRef, SMLoc); bool ParseDirectiveSecIdx(StringRef, SMLoc); - bool parseCOMDATTypeAndAssoc(COFF::COMDATType &Type, - const 
MCSectionCOFF *&Assoc); + bool parseCOMDATType(COFF::COMDATType &Type); bool ParseDirectiveLinkOnce(StringRef, SMLoc); // Win64 EH directives. @@ -293,21 +292,20 @@ bool COFFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Characteristics, SectionKind Kind) { return ParseSectionSwitch(Section, Characteristics, Kind, "", - COFF::IMAGE_COMDAT_SELECT_ANY, nullptr); + COFF::IMAGE_COMDAT_SELECT_ANY); } bool COFFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Characteristics, SectionKind Kind, StringRef COMDATSymName, - COFF::COMDATType Type, - const MCSectionCOFF *Assoc) { + COFF::COMDATType Type) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in section switching directive"); Lex(); getStreamer().SwitchSection(getContext().getCOFFSection( - Section, Characteristics, Kind, COMDATSymName, Type, Assoc)); + Section, Characteristics, Kind, COMDATSymName, Type)); return false; } @@ -359,14 +357,13 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { } COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY; - const MCSectionCOFF *Assoc = nullptr; StringRef COMDATSymName; if (getLexer().is(AsmToken::Comma)) { Lex(); Flags |= COFF::IMAGE_SCN_LNK_COMDAT; - if (parseCOMDATTypeAndAssoc(Type, Assoc)) + if (parseCOMDATType(Type)) return true; if (getLexer().isNot(AsmToken::Comma)) @@ -381,7 +378,7 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { return TokError("unexpected token in directive"); SectionKind Kind = computeSectionKind(Flags); - ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type, Assoc); + ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type); return false; } @@ -461,9 +458,8 @@ bool COFFAsmParser::ParseDirectiveSecIdx(StringRef, SMLoc) { return false; } -/// ::= [ identifier [ identifier ] ] -bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type, - const MCSectionCOFF *&Assoc) { +/// ::= [ identifier ] +bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) { StringRef TypeId = getTok().getIdentifier(); Type = StringSwitch(TypeId) @@ -481,48 +477,28 @@ bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type, Lex(); - if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) { - SMLoc Loc = getTok().getLoc(); - StringRef AssocName; - if (ParseSectionName(AssocName)) - return TokError("expected associated section name"); - - Assoc = static_cast( - getContext().getCOFFSection(AssocName)); - if (!Assoc) - return Error(Loc, "cannot associate unknown section '" + AssocName + "'"); - if (!(Assoc->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)) - return Error(Loc, "associated section must be a COMDAT section"); - if (Assoc->getSelection() == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) - return Error(Loc, "associated section cannot be itself associative"); - } - return false; } /// ParseDirectiveLinkOnce -/// ::= .linkonce [ identifier [ identifier ] ] +/// ::= .linkonce [ identifier ] bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) { COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY; - const MCSectionCOFF *Assoc = nullptr; if (getLexer().is(AsmToken::Identifier)) - if (parseCOMDATTypeAndAssoc(Type, Assoc)) + if (parseCOMDATType(Type)) return true; const MCSectionCOFF *Current = static_cast( getStreamer().getCurrentSection().first); - - if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) { - if (Assoc == Current) - return Error(Loc, "cannot associate a section with itself"); - } + if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) + return Error(Loc, 
"cannot make section associative with .linkonce"); if (Current->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) return Error(Loc, Twine("section '") + Current->getSectionName() + "' is already linkonce"); - Current->setSelection(Type, Assoc); + Current->setSelection(Type); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index 95c4971c52e5..78bb6c7aad0e 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -561,6 +561,19 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { return false; } +static MCSymbolAttr MCAttrForString(StringRef Type) { + return StringSwitch(Type) + .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction) + .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject) + .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS) + .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon) + .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType) + .Cases("STT_GNU_IFUNC", "gnu_indirect_function", + MCSA_ELF_TypeIndFunction) + .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) + .Default(MCSA_Invalid); +} + /// ParseDirectiveELFType /// ::= .type identifier , STT_ /// ::= .type identifier , #attribute @@ -575,53 +588,36 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) { // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '.type' directive"); - Lex(); - - StringRef Type; - SMLoc TypeLoc; - MCSymbolAttr Attr; - if (getLexer().is(AsmToken::Identifier)) { - TypeLoc = getLexer().getLoc(); - if (getParser().parseIdentifier(Type)) - return TokError("expected symbol type in directive"); - Attr = StringSwitch(Type) - .Case("STT_FUNC", MCSA_ELF_TypeFunction) - .Case("STT_OBJECT", MCSA_ELF_TypeObject) - .Case("STT_TLS", MCSA_ELF_TypeTLS) - .Case("STT_COMMON", MCSA_ELF_TypeCommon) - .Case("STT_NOTYPE", MCSA_ELF_TypeNoType) - .Case("STT_GNU_IFUNC", MCSA_ELF_TypeIndFunction) - .Default(MCSA_Invalid); - } else if (getLexer().is(AsmToken::Hash) || getLexer().is(AsmToken::At) || - getLexer().is(AsmToken::Percent) || - getLexer().is(AsmToken::String)) { - if (!getLexer().is(AsmToken::String)) - Lex(); + // NOTE the comma is optional in all cases. It is only documented as being + // optional in the first case, however, GAS will silently treat the comma as + // optional in all cases. Furthermore, although the documentation states that + // the first form only accepts STT_, in reality, GAS + // accepts both the upper case name as well as the lower case aliases. 
+ if (getLexer().is(AsmToken::Comma)) + Lex(); - TypeLoc = getLexer().getLoc(); - if (getParser().parseIdentifier(Type)) - return TokError("expected symbol type in directive"); - Attr = StringSwitch(Type) - .Case("function", MCSA_ELF_TypeFunction) - .Case("object", MCSA_ELF_TypeObject) - .Case("tls_object", MCSA_ELF_TypeTLS) - .Case("common", MCSA_ELF_TypeCommon) - .Case("notype", MCSA_ELF_TypeNoType) - .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) - .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction) - .Default(MCSA_Invalid); - } else + if (getLexer().isNot(AsmToken::Identifier) && + getLexer().isNot(AsmToken::Hash) && getLexer().isNot(AsmToken::At) && + getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::String)) return TokError("expected STT_, '#', '@', " "'%' or \"\""); + if (getLexer().isNot(AsmToken::String) && + getLexer().isNot(AsmToken::Identifier)) + Lex(); + + SMLoc TypeLoc = getLexer().getLoc(); + + StringRef Type; + if (getParser().parseIdentifier(Type)) + return TokError("expected symbol type in directive"); + + MCSymbolAttr Attr = MCAttrForString(Type); if (Attr == MCSA_Invalid) return Error(TypeLoc, "unsupported attribute in '.type' directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.type' directive"); - Lex(); getStreamer().EmitSymbolAttribute(Sym, Attr); diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp index 335b8cd6121e..fc2bd365e16d 100644 --- a/lib/MC/MCSectionCOFF.cpp +++ b/lib/MC/MCSectionCOFF.cpp @@ -30,14 +30,9 @@ bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name, return false; } -void MCSectionCOFF::setSelection(int Selection, - const MCSectionCOFF *Assoc) const { +void MCSectionCOFF::setSelection(int Selection) const { assert(Selection != 0 && "invalid COMDAT selection type"); - assert((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) == - (Assoc != nullptr) && - "associative COMDAT section must have an associated section"); this->Selection = Selection; - this->Assoc = Assoc; Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; } @@ -82,7 +77,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << "same_contents,"; break; case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE: - OS << "associative " << Assoc->getSectionName() << ","; + OS << "associative,"; break; case COFF::IMAGE_COMDAT_SELECT_LARGEST: OS << "largest,"; diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp index e1b6a5889d89..8e946d57f7fb 100644 --- a/lib/MC/MCTargetOptions.cpp +++ b/lib/MC/MCTargetOptions.cpp @@ -14,6 +14,6 @@ namespace llvm { MCTargetOptions::MCTargetOptions() : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false), MCSaveTempLabels(false), MCUseDwarfDirectory(false), - ShowMCEncoding(false), ShowMCInst(false) {} + ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false) {} } // end namespace llvm diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index cbaf0b87d6e2..5214398bbcfe 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -303,20 +303,50 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol, assert(OS.tell() - Start == sizeof(MachO::dysymtab_command)); } +MachObjectWriter::MachSymbolData * +MachObjectWriter::findSymbolData(const MCSymbol &Sym) { + for (auto &Entry : LocalSymbolData) + if (&Entry.SymbolData->getSymbol() == &Sym) + return &Entry; + + for (auto &Entry : ExternalSymbolData) + if (&Entry.SymbolData->getSymbol() == &Sym) + return &Entry; + + for (auto 
&Entry : UndefinedSymbolData) + if (&Entry.SymbolData->getSymbol() == &Sym) + return &Entry; + + return nullptr; +} + void MachObjectWriter::WriteNlist(MachSymbolData &MSD, const MCAsmLayout &Layout) { MCSymbolData &Data = *MSD.SymbolData; - const MCSymbol &Symbol = Data.getSymbol(); + const MCSymbol *Symbol = &Data.getSymbol(); + const MCSymbol *AliasedSymbol = &Symbol->AliasedSymbol(); + uint8_t SectionIndex = MSD.SectionIndex; uint8_t Type = 0; uint16_t Flags = Data.getFlags(); uint64_t Address = 0; + bool IsAlias = Symbol != AliasedSymbol; + + MachSymbolData *AliaseeInfo; + if (IsAlias) { + AliaseeInfo = findSymbolData(*AliasedSymbol); + if (AliaseeInfo) + SectionIndex = AliaseeInfo->SectionIndex; + Symbol = AliasedSymbol; + } // Set the N_TYPE bits. See . // // FIXME: Are the prebound or indirect fields possible here? - if (Symbol.isUndefined()) + if (IsAlias && Symbol->isUndefined()) + Type = MachO::N_INDR; + else if (Symbol->isUndefined()) Type = MachO::N_UNDF; - else if (Symbol.isAbsolute()) + else if (Symbol->isAbsolute()) Type = MachO::N_ABS; else Type = MachO::N_SECT; @@ -327,13 +357,15 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD, Type |= MachO::N_PEXT; // Set external bit. - if (Data.isExternal() || Symbol.isUndefined()) + if (Data.isExternal() || (!IsAlias && Symbol->isUndefined())) Type |= MachO::N_EXT; // Compute the symbol address. - if (Symbol.isDefined()) { + if (IsAlias && Symbol->isUndefined()) + Address = AliaseeInfo->StringIndex; + else if (Symbol->isDefined()) Address = getSymbolAddress(&Data, Layout); - } else if (Data.isCommon()) { + else if (Data.isCommon()) { // Common symbols are encoded with the size in the address // field, and their alignment in the flags. Address = Data.getCommonSize(); @@ -344,21 +376,21 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD, assert((1U << Log2Size) == Align && "Invalid 'common' alignment!"); if (Log2Size > 15) report_fatal_error("invalid 'common' alignment '" + - Twine(Align) + "' for '" + Symbol.getName() + "'", + Twine(Align) + "' for '" + Symbol->getName() + "'", false); // FIXME: Keep this mask with the SymbolFlags enumeration. Flags = (Flags & 0xF0FF) | (Log2Size << 8); } } - if (Layout.getAssembler().isThumbFunc(&Symbol)) + if (Layout.getAssembler().isThumbFunc(Symbol)) Flags |= SF_ThumbFunc; // struct nlist (12 bytes) Write32(MSD.StringIndex); Write8(Type); - Write8(MSD.SectionIndex); + Write8(SectionIndex); // The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc' // value. 
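The WriteNlist changes above teach the Mach-O writer about symbol aliases: an alias whose target is undefined is written as an indirect symbol, and a defined alias borrows its aliasee's section index. A compressed restatement of just the N_TYPE decision, assuming the alias/undefined/absolute flags have already been computed; pickNlistType is an invented name.

    #include "llvm/Support/MachO.h"

    static uint8_t pickNlistType(bool IsAlias, bool IsUndefined,
                                 bool IsAbsolute) {
      if (IsAlias && IsUndefined)
        return llvm::MachO::N_INDR; // alias of an undefined symbol
      if (IsUndefined)
        return llvm::MachO::N_UNDF;
      if (IsAbsolute)
        return llvm::MachO::N_ABS;
      return llvm::MachO::N_SECT;
    }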
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 2cc027bfa136..2d4b75858a5f 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -347,6 +347,14 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) { COFFSection *coff_section = createSection(Sec.getSectionName()); COFFSymbol *coff_symbol = createSymbol(Sec.getSectionName()); + if (Sec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) { + if (const MCSymbol *S = Sec.getCOMDATSymbol()) { + COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S); + if (COMDATSymbol->Section) + report_fatal_error("two sections have the same comdat"); + COMDATSymbol->Section = coff_section; + } + } coff_section->Symbol = coff_symbol; coff_symbol->Section = coff_section; @@ -458,9 +466,15 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData, coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE; } else { const MCSymbolData &BaseData = Assembler.getSymbolData(*Base); - if (BaseData.Fragment) - coff_symbol->Section = + if (BaseData.Fragment) { + COFFSection *Sec = SectionMap[&BaseData.Fragment->getParent()->getSection()]; + + if (coff_symbol->Section && coff_symbol->Section != Sec) + report_fatal_error("conflicting sections for symbol"); + + coff_symbol->Section = Sec; + } } coff_symbol->MCData = &ResSymData; @@ -808,7 +822,8 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, } } - coff_section->Relocations.push_back(Reloc); + if (TargetObjectWriter->recordRelocation(Fixup)) + coff_section->Relocations.push_back(Reloc); } void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, @@ -818,13 +833,9 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, DenseMap SectionIndices; for (auto & Section : Sections) { - if (Layout.getSectionAddressSize(Section->MCData) > 0) { - size_t Number = ++Header.NumberOfSections; - SectionIndices[Section.get()] = Number; - MakeSectionReal(*Section, Number); - } else { - Section->Number = -1; - } + size_t Number = ++Header.NumberOfSections; + SectionIndices[Section.get()] = Number; + MakeSectionReal(*Section, Number); } Header.NumberOfSymbols = 0; @@ -864,11 +875,15 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, const MCSectionCOFF &MCSec = static_cast(Section->MCData->getSection()); - COFFSection *Assoc = SectionMap.lookup(MCSec.getAssocSection()); + const MCSymbol *COMDAT = MCSec.getCOMDATSymbol(); + assert(COMDAT); + COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(COMDAT); + assert(COMDATSymbol); + COFFSection *Assoc = COMDATSymbol->Section; if (!Assoc) - report_fatal_error(Twine("Missing associated COMDAT section ") + - MCSec.getAssocSection()->getSectionName() + - " for section " + MCSec.getSectionName()); + report_fatal_error( + Twine("Missing associated COMDAT section for section ") + + MCSec.getSectionName()); // Skip this section if the associated section is unused. 
if (Assoc->Number == -1) diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 40b8dd944bd9..e6df4651a536 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmLayout.h" @@ -125,30 +126,39 @@ void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { assert((!Symbol->isInSection() || Symbol->getSection().getVariant() == MCSection::SV_COFF) && "Got non-COFF section in the COFF backend!"); - assert(!CurSymbol && "starting new symbol definition in a symbol definition"); + + if (CurSymbol) + FatalError("starting a new symbol definition without completing the " + "previous one"); CurSymbol = Symbol; } void MCWinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { - assert(CurSymbol && "StorageClass specified outside of symbol definition"); - assert((StorageClass & ~0xFF) == 0 && - "StorageClass must only have data in the first byte!"); + if (!CurSymbol) + FatalError("storage class specified outside of symbol definition"); + + if (StorageClass & ~0xff) + FatalError(Twine("storage class value '") + itostr(StorageClass) + + "' out of range"); MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*CurSymbol); SD.modifyFlags(StorageClass << COFF::SF_ClassShift, COFF::SF_ClassMask); } void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { - assert(CurSymbol && "SymbolType specified outside of a symbol definition"); - assert((Type & ~0xFFFF) == 0 && - "Type must only have data in the first 2 bytes"); + if (!CurSymbol) + FatalError("symbol type specified outside of a symbol definition"); + + if (Type & ~0xffff) + FatalError(Twine("type value '") + itostr(Type) + "' out of range"); MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*CurSymbol); SD.modifyFlags(Type << COFF::SF_TypeShift, COFF::SF_TypeMask); } void MCWinCOFFStreamer::EndCOFFSymbolDef() { - assert(CurSymbol && "ending symbol definition without beginning one"); + if (!CurSymbol) + FatalError("ending symbol definition without starting one"); CurSymbol = nullptr; } @@ -239,5 +249,10 @@ void MCWinCOFFStreamer::EmitWin64EHHandlerData() { void MCWinCOFFStreamer::FinishImpl() { MCObjectStreamer::FinishImpl(); } + +LLVM_ATTRIBUTE_NORETURN +void MCWinCOFFStreamer::FatalError(const Twine &Msg) const { + getContext().FatalError(SMLoc(), Msg); +} } diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp index 304ca475e130..05e891384f12 100644 --- a/lib/Object/Archive.cpp +++ b/lib/Object/Archive.cpp @@ -115,18 +115,14 @@ Archive::Child Archive::Child::getNext() const { return Child(Parent, NextLoc); } -error_code Archive::Child::getName(StringRef &Result) const { +ErrorOr Archive::Child::getName() const { StringRef name = getRawName(); // Check if it's a special name. if (name[0] == '/') { - if (name.size() == 1) { // Linker member. - Result = name; - return object_error::success; - } - if (name.size() == 2 && name[1] == '/') { // String table. - Result = name; - return object_error::success; - } + if (name.size() == 1) // Linker member. + return name; + if (name.size() == 2 && name[1] == '/') // String table. + return name; // It's a long name. // Get the offset. std::size_t offset; @@ -147,64 +143,57 @@ error_code Archive::Child::getName(StringRef &Result) const { // GNU long file names end with a /. 
if (Parent->kind() == K_GNU) { StringRef::size_type End = StringRef(addr).find('/'); - Result = StringRef(addr, End); - } else { - Result = addr; + return StringRef(addr, End); } - return object_error::success; + return StringRef(addr); } else if (name.startswith("#1/")) { uint64_t name_size; if (name.substr(3).rtrim(" ").getAsInteger(10, name_size)) llvm_unreachable("Long name length is not an ingeter"); - Result = Data.substr(sizeof(ArchiveMemberHeader), name_size) + return Data.substr(sizeof(ArchiveMemberHeader), name_size) .rtrim(StringRef("\0", 1)); - return object_error::success; } // It's a simple name. if (name[name.size() - 1] == '/') - Result = name.substr(0, name.size() - 1); - else - Result = name; - return object_error::success; + return name.substr(0, name.size() - 1); + return name; } -error_code Archive::Child::getMemoryBuffer(std::unique_ptr &Result, - bool FullPath) const { - StringRef Name; - if (error_code ec = getName(Name)) - return ec; +ErrorOr> +Archive::Child::getMemoryBuffer(bool FullPath) const { + ErrorOr NameOrErr = getName(); + if (std::error_code EC = NameOrErr.getError()) + return EC; + StringRef Name = NameOrErr.get(); SmallString<128> Path; - Result.reset(MemoryBuffer::getMemBuffer( - getBuffer(), FullPath ? (Twine(Parent->getFileName()) + "(" + Name + ")") - .toStringRef(Path) - : Name, + std::unique_ptr Ret(MemoryBuffer::getMemBuffer( + getBuffer(), + FullPath + ? (Twine(Parent->getFileName()) + "(" + Name + ")").toStringRef(Path) + : Name, false)); - return error_code::success(); + return std::move(Ret); } -error_code Archive::Child::getAsBinary(std::unique_ptr &Result, - LLVMContext *Context) const { +ErrorOr> +Archive::Child::getAsBinary(LLVMContext *Context) const { std::unique_ptr ret; - std::unique_ptr Buff; - if (error_code ec = getMemoryBuffer(Buff)) - return ec; - ErrorOr BinaryOrErr = createBinary(Buff.release(), Context); - if (error_code EC = BinaryOrErr.getError()) + ErrorOr> BuffOrErr = getMemoryBuffer(); + if (std::error_code EC = BuffOrErr.getError()) return EC; - Result.reset(BinaryOrErr.get()); - return object_error::success; + return createBinary(BuffOrErr.get().release(), Context); } ErrorOr Archive::create(MemoryBuffer *Source) { - error_code EC; + std::error_code EC; std::unique_ptr Ret(new Archive(Source, EC)); if (EC) return EC; return Ret.release(); } -Archive::Archive(MemoryBuffer *source, error_code &ec) - : Binary(Binary::ID_Archive, source), SymbolTable(child_end()) { +Archive::Archive(MemoryBuffer *source, std::error_code &ec) + : Binary(Binary::ID_Archive, source), SymbolTable(child_end()) { // Check for sufficient magic. assert(source); if (source->getBufferSize() < 8 || @@ -255,9 +244,11 @@ Archive::Archive(MemoryBuffer *source, error_code &ec) if (Name.startswith("#1/")) { Format = K_BSD; // We know this is BSD, so getName will work since there is no string table. 
- ec = i->getName(Name); + ErrorOr NameOrErr = i->getName(); + ec = NameOrErr.getError(); if (ec) return; + Name = NameOrErr.get(); if (Name == "__.SYMDEF SORTED") { SymbolTable = i; ++i; @@ -335,12 +326,11 @@ Archive::child_iterator Archive::child_end() const { return Child(this, nullptr); } -error_code Archive::Symbol::getName(StringRef &Result) const { - Result = StringRef(Parent->SymbolTable->getBuffer().begin() + StringIndex); - return object_error::success; +StringRef Archive::Symbol::getName() const { + return Parent->SymbolTable->getBuffer().begin() + StringIndex; } -error_code Archive::Symbol::getMember(child_iterator &Result) const { +ErrorOr Archive::Symbol::getMember() const { const char *Buf = Parent->SymbolTable->getBuffer().begin(); const char *Offsets = Buf + 4; uint32_t Offset = 0; @@ -380,9 +370,8 @@ error_code Archive::Symbol::getMember(child_iterator &Result) const { } const char *Loc = Parent->getData().begin() + Offset; - Result = Child(Parent, Loc); - - return object_error::success; + child_iterator Iter(Child(Parent, Loc)); + return Iter; } Archive::Symbol Archive::Symbol::getNext() const { @@ -440,16 +429,15 @@ Archive::symbol_iterator Archive::symbol_end() const { Archive::child_iterator Archive::findSym(StringRef name) const { Archive::symbol_iterator bs = symbol_begin(); Archive::symbol_iterator es = symbol_end(); - Archive::child_iterator result; - - StringRef symname; + for (; bs != es; ++bs) { - if (bs->getName(symname)) - return child_end(); - if (symname == name) { - if (bs->getMember(result)) + StringRef SymName = bs->getName(); + if (SymName == name) { + ErrorOr ResultOrErr = bs->getMember(); + // FIXME: Should we really eat the error? + if (ResultOrErr.getError()) return child_end(); - return result; + return ResultOrErr.get(); } } return child_end(); diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp index 63fd3ed01106..c55ed0c75988 100644 --- a/lib/Object/Binary.cpp +++ b/lib/Object/Binary.cpp @@ -81,7 +81,7 @@ ErrorOr object::createBinary(MemoryBuffer *Source, ErrorOr object::createBinary(StringRef Path) { std::unique_ptr File; - if (error_code EC = MemoryBuffer::getFileOrSTDIN(Path, File)) + if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Path, File)) return EC; return createBinary(File.release()); } diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 262c040c57a8..186d64bafd44 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -31,7 +31,8 @@ using support::ulittle32_t; using support::little16_t; // Returns false if size is greater than the buffer size. And sets ec. -static bool checkSize(const MemoryBuffer *M, error_code &EC, uint64_t Size) { +static bool checkSize(const MemoryBuffer *M, std::error_code &EC, + uint64_t Size) { if (M->getBufferSize() < Size) { EC = object_error::unexpected_eof; return false; @@ -41,9 +42,10 @@ static bool checkSize(const MemoryBuffer *M, error_code &EC, uint64_t Size) { // Sets Obj unless any bytes in [addr, addr + size) fall outsize of m. // Returns unexpected_eof if error. 
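With Archive::Child and Archive::Symbol now returning ErrorOr<...> instead of filling out-parameters, callers follow the same getError()/get() pattern the patch itself uses. A minimal caller sketch; printMemberName is an invented helper.

    #include "llvm/Object/Archive.h"
    #include "llvm/Support/ErrorOr.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::object;

    static std::error_code printMemberName(const Archive::Child &C,
                                           raw_ostream &OS) {
      ErrorOr<StringRef> NameOrErr = C.getName();
      if (std::error_code EC = NameOrErr.getError())
        return EC; // propagate instead of asserting
      OS << NameOrErr.get() << "\n";
      return std::error_code();
    }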
-template -static error_code getObject(const T *&Obj, const MemoryBuffer *M, - const uint8_t *Ptr, const size_t Size = sizeof(T)) { +template +static std::error_code getObject(const T *&Obj, const MemoryBuffer *M, + const uint8_t *Ptr, + const size_t Size = sizeof(T)) { uintptr_t Addr = uintptr_t(Ptr); if (Addr + Size < Addr || Addr + Size < Size || @@ -129,17 +131,17 @@ void COFFObjectFile::moveSymbolNext(DataRefImpl &Ref) const { Ref.p = reinterpret_cast(Symb); } -error_code COFFObjectFile::getSymbolName(DataRefImpl Ref, - StringRef &Result) const { +std::error_code COFFObjectFile::getSymbolName(DataRefImpl Ref, + StringRef &Result) const { const coff_symbol *Symb = toSymb(Ref); return getSymbolName(Symb, Result); } -error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref, - uint64_t &Result) const { +std::error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref, + uint64_t &Result) const { const coff_symbol *Symb = toSymb(Ref); const coff_section *Section = nullptr; - if (error_code EC = getSection(Symb->SectionNumber, Section)) + if (std::error_code EC = getSection(Symb->SectionNumber, Section)) return EC; if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED) @@ -151,8 +153,8 @@ error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref, return object_error::success; } -error_code COFFObjectFile::getSymbolType(DataRefImpl Ref, - SymbolRef::Type &Result) const { +std::error_code COFFObjectFile::getSymbolType(DataRefImpl Ref, + SymbolRef::Type &Result) const { const coff_symbol *Symb = toSymb(Ref); Result = SymbolRef::ST_Other; if (Symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL && @@ -164,7 +166,7 @@ error_code COFFObjectFile::getSymbolType(DataRefImpl Ref, uint32_t Characteristics = 0; if (!COFF::isReservedSectionNumber(Symb->SectionNumber)) { const coff_section *Section = nullptr; - if (error_code EC = getSection(Symb->SectionNumber, Section)) + if (std::error_code EC = getSection(Symb->SectionNumber, Section)) return EC; Characteristics = Section->Characteristics; } @@ -202,14 +204,14 @@ uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const { return Result; } -error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref, - uint64_t &Result) const { +std::error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref, + uint64_t &Result) const { // FIXME: Return the correct size. This requires looking at all the symbols // in the same section as this symbol, and looking for either the next // symbol, or the end of the section. 
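The COFFObjectFile accessors keep their out-parameter shape through this migration; only the returned type changes to std::error_code. A short usage sketch iterating the symbol table; dumpSymbolNames is an invented helper.

    #include "llvm/Object/COFF.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::object;

    static void dumpSymbolNames(const COFFObjectFile &Obj, raw_ostream &OS) {
      for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end();
           I != E; ++I) {
        StringRef Name;
        if (std::error_code EC = I->getName(Name)) {
          OS << "<error reading symbol name: " << EC.message() << ">\n";
          continue;
        }
        OS << Name << "\n";
      }
    }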
const coff_symbol *Symb = toSymb(Ref); const coff_section *Section = nullptr; - if (error_code EC = getSection(Symb->SectionNumber, Section)) + if (std::error_code EC = getSection(Symb->SectionNumber, Section)) return EC; if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED) @@ -221,14 +223,16 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref, return object_error::success; } -error_code COFFObjectFile::getSymbolSection(DataRefImpl Ref, - section_iterator &Result) const { +std::error_code +COFFObjectFile::getSymbolSection(DataRefImpl Ref, + section_iterator &Result) const { const coff_symbol *Symb = toSymb(Ref); if (COFF::isReservedSectionNumber(Symb->SectionNumber)) { Result = section_end(); } else { const coff_section *Sec = nullptr; - if (error_code EC = getSection(Symb->SectionNumber, Sec)) return EC; + if (std::error_code EC = getSection(Symb->SectionNumber, Sec)) + return EC; DataRefImpl Ref; Ref.p = reinterpret_cast(Sec); Result = section_iterator(SectionRef(Ref, this)); @@ -242,37 +246,37 @@ void COFFObjectFile::moveSectionNext(DataRefImpl &Ref) const { Ref.p = reinterpret_cast(Sec); } -error_code COFFObjectFile::getSectionName(DataRefImpl Ref, - StringRef &Result) const { +std::error_code COFFObjectFile::getSectionName(DataRefImpl Ref, + StringRef &Result) const { const coff_section *Sec = toSec(Ref); return getSectionName(Sec, Result); } -error_code COFFObjectFile::getSectionAddress(DataRefImpl Ref, - uint64_t &Result) const { +std::error_code COFFObjectFile::getSectionAddress(DataRefImpl Ref, + uint64_t &Result) const { const coff_section *Sec = toSec(Ref); Result = Sec->VirtualAddress; return object_error::success; } -error_code COFFObjectFile::getSectionSize(DataRefImpl Ref, - uint64_t &Result) const { +std::error_code COFFObjectFile::getSectionSize(DataRefImpl Ref, + uint64_t &Result) const { const coff_section *Sec = toSec(Ref); Result = Sec->SizeOfRawData; return object_error::success; } -error_code COFFObjectFile::getSectionContents(DataRefImpl Ref, - StringRef &Result) const { +std::error_code COFFObjectFile::getSectionContents(DataRefImpl Ref, + StringRef &Result) const { const coff_section *Sec = toSec(Ref); ArrayRef Res; - error_code EC = getSectionContents(Sec, Res); + std::error_code EC = getSectionContents(Sec, Res); Result = StringRef(reinterpret_cast(Res.data()), Res.size()); return EC; } -error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref, - uint64_t &Res) const { +std::error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref, + uint64_t &Res) const { const coff_section *Sec = toSec(Ref); if (!Sec) return object_error::parse_failed; @@ -280,62 +284,64 @@ error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref, return object_error::success; } -error_code COFFObjectFile::isSectionText(DataRefImpl Ref, - bool &Result) const { +std::error_code COFFObjectFile::isSectionText(DataRefImpl Ref, + bool &Result) const { const coff_section *Sec = toSec(Ref); Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE; return object_error::success; } -error_code COFFObjectFile::isSectionData(DataRefImpl Ref, - bool &Result) const { +std::error_code COFFObjectFile::isSectionData(DataRefImpl Ref, + bool &Result) const { const coff_section *Sec = toSec(Ref); Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA; return object_error::success; } -error_code COFFObjectFile::isSectionBSS(DataRefImpl Ref, - bool &Result) const { +std::error_code COFFObjectFile::isSectionBSS(DataRefImpl Ref, + bool &Result) const { const coff_section *Sec 
= toSec(Ref); Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA; return object_error::success; } -error_code COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref, - bool &Result) const { +std::error_code +COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref, + bool &Result) const { // FIXME: Unimplemented Result = true; return object_error::success; } -error_code COFFObjectFile::isSectionVirtual(DataRefImpl Ref, - bool &Result) const { +std::error_code COFFObjectFile::isSectionVirtual(DataRefImpl Ref, + bool &Result) const { const coff_section *Sec = toSec(Ref); Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA; return object_error::success; } -error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Ref, - bool &Result) const { +std::error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Ref, + bool &Result) const { // FIXME: Unimplemented. Result = false; return object_error::success; } -error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref, - bool &Result) const { +std::error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref, + bool &Result) const { // FIXME: Unimplemented. Result = false; return object_error::success; } -error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef, - DataRefImpl SymbRef, - bool &Result) const { +std::error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef, + DataRefImpl SymbRef, + bool &Result) const { const coff_section *Sec = toSec(SecRef); const coff_symbol *Symb = toSymb(SymbRef); const coff_section *SymbSec = nullptr; - if (error_code EC = getSection(Symb->SectionNumber, SymbSec)) return EC; + if (std::error_code EC = getSection(Symb->SectionNumber, SymbSec)) + return EC; if (SymbSec == Sec) Result = true; else @@ -390,8 +396,8 @@ relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Ref) const { } // Initialize the pointer to the symbol table. -error_code COFFObjectFile::initSymbolTablePtr() { - if (error_code EC = getObject( +std::error_code COFFObjectFile::initSymbolTablePtr() { + if (std::error_code EC = getObject( SymbolTable, Data, base() + COFFHeader->PointerToSymbolTable, COFFHeader->NumberOfSymbols * sizeof(coff_symbol))) return EC; @@ -403,11 +409,11 @@ error_code COFFObjectFile::initSymbolTablePtr() { base() + COFFHeader->PointerToSymbolTable + COFFHeader->NumberOfSymbols * sizeof(coff_symbol); const ulittle32_t *StringTableSizePtr; - if (error_code EC = getObject(StringTableSizePtr, Data, StringTableAddr)) + if (std::error_code EC = getObject(StringTableSizePtr, Data, StringTableAddr)) return EC; StringTableSize = *StringTableSizePtr; - if (error_code EC = - getObject(StringTable, Data, StringTableAddr, StringTableSize)) + if (std::error_code EC = + getObject(StringTable, Data, StringTableAddr, StringTableSize)) return EC; // Treat table sizes < 4 as empty because contrary to the PECOFF spec, some @@ -422,7 +428,7 @@ error_code COFFObjectFile::initSymbolTablePtr() { } // Returns the file offset for the given VA. -error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const { +std::error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const { uint64_t ImageBase = PE32Header ? (uint64_t)PE32Header->ImageBase : (uint64_t)PE32PlusHeader->ImageBase; uint64_t Rva = Addr - ImageBase; @@ -431,7 +437,7 @@ error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const { } // Returns the file offset for the given RVA. 
-error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const { +std::error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const { for (const SectionRef &S : sections()) { const coff_section *Section = getCOFFSection(S); uint32_t SectionStart = Section->VirtualAddress; @@ -447,10 +453,10 @@ error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const { // Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name // table entry. -error_code COFFObjectFile:: -getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const { +std::error_code COFFObjectFile::getHintName(uint32_t Rva, uint16_t &Hint, + StringRef &Name) const { uintptr_t IntPtr = 0; - if (error_code EC = getRvaPtr(Rva, IntPtr)) + if (std::error_code EC = getRvaPtr(Rva, IntPtr)) return EC; const uint8_t *Ptr = reinterpret_cast(IntPtr); Hint = *reinterpret_cast(Ptr); @@ -459,7 +465,7 @@ getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const { } // Find the import table. -error_code COFFObjectFile::initImportTablePtr() { +std::error_code COFFObjectFile::initImportTablePtr() { // First, we get the RVA of the import table. If the file lacks a pointer to // the import table, do nothing. const data_directory *DataEntry; @@ -477,7 +483,7 @@ error_code COFFObjectFile::initImportTablePtr() { // Find the section that contains the RVA. This is needed because the RVA is // the import table's memory address which is different from its file offset. uintptr_t IntPtr = 0; - if (error_code EC = getRvaPtr(ImportTableRva, IntPtr)) + if (std::error_code EC = getRvaPtr(ImportTableRva, IntPtr)) return EC; ImportDirectory = reinterpret_cast< const import_directory_table_entry *>(IntPtr); @@ -485,7 +491,7 @@ error_code COFFObjectFile::initImportTablePtr() { } // Find the export table. -error_code COFFObjectFile::initExportTablePtr() { +std::error_code COFFObjectFile::initExportTablePtr() { // First, we get the RVA of the export table. If the file lacks a pointer to // the export table, do nothing. const data_directory *DataEntry; @@ -498,14 +504,14 @@ error_code COFFObjectFile::initExportTablePtr() { uint32_t ExportTableRva = DataEntry->RelativeVirtualAddress; uintptr_t IntPtr = 0; - if (error_code EC = getRvaPtr(ExportTableRva, IntPtr)) + if (std::error_code EC = getRvaPtr(ExportTableRva, IntPtr)) return EC; ExportDirectory = reinterpret_cast(IntPtr); return object_error::success; } -COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC, +COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, std::error_code &EC, bool BufferOwned) : ObjectFile(Binary::ID_COFF, Object, BufferOwned), COFFHeader(nullptr), PE32Header(nullptr), PE32PlusHeader(nullptr), DataDirectory(nullptr), @@ -686,28 +692,30 @@ unsigned COFFObjectFile::getArch() const { // This method is kept here because lld uses this. As soon as we make // lld to use getCOFFHeader, this method will be removed. 
-error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const { +std::error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const { return getCOFFHeader(Res); } -error_code COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const { +std::error_code +COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const { Res = COFFHeader; return object_error::success; } -error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const { +std::error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const { Res = PE32Header; return object_error::success; } -error_code +std::error_code COFFObjectFile::getPE32PlusHeader(const pe32plus_header *&Res) const { Res = PE32PlusHeader; return object_error::success; } -error_code COFFObjectFile::getDataDirectory(uint32_t Index, - const data_directory *&Res) const { +std::error_code +COFFObjectFile::getDataDirectory(uint32_t Index, + const data_directory *&Res) const { // Error if if there's no data directory or the index is out of range. if (!DataDirectory) return object_error::parse_failed; @@ -720,8 +728,8 @@ error_code COFFObjectFile::getDataDirectory(uint32_t Index, return object_error::success; } -error_code COFFObjectFile::getSection(int32_t Index, - const coff_section *&Result) const { +std::error_code COFFObjectFile::getSection(int32_t Index, + const coff_section *&Result) const { // Check for special index values. if (COFF::isReservedSectionNumber(Index)) Result = nullptr; @@ -733,8 +741,8 @@ error_code COFFObjectFile::getSection(int32_t Index, return object_error::success; } -error_code COFFObjectFile::getString(uint32_t Offset, - StringRef &Result) const { +std::error_code COFFObjectFile::getString(uint32_t Offset, + StringRef &Result) const { if (StringTableSize <= 4) // Tried to get a string from an empty string table. return object_error::parse_failed; @@ -744,8 +752,8 @@ error_code COFFObjectFile::getString(uint32_t Offset, return object_error::success; } -error_code COFFObjectFile::getSymbol(uint32_t Index, - const coff_symbol *&Result) const { +std::error_code COFFObjectFile::getSymbol(uint32_t Index, + const coff_symbol *&Result) const { if (Index < COFFHeader->NumberOfSymbols) Result = SymbolTable + Index; else @@ -753,12 +761,12 @@ error_code COFFObjectFile::getSymbol(uint32_t Index, return object_error::success; } -error_code COFFObjectFile::getSymbolName(const coff_symbol *Symbol, - StringRef &Res) const { +std::error_code COFFObjectFile::getSymbolName(const coff_symbol *Symbol, + StringRef &Res) const { // Check for string table entry. First 4 bytes are 0. if (Symbol->Name.Offset.Zeroes == 0) { uint32_t Offset = Symbol->Name.Offset.Offset; - if (error_code EC = getString(Offset, Res)) + if (std::error_code EC = getString(Offset, Res)) return EC; return object_error::success; } @@ -795,8 +803,8 @@ ArrayRef COFFObjectFile::getSymbolAuxData( Symbol->NumberOfAuxSymbols * sizeof(coff_symbol)); } -error_code COFFObjectFile::getSectionName(const coff_section *Sec, - StringRef &Res) const { +std::error_code COFFObjectFile::getSectionName(const coff_section *Sec, + StringRef &Res) const { StringRef Name; if (Sec->Name[7] == 0) // Null terminated, let ::strlen figure out the length. 
@@ -815,7 +823,7 @@ error_code COFFObjectFile::getSectionName(const coff_section *Sec, if (Name.substr(1).getAsInteger(10, Offset)) return object_error::parse_failed; } - if (error_code EC = getString(Offset, Name)) + if (std::error_code EC = getString(Offset, Name)) return EC; } @@ -823,8 +831,9 @@ error_code COFFObjectFile::getSectionName(const coff_section *Sec, return object_error::success; } -error_code COFFObjectFile::getSectionContents(const coff_section *Sec, - ArrayRef &Res) const { +std::error_code +COFFObjectFile::getSectionContents(const coff_section *Sec, + ArrayRef &Res) const { // The only thing that we need to verify is that the contents is contained // within the file bounds. We don't need to make sure it doesn't cover other // data, as there's nothing that says that is not allowed. @@ -846,13 +855,13 @@ void COFFObjectFile::moveRelocationNext(DataRefImpl &Rel) const { reinterpret_cast(Rel.p) + 1); } -error_code COFFObjectFile::getRelocationAddress(DataRefImpl Rel, - uint64_t &Res) const { +std::error_code COFFObjectFile::getRelocationAddress(DataRefImpl Rel, + uint64_t &Res) const { report_fatal_error("getRelocationAddress not implemented in COFFObjectFile"); } -error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel, - uint64_t &Res) const { +std::error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel, + uint64_t &Res) const { Res = toRel(Rel)->VirtualAddress; return object_error::success; } @@ -864,8 +873,8 @@ symbol_iterator COFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const { return symbol_iterator(SymbolRef(Ref, this)); } -error_code COFFObjectFile::getRelocationType(DataRefImpl Rel, - uint64_t &Res) const { +std::error_code COFFObjectFile::getRelocationType(DataRefImpl Rel, + uint64_t &Res) const { const coff_relocation* R = toRel(Rel); Res = R->Type; return object_error::success; @@ -891,8 +900,9 @@ COFFObjectFile::getCOFFRelocation(const RelocationRef &Reloc) const { Res = #reloc_type; \ break; -error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel, - SmallVectorImpl &Result) const { +std::error_code +COFFObjectFile::getRelocationTypeName(DataRefImpl Rel, + SmallVectorImpl &Result) const { const coff_relocation *Reloc = toRel(Rel); StringRef Res; switch (COFFHeader->Machine) { @@ -966,26 +976,29 @@ error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel, #undef LLVM_COFF_SWITCH_RELOC_TYPE_NAME -error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel, - SmallVectorImpl &Result) const { +std::error_code +COFFObjectFile::getRelocationValueString(DataRefImpl Rel, + SmallVectorImpl &Result) const { const coff_relocation *Reloc = toRel(Rel); const coff_symbol *Symb = nullptr; - if (error_code EC = getSymbol(Reloc->SymbolTableIndex, Symb)) return EC; + if (std::error_code EC = getSymbol(Reloc->SymbolTableIndex, Symb)) + return EC; DataRefImpl Sym; Sym.p = reinterpret_cast(Symb); StringRef SymName; - if (error_code EC = getSymbolName(Sym, SymName)) return EC; + if (std::error_code EC = getSymbolName(Sym, SymName)) + return EC; Result.append(SymName.begin(), SymName.end()); return object_error::success; } -error_code COFFObjectFile::getLibraryNext(DataRefImpl LibData, - LibraryRef &Result) const { +std::error_code COFFObjectFile::getLibraryNext(DataRefImpl LibData, + LibraryRef &Result) const { report_fatal_error("getLibraryNext not implemented in COFFObjectFile"); } -error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData, - StringRef &Result) const { +std::error_code COFFObjectFile::getLibraryPath(DataRefImpl 
LibData, + StringRef &Result) const { report_fatal_error("getLibraryPath not implemented in COFFObjectFile"); } @@ -998,24 +1011,25 @@ void ImportDirectoryEntryRef::moveNext() { ++Index; } -error_code ImportDirectoryEntryRef:: -getImportTableEntry(const import_directory_table_entry *&Result) const { +std::error_code ImportDirectoryEntryRef::getImportTableEntry( + const import_directory_table_entry *&Result) const { Result = ImportTable; return object_error::success; } -error_code ImportDirectoryEntryRef::getName(StringRef &Result) const { +std::error_code ImportDirectoryEntryRef::getName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (error_code EC = OwningObject->getRvaPtr(ImportTable->NameRVA, IntPtr)) + if (std::error_code EC = + OwningObject->getRvaPtr(ImportTable->NameRVA, IntPtr)) return EC; Result = StringRef(reinterpret_cast(IntPtr)); return object_error::success; } -error_code ImportDirectoryEntryRef::getImportLookupEntry( +std::error_code ImportDirectoryEntryRef::getImportLookupEntry( const import_lookup_table_entry32 *&Result) const { uintptr_t IntPtr = 0; - if (error_code EC = + if (std::error_code EC = OwningObject->getRvaPtr(ImportTable->ImportLookupTableRVA, IntPtr)) return EC; Result = reinterpret_cast(IntPtr); @@ -1033,31 +1047,33 @@ void ExportDirectoryEntryRef::moveNext() { // Returns the name of the current export symbol. If the symbol is exported only // by ordinal, the empty string is set as a result. -error_code ExportDirectoryEntryRef::getDllName(StringRef &Result) const { +std::error_code ExportDirectoryEntryRef::getDllName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (error_code EC = OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr)) + if (std::error_code EC = + OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr)) return EC; Result = StringRef(reinterpret_cast(IntPtr)); return object_error::success; } // Returns the starting ordinal number. -error_code ExportDirectoryEntryRef::getOrdinalBase(uint32_t &Result) const { +std::error_code +ExportDirectoryEntryRef::getOrdinalBase(uint32_t &Result) const { Result = ExportTable->OrdinalBase; return object_error::success; } // Returns the export ordinal of the current export symbol. -error_code ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const { +std::error_code ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const { Result = ExportTable->OrdinalBase + Index; return object_error::success; } // Returns the address of the current export symbol. -error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const { +std::error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const { uintptr_t IntPtr = 0; - if (error_code EC = OwningObject->getRvaPtr( - ExportTable->ExportAddressTableRVA, IntPtr)) + if (std::error_code EC = + OwningObject->getRvaPtr(ExportTable->ExportAddressTableRVA, IntPtr)) return EC; const export_address_table_entry *entry = reinterpret_cast(IntPtr); @@ -1067,10 +1083,11 @@ error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const { // Returns the name of the current export symbol. If the symbol is exported only // by ordinal, the empty string is set as a result. 
-error_code ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { +std::error_code +ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { uintptr_t IntPtr = 0; - if (error_code EC = OwningObject->getRvaPtr( - ExportTable->OrdinalTableRVA, IntPtr)) + if (std::error_code EC = + OwningObject->getRvaPtr(ExportTable->OrdinalTableRVA, IntPtr)) return EC; const ulittle16_t *Start = reinterpret_cast(IntPtr); @@ -1080,11 +1097,11 @@ error_code ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { I < E; ++I, ++Offset) { if (*I != Index) continue; - if (error_code EC = OwningObject->getRvaPtr( - ExportTable->NamePointerRVA, IntPtr)) + if (std::error_code EC = + OwningObject->getRvaPtr(ExportTable->NamePointerRVA, IntPtr)) return EC; const ulittle32_t *NamePtr = reinterpret_cast(IntPtr); - if (error_code EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr)) + if (std::error_code EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr)) return EC; Result = StringRef(reinterpret_cast(IntPtr)); return object_error::success; @@ -1095,7 +1112,7 @@ error_code ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { ErrorOr ObjectFile::createCOFFObjectFile(MemoryBuffer *Object, bool BufferOwned) { - error_code EC; + std::error_code EC; std::unique_ptr Ret( new COFFObjectFile(Object, EC, BufferOwned)); if (EC) diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp index a2c4df2007f7..0a3e2cb790d0 100644 --- a/lib/Object/ELFObjectFile.cpp +++ b/lib/Object/ELFObjectFile.cpp @@ -17,13 +17,13 @@ namespace llvm { using namespace object; -ErrorOr ObjectFile::createELFObjectFile(MemoryBuffer *Obj, - bool BufferOwned) { +static ErrorOr createELFObjectFileAux(MemoryBuffer *Obj, + bool BufferOwned) { std::pair Ident = getElfArchType(Obj); std::size_t MaxAlignment = 1ULL << countTrailingZeros(uintptr_t(Obj->getBufferStart())); - error_code EC; + std::error_code EC; std::unique_ptr R; if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) #if !LLVM_IS_UNALIGNED_ACCESS_FAST @@ -36,7 +36,7 @@ ErrorOr ObjectFile::createELFObjectFile(MemoryBuffer *Obj, R.reset(new ELFObjectFile >( Obj, EC, BufferOwned)); else - llvm_unreachable("Invalid alignment for ELF file!"); + return object_error::parse_failed; else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 4) @@ -48,7 +48,7 @@ ErrorOr ObjectFile::createELFObjectFile(MemoryBuffer *Obj, R.reset(new ELFObjectFile >(Obj, EC, BufferOwned)); else - llvm_unreachable("Invalid alignment for ELF file!"); + return object_error::parse_failed; else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 8) @@ -60,7 +60,7 @@ ErrorOr ObjectFile::createELFObjectFile(MemoryBuffer *Obj, R.reset(new ELFObjectFile >(Obj, EC, BufferOwned)); else - llvm_unreachable("Invalid alignment for ELF file!"); + return object_error::parse_failed; else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { #if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 8) @@ -72,14 +72,22 @@ ErrorOr ObjectFile::createELFObjectFile(MemoryBuffer *Obj, R.reset(new ELFObjectFile >( Obj, EC, BufferOwned)); else - llvm_unreachable("Invalid alignment for ELF file!"); + return object_error::parse_failed; } else - report_fatal_error("Buffer is not an ELF object file!"); + llvm_unreachable("Buffer is not an ELF object file!"); if (EC) return EC; return R.release(); } +ErrorOr 
ObjectFile::createELFObjectFile(MemoryBuffer *Obj, + bool BufferOwned) { + ErrorOr Ret = createELFObjectFileAux(Obj, BufferOwned); + if (BufferOwned && Ret.getError()) + delete Obj; + return Ret; +} + } // end namespace llvm diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp index 8329853340cd..dc3d46782503 100644 --- a/lib/Object/ELFYAML.cpp +++ b/lib/Object/ELFYAML.cpp @@ -246,16 +246,17 @@ void ScalarBitSetTraits::bitset(IO &IO, const auto *Object = static_cast(IO.getContext()); assert(Object && "The IO context is not initialized"); #define BCase(X) IO.bitSetCase(Value, #X, ELF::X); +#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M); switch (Object->Header.Machine) { case ELF::EM_ARM: BCase(EF_ARM_SOFT_FLOAT) BCase(EF_ARM_VFP_FLOAT) - BCase(EF_ARM_EABI_UNKNOWN) - BCase(EF_ARM_EABI_VER1) - BCase(EF_ARM_EABI_VER2) - BCase(EF_ARM_EABI_VER3) - BCase(EF_ARM_EABI_VER4) - BCase(EF_ARM_EABI_VER5) + BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK) break; case ELF::EM_MIPS: BCase(EF_MIPS_NOREORDER) @@ -266,17 +267,17 @@ void ScalarBitSetTraits::bitset(IO &IO, BCase(EF_MIPS_ABI_O32) BCase(EF_MIPS_MICROMIPS) BCase(EF_MIPS_ARCH_ASE_M16) - BCase(EF_MIPS_ARCH_1) - BCase(EF_MIPS_ARCH_2) - BCase(EF_MIPS_ARCH_3) - BCase(EF_MIPS_ARCH_4) - BCase(EF_MIPS_ARCH_5) - BCase(EF_MIPS_ARCH_32) - BCase(EF_MIPS_ARCH_64) - BCase(EF_MIPS_ARCH_32R2) - BCase(EF_MIPS_ARCH_64R2) - BCase(EF_MIPS_ARCH_32R6) - BCase(EF_MIPS_ARCH_64R6) + BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH) break; case ELF::EM_HEXAGON: BCase(EF_HEXAGON_MACH_V2) @@ -292,6 +293,7 @@ void ScalarBitSetTraits::bitset(IO &IO, llvm_unreachable("Unsupported architecture"); } #undef BCase +#undef BCaseMask } void ScalarEnumerationTraits::enumeration( @@ -366,6 +368,16 @@ void ScalarEnumerationTraits::enumeration( #undef ECase } +void ScalarEnumerationTraits::enumeration( + IO &IO, ELFYAML::ELF_STV &Value) { +#define ECase(X) IO.enumCase(Value, #X, ELF::X); + ECase(STV_DEFAULT) + ECase(STV_INTERNAL) + ECase(STV_HIDDEN) + ECase(STV_PROTECTED) +#undef ECase +} + void ScalarEnumerationTraits::enumeration( IO &IO, ELFYAML::ELF_REL &Value) { const auto *Object = static_cast(IO.getContext()); @@ -647,6 +659,7 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Symbol &Symbol) { IO.mapOptional("Section", Symbol.Section, StringRef()); IO.mapOptional("Value", Symbol.Value, Hex64(0)); IO.mapOptional("Size", Symbol.Size, Hex64(0)); + IO.mapOptional("Visibility", Symbol.Visibility, ELFYAML::ELF_STV(0)); } void MappingTraits::mapping( @@ -662,7 +675,6 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) { IO.mapOptional("Flags", Section.Flags, ELFYAML::ELF_SHF(0)); IO.mapOptional("Address", Section.Address, Hex64(0)); IO.mapOptional("Link", Section.Link, StringRef()); - IO.mapOptional("Info", Section.Info, StringRef()); IO.mapOptional("AddressAlign", Section.AddressAlign, 
Hex64(0)); } @@ -674,6 +686,7 @@ static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) { static void sectionMapping(IO &IO, ELFYAML::RelocationSection &Section) { commonSectionMapping(IO, Section); + IO.mapOptional("Info", Section.Info, StringRef()); IO.mapOptional("Relocations", Section.Relocations); } diff --git a/lib/Object/Error.cpp b/lib/Object/Error.cpp index 8e508696cfb8..9d252697ae52 100644 --- a/lib/Object/Error.cpp +++ b/lib/Object/Error.cpp @@ -18,11 +18,10 @@ using namespace llvm; using namespace object; namespace { -class _object_error_category : public error_category { +class _object_error_category : public std::error_category { public: - const char* name() const override; + const char* name() const LLVM_NOEXCEPT override; std::string message(int ev) const override; - error_condition default_error_condition(int ev) const override; }; } @@ -30,8 +29,8 @@ const char *_object_error_category::name() const { return "llvm.object"; } -std::string _object_error_category::message(int ev) const { - object_error::Impl E = static_cast(ev); +std::string _object_error_category::message(int EV) const { + object_error E = static_cast(EV); switch (E) { case object_error::success: return "Success"; case object_error::arch_not_found: @@ -47,13 +46,7 @@ std::string _object_error_category::message(int ev) const { "defined."); } -error_condition _object_error_category::default_error_condition(int ev) const { - if (ev == object_error::success) - return errc::success; - return errc::invalid_argument; -} - -const error_category &object::object_category() { +const std::error_category &object::object_category() { static _object_error_category o; return o; } diff --git a/lib/Object/IRObjectFile.cpp b/lib/Object/IRObjectFile.cpp index a8aba26c5aef..004d8de065d8 100644 --- a/lib/Object/IRObjectFile.cpp +++ b/lib/Object/IRObjectFile.cpp @@ -20,10 +20,11 @@ using namespace llvm; using namespace object; -IRObjectFile::IRObjectFile(MemoryBuffer *Object, error_code &EC, +IRObjectFile::IRObjectFile(MemoryBuffer *Object, std::error_code &EC, LLVMContext &Context, bool BufferOwned) : SymbolicFile(Binary::ID_IR, Object, BufferOwned) { - ErrorOr MOrErr = parseBitcodeFile(Object, Context); + ErrorOr MOrErr = + getLazyBitcodeModule(Object, Context, /*BufferOwned*/ false); if ((EC = MOrErr.getError())) return; @@ -92,8 +93,8 @@ void IRObjectFile::moveSymbolNext(DataRefImpl &Symb) const { Symb.p = Res; } -error_code IRObjectFile::printSymbolName(raw_ostream &OS, - DataRefImpl Symb) const { +std::error_code IRObjectFile::printSymbolName(raw_ostream &OS, + DataRefImpl Symb) const { const GlobalValue &GV = getGV(Symb); if (Mang) @@ -104,11 +105,21 @@ error_code IRObjectFile::printSymbolName(raw_ostream &OS, return object_error::success; } +static bool isDeclaration(const GlobalValue &V) { + if (V.hasAvailableExternallyLinkage()) + return true; + + if (V.isMaterializable()) + return false; + + return V.isDeclaration(); +} + uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const { const GlobalValue &GV = getGV(Symb); uint32_t Res = BasicSymbolRef::SF_None; - if (GV.isDeclaration() || GV.hasAvailableExternallyLinkage()) + if (isDeclaration(GV)) Res |= BasicSymbolRef::SF_Undefined; if (GV.hasPrivateLinkage()) Res |= BasicSymbolRef::SF_FormatSpecific; @@ -142,7 +153,7 @@ basic_symbol_iterator IRObjectFile::symbol_end_impl() const { ErrorOr llvm::object::SymbolicFile::createIRObjectFile( MemoryBuffer *Object, LLVMContext &Context, bool BufferOwned) { - error_code EC; + std::error_code EC; 
std::unique_ptr Ret( new IRObjectFile(Object, EC, Context, BufferOwned)); if (EC) diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 0951460ccbb2..1e105d3973b7 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -28,6 +28,7 @@ using namespace llvm; using namespace object; namespace llvm { + namespace object { struct nlist_base { @@ -42,191 +43,196 @@ struct section_base { char segname[16]; }; -template -static void SwapValue(T &Value) { - Value = sys::SwapByteOrder(Value); -} - template static void SwapStruct(T &Value); template<> void SwapStruct(MachO::any_relocation_info &H) { - SwapValue(H.r_word0); - SwapValue(H.r_word1); + sys::swapByteOrder(H.r_word0); + sys::swapByteOrder(H.r_word1); } template<> void SwapStruct(MachO::load_command &L) { - SwapValue(L.cmd); - SwapValue(L.cmdsize); + sys::swapByteOrder(L.cmd); + sys::swapByteOrder(L.cmdsize); } template<> void SwapStruct(nlist_base &S) { - SwapValue(S.n_strx); - SwapValue(S.n_desc); + sys::swapByteOrder(S.n_strx); + sys::swapByteOrder(S.n_desc); } template<> void SwapStruct(MachO::section &S) { - SwapValue(S.addr); - SwapValue(S.size); - SwapValue(S.offset); - SwapValue(S.align); - SwapValue(S.reloff); - SwapValue(S.nreloc); - SwapValue(S.flags); - SwapValue(S.reserved1); - SwapValue(S.reserved2); + sys::swapByteOrder(S.addr); + sys::swapByteOrder(S.size); + sys::swapByteOrder(S.offset); + sys::swapByteOrder(S.align); + sys::swapByteOrder(S.reloff); + sys::swapByteOrder(S.nreloc); + sys::swapByteOrder(S.flags); + sys::swapByteOrder(S.reserved1); + sys::swapByteOrder(S.reserved2); } template<> void SwapStruct(MachO::section_64 &S) { - SwapValue(S.addr); - SwapValue(S.size); - SwapValue(S.offset); - SwapValue(S.align); - SwapValue(S.reloff); - SwapValue(S.nreloc); - SwapValue(S.flags); - SwapValue(S.reserved1); - SwapValue(S.reserved2); - SwapValue(S.reserved3); + sys::swapByteOrder(S.addr); + sys::swapByteOrder(S.size); + sys::swapByteOrder(S.offset); + sys::swapByteOrder(S.align); + sys::swapByteOrder(S.reloff); + sys::swapByteOrder(S.nreloc); + sys::swapByteOrder(S.flags); + sys::swapByteOrder(S.reserved1); + sys::swapByteOrder(S.reserved2); + sys::swapByteOrder(S.reserved3); } template<> void SwapStruct(MachO::nlist &S) { - SwapValue(S.n_strx); - SwapValue(S.n_desc); - SwapValue(S.n_value); + sys::swapByteOrder(S.n_strx); + sys::swapByteOrder(S.n_desc); + sys::swapByteOrder(S.n_value); } template<> void SwapStruct(MachO::nlist_64 &S) { - SwapValue(S.n_strx); - SwapValue(S.n_desc); - SwapValue(S.n_value); + sys::swapByteOrder(S.n_strx); + sys::swapByteOrder(S.n_desc); + sys::swapByteOrder(S.n_value); } template<> void SwapStruct(MachO::mach_header &H) { - SwapValue(H.magic); - SwapValue(H.cputype); - SwapValue(H.cpusubtype); - SwapValue(H.filetype); - SwapValue(H.ncmds); - SwapValue(H.sizeofcmds); - SwapValue(H.flags); + sys::swapByteOrder(H.magic); + sys::swapByteOrder(H.cputype); + sys::swapByteOrder(H.cpusubtype); + sys::swapByteOrder(H.filetype); + sys::swapByteOrder(H.ncmds); + sys::swapByteOrder(H.sizeofcmds); + sys::swapByteOrder(H.flags); } template<> void SwapStruct(MachO::mach_header_64 &H) { - SwapValue(H.magic); - SwapValue(H.cputype); - SwapValue(H.cpusubtype); - SwapValue(H.filetype); - SwapValue(H.ncmds); - SwapValue(H.sizeofcmds); - SwapValue(H.flags); - SwapValue(H.reserved); + sys::swapByteOrder(H.magic); + sys::swapByteOrder(H.cputype); + sys::swapByteOrder(H.cpusubtype); + sys::swapByteOrder(H.filetype); + sys::swapByteOrder(H.ncmds); + 
sys::swapByteOrder(H.sizeofcmds); + sys::swapByteOrder(H.flags); + sys::swapByteOrder(H.reserved); } template<> void SwapStruct(MachO::symtab_command &C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.symoff); - SwapValue(C.nsyms); - SwapValue(C.stroff); - SwapValue(C.strsize); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.symoff); + sys::swapByteOrder(C.nsyms); + sys::swapByteOrder(C.stroff); + sys::swapByteOrder(C.strsize); } template<> void SwapStruct(MachO::dysymtab_command &C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.ilocalsym); - SwapValue(C.nlocalsym); - SwapValue(C.iextdefsym); - SwapValue(C.nextdefsym); - SwapValue(C.iundefsym); - SwapValue(C.nundefsym); - SwapValue(C.tocoff); - SwapValue(C.ntoc); - SwapValue(C.modtaboff); - SwapValue(C.nmodtab); - SwapValue(C.extrefsymoff); - SwapValue(C.nextrefsyms); - SwapValue(C.indirectsymoff); - SwapValue(C.nindirectsyms); - SwapValue(C.extreloff); - SwapValue(C.nextrel); - SwapValue(C.locreloff); - SwapValue(C.nlocrel); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.ilocalsym); + sys::swapByteOrder(C.nlocalsym); + sys::swapByteOrder(C.iextdefsym); + sys::swapByteOrder(C.nextdefsym); + sys::swapByteOrder(C.iundefsym); + sys::swapByteOrder(C.nundefsym); + sys::swapByteOrder(C.tocoff); + sys::swapByteOrder(C.ntoc); + sys::swapByteOrder(C.modtaboff); + sys::swapByteOrder(C.nmodtab); + sys::swapByteOrder(C.extrefsymoff); + sys::swapByteOrder(C.nextrefsyms); + sys::swapByteOrder(C.indirectsymoff); + sys::swapByteOrder(C.nindirectsyms); + sys::swapByteOrder(C.extreloff); + sys::swapByteOrder(C.nextrel); + sys::swapByteOrder(C.locreloff); + sys::swapByteOrder(C.nlocrel); } template<> void SwapStruct(MachO::linkedit_data_command &C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.dataoff); - SwapValue(C.datasize); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.dataoff); + sys::swapByteOrder(C.datasize); } template<> void SwapStruct(MachO::segment_command &C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.vmaddr); - SwapValue(C.vmsize); - SwapValue(C.fileoff); - SwapValue(C.filesize); - SwapValue(C.maxprot); - SwapValue(C.initprot); - SwapValue(C.nsects); - SwapValue(C.flags); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.vmaddr); + sys::swapByteOrder(C.vmsize); + sys::swapByteOrder(C.fileoff); + sys::swapByteOrder(C.filesize); + sys::swapByteOrder(C.maxprot); + sys::swapByteOrder(C.initprot); + sys::swapByteOrder(C.nsects); + sys::swapByteOrder(C.flags); } template<> void SwapStruct(MachO::segment_command_64 &C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.vmaddr); - SwapValue(C.vmsize); - SwapValue(C.fileoff); - SwapValue(C.filesize); - SwapValue(C.maxprot); - SwapValue(C.initprot); - SwapValue(C.nsects); - SwapValue(C.flags); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.vmaddr); + sys::swapByteOrder(C.vmsize); + sys::swapByteOrder(C.fileoff); + sys::swapByteOrder(C.filesize); + sys::swapByteOrder(C.maxprot); + sys::swapByteOrder(C.initprot); + sys::swapByteOrder(C.nsects); + sys::swapByteOrder(C.flags); } template<> void SwapStruct(uint32_t &C) { - SwapValue(C); + sys::swapByteOrder(C); } template<> void SwapStruct(MachO::linker_options_command &C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.count); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + 
sys::swapByteOrder(C.count); } template<> void SwapStruct(MachO::version_min_command&C) { - SwapValue(C.cmd); - SwapValue(C.cmdsize); - SwapValue(C.version); - SwapValue(C.reserved); + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.version); + sys::swapByteOrder(C.reserved); +} + +template<> +void SwapStruct(MachO::dylib_command&C) { + sys::swapByteOrder(C.cmd); + sys::swapByteOrder(C.cmdsize); + sys::swapByteOrder(C.dylib.name); + sys::swapByteOrder(C.dylib.timestamp); + sys::swapByteOrder(C.dylib.current_version); + sys::swapByteOrder(C.dylib.compatibility_version); } template<> void SwapStruct(MachO::data_in_code_entry &C) { - SwapValue(C.offset); - SwapValue(C.length); - SwapValue(C.kind); + sys::swapByteOrder(C.offset); + sys::swapByteOrder(C.length); + sys::swapByteOrder(C.kind); } template @@ -306,7 +312,7 @@ static void printRelocationTargetName(const MachOObjectFile *O, uint32_t Val = O->getPlainRelocationSymbolNum(RE); for (const SymbolRef &Symbol : O->symbols()) { - error_code ec; + std::error_code ec; uint64_t Addr; StringRef Name; @@ -323,7 +329,7 @@ static void printRelocationTargetName(const MachOObjectFile *O, // If we couldn't find a symbol that this relocation refers to, try // to find a section beginning instead. for (const SectionRef &Section : O->sections()) { - error_code ec; + std::error_code ec; uint64_t Addr; StringRef Name; @@ -417,7 +423,7 @@ static uint32_t getSectionFlags(const MachOObjectFile *O, } MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, - bool Is64bits, error_code &EC, + bool Is64bits, std::error_code &EC, bool BufferOwned) : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object, BufferOwned), SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr), @@ -443,6 +449,12 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, const char *Sec = getSectionPtr(this, Load, J); Sections.push_back(Sec); } + } else if (Load.C.cmd == MachO::LC_LOAD_DYLIB || + Load.C.cmd == MachO::LC_LOAD_WEAK_DYLIB || + Load.C.cmd == MachO::LC_LAZY_LOAD_DYLIB || + Load.C.cmd == MachO::LC_REEXPORT_DYLIB || + Load.C.cmd == MachO::LC_LOAD_UPWARD_DYLIB) { + Libraries.push_back(Load.Ptr); } if (I == LoadCommandCount - 1) @@ -459,8 +471,8 @@ void MachOObjectFile::moveSymbolNext(DataRefImpl &Symb) const { Symb.p += SymbolTableEntrySize; } -error_code MachOObjectFile::getSymbolName(DataRefImpl Symb, - StringRef &Res) const { +std::error_code MachOObjectFile::getSymbolName(DataRefImpl Symb, + StringRef &Res) const { StringRef StringTable = getStringTableData(); nlist_base Entry = getSymbolTableEntryBase(this, Symb); const char *Start = &StringTable.data()[Entry.n_strx]; @@ -468,20 +480,52 @@ error_code MachOObjectFile::getSymbolName(DataRefImpl Symb, return object_error::success; } -error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb, - uint64_t &Res) const { +// getIndirectName() returns the name of the alias'ed symbol who's string table +// index is in the n_value field. 
+std::error_code MachOObjectFile::getIndirectName(DataRefImpl Symb, + StringRef &Res) const { + StringRef StringTable = getStringTableData(); + uint64_t NValue; + if (is64Bit()) { + MachO::nlist_64 Entry = getSymbol64TableEntry(Symb); + NValue = Entry.n_value; + if ((Entry.n_type & MachO::N_TYPE) != MachO::N_INDR) + return object_error::parse_failed; + } else { + MachO::nlist Entry = getSymbolTableEntry(Symb); + NValue = Entry.n_value; + if ((Entry.n_type & MachO::N_TYPE) != MachO::N_INDR) + return object_error::parse_failed; + } + if (NValue >= StringTable.size()) + return object_error::parse_failed; + const char *Start = &StringTable.data()[NValue]; + Res = StringRef(Start); + return object_error::success; +} + +std::error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb, + uint64_t &Res) const { if (is64Bit()) { MachO::nlist_64 Entry = getSymbol64TableEntry(Symb); - Res = Entry.n_value; + if ((Entry.n_type & MachO::N_TYPE) == MachO::N_UNDF && + Entry.n_value == 0) + Res = UnknownAddressOrSize; + else + Res = Entry.n_value; } else { MachO::nlist Entry = getSymbolTableEntry(Symb); - Res = Entry.n_value; + if ((Entry.n_type & MachO::N_TYPE) == MachO::N_UNDF && + Entry.n_value == 0) + Res = UnknownAddressOrSize; + else + Res = Entry.n_value; } return object_error::success; } -error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI, - uint32_t &Result) const { +std::error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI, + uint32_t &Result) const { uint32_t flags = getSymbolFlags(DRI); if (flags & SymbolRef::SF_Common) { nlist_base Entry = getSymbolTableEntryBase(this, DRI); @@ -492,8 +536,8 @@ error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI, return object_error::success; } -error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, - uint64_t &Result) const { +std::error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, + uint64_t &Result) const { uint64_t BeginOffset; uint64_t EndOffset = 0; uint8_t SectionIndex; @@ -501,6 +545,10 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, nlist_base Entry = getSymbolTableEntryBase(this, DRI); uint64_t Value; getSymbolAddress(DRI, Value); + if (Value == UnknownAddressOrSize) { + Result = UnknownAddressOrSize; + return object_error::success; + } BeginOffset = Value; @@ -519,6 +567,8 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, DataRefImpl DRI = Symbol.getRawDataRefImpl(); Entry = getSymbolTableEntryBase(this, DRI); getSymbolAddress(DRI, Value); + if (Value == UnknownAddressOrSize) + continue; if (Entry.n_sect == SectionIndex && Value > BeginOffset) if (!EndOffset || Value < EndOffset) EndOffset = Value; @@ -535,8 +585,8 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, return object_error::success; } -error_code MachOObjectFile::getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Res) const { +std::error_code MachOObjectFile::getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Res) const { nlist_base Entry = getSymbolTableEntryBase(this, Symb); uint8_t n_type = Entry.n_type; @@ -570,6 +620,9 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) Result |= SymbolRef::SF_Undefined; + if ((MachOType & MachO::N_TYPE) == MachO::N_INDR) + Result |= SymbolRef::SF_Indirect; + if (MachOType & MachO::N_STAB) Result |= SymbolRef::SF_FormatSpecific; @@ -578,7 +631,7 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) { uint64_t Value; getSymbolAddress(DRI, 
Value); - if (Value) + if (Value && Value != UnknownAddressOrSize) Result |= SymbolRef::SF_Common; } } @@ -592,9 +645,8 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { return Result; } -error_code -MachOObjectFile::getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const { +std::error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const { nlist_base Entry = getSymbolTableEntryBase(this, Symb); uint8_t index = Entry.n_sect; @@ -613,15 +665,15 @@ void MachOObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; } -error_code -MachOObjectFile::getSectionName(DataRefImpl Sec, StringRef &Result) const { +std::error_code MachOObjectFile::getSectionName(DataRefImpl Sec, + StringRef &Result) const { ArrayRef Raw = getSectionRawName(Sec); Result = parseSegmentOrSectionName(Raw.data()); return object_error::success; } -error_code -MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const { +std::error_code MachOObjectFile::getSectionAddress(DataRefImpl Sec, + uint64_t &Res) const { if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Res = Sect.addr; @@ -632,8 +684,8 @@ MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const { return object_error::success; } -error_code -MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const { +std::error_code MachOObjectFile::getSectionSize(DataRefImpl Sec, + uint64_t &Res) const { if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Res = Sect.size; @@ -645,8 +697,8 @@ MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const { return object_error::success; } -error_code -MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const { +std::error_code MachOObjectFile::getSectionContents(DataRefImpl Sec, + StringRef &Res) const { uint32_t Offset; uint64_t Size; @@ -664,8 +716,8 @@ MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const { return object_error::success; } -error_code -MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const { +std::error_code MachOObjectFile::getSectionAlignment(DataRefImpl Sec, + uint64_t &Res) const { uint32_t Align; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); @@ -679,14 +731,15 @@ MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const { return object_error::success; } -error_code -MachOObjectFile::isSectionText(DataRefImpl Sec, bool &Res) const { +std::error_code MachOObjectFile::isSectionText(DataRefImpl Sec, + bool &Res) const { uint32_t Flags = getSectionFlags(this, Sec); Res = Flags & MachO::S_ATTR_PURE_INSTRUCTIONS; return object_error::success; } -error_code MachOObjectFile::isSectionData(DataRefImpl Sec, bool &Result) const { +std::error_code MachOObjectFile::isSectionData(DataRefImpl Sec, + bool &Result) const { uint32_t Flags = getSectionFlags(this, Sec); unsigned SectionType = Flags & MachO::SECTION_TYPE; Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) && @@ -695,7 +748,8 @@ error_code MachOObjectFile::isSectionData(DataRefImpl Sec, bool &Result) const { return object_error::success; } -error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec, bool &Result) const { +std::error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec, + bool &Result) const { uint32_t Flags = getSectionFlags(this, Sec); unsigned SectionType = Flags & MachO::SECTION_TYPE; Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) && @@ -704,7 +758,7 @@ error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec, bool 
&Result) const { return object_error::success; } -error_code +std::error_code MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec, bool &Result) const { // FIXME: Unimplemented. @@ -712,15 +766,15 @@ MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec, return object_error::success; } -error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec, - bool &Result) const { +std::error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec, + bool &Result) const { // FIXME: Unimplemented. Result = false; return object_error::success; } -error_code -MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const { +std::error_code MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, + bool &Res) const { uint32_t Flags = getSectionFlags(this, Sec); unsigned SectionType = Flags & MachO::SECTION_TYPE; Res = SectionType == MachO::S_ZEROFILL || @@ -728,8 +782,8 @@ MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const { return object_error::success; } -error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec, - bool &Result) const { +std::error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec, + bool &Result) const { // Consider using the code from isSectionText to look for __const sections. // Alternately, emit S_ATTR_PURE_INSTRUCTIONS and/or S_ATTR_SOME_INSTRUCTIONS // to use section attributes to distinguish code from data. @@ -739,9 +793,9 @@ error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec, return object_error::success; } -error_code -MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, - bool &Result) const { +std::error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, + DataRefImpl Symb, + bool &Result) const { SymbolRef::Type ST; this->getSymbolType(Symb, ST); if (ST == SymbolRef::ST_Unknown) { @@ -789,8 +843,8 @@ void MachOObjectFile::moveRelocationNext(DataRefImpl &Rel) const { ++Rel.d.b; } -error_code -MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const { +std::error_code MachOObjectFile::getRelocationAddress(DataRefImpl Rel, + uint64_t &Res) const { uint64_t Offset; getRelocationOffset(Rel, Offset); @@ -802,8 +856,8 @@ MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const { return object_error::success; } -error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel, - uint64_t &Res) const { +std::error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel, + uint64_t &Res) const { assert(getHeader().filetype == MachO::MH_OBJECT && "Only implemented for MH_OBJECT"); MachO::any_relocation_info RE = getRelocation(Rel); @@ -829,14 +883,14 @@ MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const { return symbol_iterator(SymbolRef(Sym, this)); } -error_code MachOObjectFile::getRelocationType(DataRefImpl Rel, - uint64_t &Res) const { +std::error_code MachOObjectFile::getRelocationType(DataRefImpl Rel, + uint64_t &Res) const { MachO::any_relocation_info RE = getRelocation(Rel); Res = getAnyRelocationType(RE); return object_error::success; } -error_code +std::error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, SmallVectorImpl &Result) const { StringRef res; @@ -949,7 +1003,7 @@ MachOObjectFile::getRelocationTypeName(DataRefImpl Rel, return object_error::success; } -error_code +std::error_code MachOObjectFile::getRelocationValueString(DataRefImpl Rel, SmallVectorImpl &Result) const { MachO::any_relocation_info RE = getRelocation(Rel); @@ -1125,8 +1179,8 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel, 
return object_error::success; } -error_code -MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const { +std::error_code MachOObjectFile::getRelocationHidden(DataRefImpl Rel, + bool &Result) const { unsigned Arch = getArch(); uint64_t Type; getRelocationType(Rel, Type); @@ -1153,16 +1207,199 @@ MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const { return object_error::success; } -error_code MachOObjectFile::getLibraryNext(DataRefImpl LibData, - LibraryRef &Res) const { +std::error_code MachOObjectFile::getLibraryNext(DataRefImpl LibData, + LibraryRef &Res) const { report_fatal_error("Needed libraries unimplemented in MachOObjectFile"); } -error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData, - StringRef &Res) const { +std::error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData, + StringRef &Res) const { report_fatal_error("Needed libraries unimplemented in MachOObjectFile"); } +// +// guessLibraryShortName() is passed a name of a dynamic library and returns a +// guess on what the short name is. Then name is returned as a substring of the +// StringRef Name passed in. The name of the dynamic library is recognized as +// a framework if it has one of the two following forms: +// Foo.framework/Versions/A/Foo +// Foo.framework/Foo +// Where A and Foo can be any string. And may contain a trailing suffix +// starting with an underbar. If the Name is recognized as a framework then +// isFramework is set to true else it is set to false. If the Name has a +// suffix then Suffix is set to the substring in Name that contains the suffix +// else it is set to a NULL StringRef. +// +// The Name of the dynamic library is recognized as a library name if it has +// one of the two following forms: +// libFoo.A.dylib +// libFoo.dylib +// The library may have a suffix trailing the name Foo of the form: +// libFoo_profile.A.dylib +// libFoo_profile.dylib +// +// The Name of the dynamic library is also recognized as a library name if it +// has the following form: +// Foo.qtx +// +// If the Name of the dynamic library is none of the forms above then a NULL +// StringRef is returned. 
+// +StringRef MachOObjectFile::guessLibraryShortName(StringRef Name, + bool &isFramework, + StringRef &Suffix) { + StringRef Foo, F, DotFramework, V, Dylib, Lib, Dot, Qtx; + size_t a, b, c, d, Idx; + + isFramework = false; + Suffix = StringRef(); + + // Pull off the last component and make Foo point to it + a = Name.rfind('/'); + if (a == Name.npos || a == 0) + goto guess_library; + Foo = Name.slice(a+1, Name.npos); + + // Look for a suffix starting with a '_' + Idx = Foo.rfind('_'); + if (Idx != Foo.npos && Foo.size() >= 2) { + Suffix = Foo.slice(Idx, Foo.npos); + Foo = Foo.slice(0, Idx); + } + + // First look for the form Foo.framework/Foo + b = Name.rfind('/', a); + if (b == Name.npos) + Idx = 0; + else + Idx = b+1; + F = Name.slice(Idx, Idx + Foo.size()); + DotFramework = Name.slice(Idx + Foo.size(), + Idx + Foo.size() + sizeof(".framework/")-1); + if (F == Foo && DotFramework == ".framework/") { + isFramework = true; + return Foo; + } + + // Next look for the form Foo.framework/Versions/A/Foo + if (b == Name.npos) + goto guess_library; + c = Name.rfind('/', b); + if (c == Name.npos || c == 0) + goto guess_library; + V = Name.slice(c+1, Name.npos); + if (!V.startswith("Versions/")) + goto guess_library; + d = Name.rfind('/', c); + if (d == Name.npos) + Idx = 0; + else + Idx = d+1; + F = Name.slice(Idx, Idx + Foo.size()); + DotFramework = Name.slice(Idx + Foo.size(), + Idx + Foo.size() + sizeof(".framework/")-1); + if (F == Foo && DotFramework == ".framework/") { + isFramework = true; + return Foo; + } + +guess_library: + // pull off the suffix after the "." and make a point to it + a = Name.rfind('.'); + if (a == Name.npos || a == 0) + return StringRef(); + Dylib = Name.slice(a, Name.npos); + if (Dylib != ".dylib") + goto guess_qtx; + + // First pull off the version letter for the form Foo.A.dylib if any. + if (a >= 3) { + Dot = Name.slice(a-2, a-1); + if (Dot == ".") + a = a - 2; + } + + b = Name.rfind('/', a); + if (b == Name.npos) + b = 0; + else + b = b+1; + // ignore any suffix after an underbar like Foo_profile.A.dylib + Idx = Name.find('_', b); + if (Idx != Name.npos && Idx != b) { + Lib = Name.slice(b, Idx); + Suffix = Name.slice(Idx, a); + } + else + Lib = Name.slice(b, a); + // There are incorrect library names of the form: + // libATS.A_profile.dylib so check for these. + if (Lib.size() >= 3) { + Dot = Lib.slice(Lib.size()-2, Lib.size()-1); + if (Dot == ".") + Lib = Lib.slice(0, Lib.size()-2); + } + return Lib; + +guess_qtx: + Qtx = Name.slice(a, Name.npos); + if (Qtx != ".qtx") + return StringRef(); + b = Name.rfind('/', a); + if (b == Name.npos) + Lib = Name.slice(0, a); + else + Lib = Name.slice(b+1, a); + // There are library names of the form: QT.A.qtx so check for these. + if (Lib.size() >= 3) { + Dot = Lib.slice(Lib.size()-2, Lib.size()-1); + if (Dot == ".") + Lib = Lib.slice(0, Lib.size()-2); + } + return Lib; +} + +// getLibraryShortNameByIndex() is used to get the short name of the library +// for an undefined symbol in a linked Mach-O binary that was linked with the +// normal two-level namespace default (that is MH_TWOLEVEL in the header). +// It is passed the index (0 - based) of the library as translated from +// GET_LIBRARY_ORDINAL (1 - based). 
+std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index, + StringRef &Res) { + if (Index >= Libraries.size()) + return object_error::parse_failed; + + MachO::dylib_command D = + getStruct(this, Libraries[Index]); + if (D.dylib.name >= D.cmdsize) + return object_error::parse_failed; + + // If the cache of LibrariesShortNames is not built up do that first for + // all the Libraries. + if (LibrariesShortNames.size() == 0) { + for (unsigned i = 0; i < Libraries.size(); i++) { + MachO::dylib_command D = + getStruct(this, Libraries[i]); + if (D.dylib.name >= D.cmdsize) { + LibrariesShortNames.push_back(StringRef()); + continue; + } + char *P = (char *)(Libraries[i]) + D.dylib.name; + StringRef Name = StringRef(P); + StringRef Suffix; + bool isFramework; + StringRef shortName = guessLibraryShortName(Name, isFramework, Suffix); + if (shortName == StringRef()) + LibrariesShortNames.push_back(Name); + else + LibrariesShortNames.push_back(shortName); + } + } + + Res = LibrariesShortNames[Index]; + return object_error::success; +} + basic_symbol_iterator MachOObjectFile::symbol_begin_impl() const { return getSymbolByIndex(0); } @@ -1578,7 +1815,7 @@ void MachOObjectFile::ReadULEB128s(uint64_t Index, ErrorOr ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer, bool BufferOwned) { StringRef Magic = Buffer->getBuffer().slice(0, 4); - error_code EC; + std::error_code EC; std::unique_ptr Ret; if (Magic == "\xFE\xED\xFA\xCE") Ret.reset(new MachOObjectFile(Buffer, false, false, EC, BufferOwned)); diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp index 5085efde5ecf..e414de8bcf17 100644 --- a/lib/Object/MachOUniversal.cpp +++ b/lib/Object/MachOUniversal.cpp @@ -22,27 +22,22 @@ using namespace llvm; using namespace object; -template -static void SwapValue(T &Value) { - Value = sys::SwapByteOrder(Value); -} - template static void SwapStruct(T &Value); template<> void SwapStruct(MachO::fat_header &H) { - SwapValue(H.magic); - SwapValue(H.nfat_arch); + sys::swapByteOrder(H.magic); + sys::swapByteOrder(H.nfat_arch); } template<> void SwapStruct(MachO::fat_arch &H) { - SwapValue(H.cputype); - SwapValue(H.cpusubtype); - SwapValue(H.offset); - SwapValue(H.size); - SwapValue(H.align); + sys::swapByteOrder(H.cputype); + sys::swapByteOrder(H.cpusubtype); + sys::swapByteOrder(H.offset); + sys::swapByteOrder(H.size); + sys::swapByteOrder(H.align); } template @@ -72,7 +67,7 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch( } } -error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile( +std::error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile( std::unique_ptr &Result) const { if (Parent) { StringRef ParentData = Parent->getData(); @@ -83,7 +78,7 @@ error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile( MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer( ObjectData, ObjectName, false); ErrorOr Obj = ObjectFile::createMachOObjectFile(ObjBuffer); - if (error_code EC = Obj.getError()) + if (std::error_code EC = Obj.getError()) return EC; Result.reset(Obj.get()); return object_error::success; @@ -91,7 +86,7 @@ error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile( return object_error::parse_failed; } -error_code MachOUniversalBinary::ObjectForArch::getAsArchive( +std::error_code MachOUniversalBinary::ObjectForArch::getAsArchive( std::unique_ptr &Result) const { if (Parent) { StringRef ParentData = Parent->getData(); @@ -102,7 +97,7 @@ error_code MachOUniversalBinary::ObjectForArch::getAsArchive( MemoryBuffer *ObjBuffer = 
MemoryBuffer::getMemBuffer( ObjectData, ObjectName, false); ErrorOr Obj = Archive::create(ObjBuffer); - if (error_code EC = Obj.getError()) + if (std::error_code EC = Obj.getError()) return EC; Result.reset(Obj.get()); return object_error::success; @@ -114,7 +109,7 @@ void MachOUniversalBinary::anchor() { } ErrorOr MachOUniversalBinary::create(MemoryBuffer *Source) { - error_code EC; + std::error_code EC; std::unique_ptr Ret( new MachOUniversalBinary(Source, EC)); if (EC) @@ -123,9 +118,8 @@ MachOUniversalBinary::create(MemoryBuffer *Source) { } MachOUniversalBinary::MachOUniversalBinary(MemoryBuffer *Source, - error_code &ec) - : Binary(Binary::ID_MachOUniversalBinary, Source), - NumberOfObjects(0) { + std::error_code &ec) + : Binary(Binary::ID_MachOUniversalBinary, Source), NumberOfObjects(0) { if (Source->getBufferSize() < sizeof(MachO::fat_header)) { ec = object_error::invalid_file_type; return; @@ -155,7 +149,7 @@ static bool getCTMForArch(Triple::ArchType Arch, MachO::CPUType &CTM) { } } -error_code MachOUniversalBinary::getObjectForArch( +std::error_code MachOUniversalBinary::getObjectForArch( Triple::ArchType Arch, std::unique_ptr &Result) const { MachO::CPUType CTM; if (!getCTMForArch(Arch, CTM)) diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp index b0068a87b0de..7282f468b084 100644 --- a/lib/Object/Object.cpp +++ b/lib/Object/Object.cpp @@ -89,7 +89,7 @@ void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) { void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect, LLVMSymbolIteratorRef Sym) { - if (error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect))) + if (std::error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect))) report_fatal_error(ec.message()); } @@ -115,28 +115,28 @@ void LLVMMoveToNextSymbol(LLVMSymbolIteratorRef SI) { // SectionRef accessors const char *LLVMGetSectionName(LLVMSectionIteratorRef SI) { StringRef ret; - if (error_code ec = (*unwrap(SI))->getName(ret)) + if (std::error_code ec = (*unwrap(SI))->getName(ret)) report_fatal_error(ec.message()); return ret.data(); } uint64_t LLVMGetSectionSize(LLVMSectionIteratorRef SI) { uint64_t ret; - if (error_code ec = (*unwrap(SI))->getSize(ret)) + if (std::error_code ec = (*unwrap(SI))->getSize(ret)) report_fatal_error(ec.message()); return ret; } const char *LLVMGetSectionContents(LLVMSectionIteratorRef SI) { StringRef ret; - if (error_code ec = (*unwrap(SI))->getContents(ret)) + if (std::error_code ec = (*unwrap(SI))->getContents(ret)) report_fatal_error(ec.message()); return ret.data(); } uint64_t LLVMGetSectionAddress(LLVMSectionIteratorRef SI) { uint64_t ret; - if (error_code ec = (*unwrap(SI))->getAddress(ret)) + if (std::error_code ec = (*unwrap(SI))->getAddress(ret)) report_fatal_error(ec.message()); return ret; } @@ -144,7 +144,7 @@ uint64_t LLVMGetSectionAddress(LLVMSectionIteratorRef SI) { LLVMBool LLVMGetSectionContainsSymbol(LLVMSectionIteratorRef SI, LLVMSymbolIteratorRef Sym) { bool ret; - if (error_code ec = (*unwrap(SI))->containsSymbol(**unwrap(Sym), ret)) + if (std::error_code ec = (*unwrap(SI))->containsSymbol(**unwrap(Sym), ret)) report_fatal_error(ec.message()); return ret; } @@ -172,21 +172,21 @@ void LLVMMoveToNextRelocation(LLVMRelocationIteratorRef SI) { // SymbolRef accessors const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) { StringRef ret; - if (error_code ec = (*unwrap(SI))->getName(ret)) + if (std::error_code ec = (*unwrap(SI))->getName(ret)) report_fatal_error(ec.message()); return ret.data(); } uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) { uint64_t 
ret; - if (error_code ec = (*unwrap(SI))->getAddress(ret)) + if (std::error_code ec = (*unwrap(SI))->getAddress(ret)) report_fatal_error(ec.message()); return ret; } uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) { uint64_t ret; - if (error_code ec = (*unwrap(SI))->getSize(ret)) + if (std::error_code ec = (*unwrap(SI))->getSize(ret)) report_fatal_error(ec.message()); return ret; } @@ -194,14 +194,14 @@ uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) { // RelocationRef accessors uint64_t LLVMGetRelocationAddress(LLVMRelocationIteratorRef RI) { uint64_t ret; - if (error_code ec = (*unwrap(RI))->getAddress(ret)) + if (std::error_code ec = (*unwrap(RI))->getAddress(ret)) report_fatal_error(ec.message()); return ret; } uint64_t LLVMGetRelocationOffset(LLVMRelocationIteratorRef RI) { uint64_t ret; - if (error_code ec = (*unwrap(RI))->getOffset(ret)) + if (std::error_code ec = (*unwrap(RI))->getOffset(ret)) report_fatal_error(ec.message()); return ret; } @@ -213,7 +213,7 @@ LLVMSymbolIteratorRef LLVMGetRelocationSymbol(LLVMRelocationIteratorRef RI) { uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI) { uint64_t ret; - if (error_code ec = (*unwrap(RI))->getType(ret)) + if (std::error_code ec = (*unwrap(RI))->getType(ret)) report_fatal_error(ec.message()); return ret; } @@ -221,7 +221,7 @@ uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI) { // NOTE: Caller takes ownership of returned string. const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) { SmallVector ret; - if (error_code ec = (*unwrap(RI))->getTypeName(ret)) + if (std::error_code ec = (*unwrap(RI))->getTypeName(ret)) report_fatal_error(ec.message()); char *str = static_cast(malloc(ret.size())); @@ -232,7 +232,7 @@ const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) { // NOTE: Caller takes ownership of returned string. 
const char *LLVMGetRelocationValueString(LLVMRelocationIteratorRef RI) { SmallVector ret; - if (error_code ec = (*unwrap(RI))->getValueString(ret)) + if (std::error_code ec = (*unwrap(RI))->getValueString(ret)) report_fatal_error(ec.message()); char *str = static_cast(malloc(ret.size())); diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp index d30f0cca49c1..7666ed53c372 100644 --- a/lib/Object/ObjectFile.cpp +++ b/lib/Object/ObjectFile.cpp @@ -16,7 +16,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" +#include using namespace llvm; using namespace object; @@ -27,17 +27,17 @@ ObjectFile::ObjectFile(unsigned int Type, MemoryBuffer *Source, bool BufferOwned) : SymbolicFile(Type, Source, BufferOwned) {} -error_code ObjectFile::printSymbolName(raw_ostream &OS, - DataRefImpl Symb) const { +std::error_code ObjectFile::printSymbolName(raw_ostream &OS, + DataRefImpl Symb) const { StringRef Name; - if (error_code EC = getSymbolName(Symb, Name)) + if (std::error_code EC = getSymbolName(Symb, Name)) return EC; OS << Name; return object_error::success; } -error_code ObjectFile::getSymbolAlignment(DataRefImpl DRI, - uint32_t &Result) const { +std::error_code ObjectFile::getSymbolAlignment(DataRefImpl DRI, + uint32_t &Result) const { Result = 0; return object_error::success; } @@ -87,7 +87,7 @@ ErrorOr ObjectFile::createObjectFile(MemoryBuffer *Object, ErrorOr ObjectFile::createObjectFile(StringRef ObjectPath) { std::unique_ptr File; - if (error_code EC = MemoryBuffer::getFile(ObjectPath, File)) + if (std::error_code EC = MemoryBuffer::getFile(ObjectPath, File)) return EC; return createObjectFile(File.release()); } diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index de2b13dafd02..012122272067 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -18,10 +18,10 @@ using namespace llvm; namespace { -class InstrProfErrorCategoryType : public error_category { - const char *name() const override { return "llvm.instrprof"; } +class InstrProfErrorCategoryType : public std::error_category { + const char *name() const LLVM_NOEXCEPT override { return "llvm.instrprof"; } std::string message(int IE) const override { - instrprof_error::ErrorType E = static_cast(IE); + instrprof_error E = static_cast(IE); switch (E) { case instrprof_error::success: return "Success"; @@ -52,15 +52,10 @@ class InstrProfErrorCategoryType : public error_category { } llvm_unreachable("A value of instrprof_error has no message."); } - error_condition default_error_condition(int EV) const override { - if (EV == instrprof_error::success) - return errc::success; - return errc::invalid_argument; - } }; } -const error_category &llvm::instrprof_category() { +const std::error_category &llvm::instrprof_category() { static InstrProfErrorCategoryType C; return C; } diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index 7014f5e5cc8c..d0493d34c203 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -21,9 +21,9 @@ using namespace llvm; -static error_code setupMemoryBuffer(std::string Path, - std::unique_ptr &Buffer) { - if (error_code EC = MemoryBuffer::getFileOrSTDIN(Path, Buffer)) +static std::error_code +setupMemoryBuffer(std::string Path, std::unique_ptr &Buffer) { + if (std::error_code EC = MemoryBuffer::getFileOrSTDIN(Path, Buffer)) return EC; // Sanity check the file. 
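The error-category rewrites above (in lib/Object/Error.cpp and lib/ProfileData/InstrProf.cpp) follow the standard recipe for plugging an enum into std::error_code: derive from std::error_category, mark name() noexcept (hence LLVM_NOEXCEPT), and drop the custom default_error_condition(). A minimal standalone sketch of that recipe, with illustrative names that are not part of LLVM:

#include <string>
#include <system_error>

// Illustrative error enum, not part of LLVM.
enum class demo_error { success = 0, bad_header };

namespace std {
// Lets a demo_error convert implicitly to std::error_code.
template <> struct is_error_code_enum<demo_error> : std::true_type {};
}

namespace {
class DemoErrorCategory : public std::error_category {
public:
  // std::error_category::name() is declared noexcept in the standard, which
  // is why the overrides in this patch add LLVM_NOEXCEPT.
  const char *name() const noexcept override { return "demo"; }
  std::string message(int EV) const override {
    switch (static_cast<demo_error>(EV)) {
    case demo_error::success:
      return "Success";
    case demo_error::bad_header:
      return "Bad header";
    }
    return "Unknown error";
  }
};
}

// Found by argument-dependent lookup when a std::error_code is built from
// a demo_error value.
std::error_code make_error_code(demo_error E) {
  static DemoErrorCategory Category;
  return std::error_code(static_cast<int>(E), Category);
}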
@@ -32,15 +32,16 @@ static error_code setupMemoryBuffer(std::string Path, return instrprof_error::success; } -static error_code initializeReader(InstrProfReader &Reader) { +static std::error_code initializeReader(InstrProfReader &Reader) { return Reader.readHeader(); } -error_code InstrProfReader::create(std::string Path, - std::unique_ptr &Result) { +std::error_code +InstrProfReader::create(std::string Path, + std::unique_ptr &Result) { // Set up the buffer to read. std::unique_ptr Buffer; - if (error_code EC = setupMemoryBuffer(Path, Buffer)) + if (std::error_code EC = setupMemoryBuffer(Path, Buffer)) return EC; // Create the reader. @@ -57,11 +58,11 @@ error_code InstrProfReader::create(std::string Path, return initializeReader(*Result); } -error_code IndexedInstrProfReader::create( +std::error_code IndexedInstrProfReader::create( std::string Path, std::unique_ptr &Result) { // Set up the buffer to read. std::unique_ptr Buffer; - if (error_code EC = setupMemoryBuffer(Path, Buffer)) + if (std::error_code EC = setupMemoryBuffer(Path, Buffer)) return EC; // Create the reader. @@ -78,7 +79,7 @@ void InstrProfIterator::Increment() { *this = InstrProfIterator(); } -error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { +std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { // Skip empty lines. while (!Line.is_at_end() && Line->empty()) ++Line; @@ -157,11 +158,11 @@ bool RawInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { uint64_t Magic = *reinterpret_cast(DataBuffer.getBufferStart()); return getRawMagic() == Magic || - sys::SwapByteOrder(getRawMagic()) == Magic; + sys::getSwappedBytes(getRawMagic()) == Magic; } template -error_code RawInstrProfReader::readHeader() { +std::error_code RawInstrProfReader::readHeader() { if (!hasFormat(*DataBuffer)) return error(instrprof_error::bad_magic); if (DataBuffer->getBufferSize() < sizeof(RawHeader)) @@ -173,7 +174,8 @@ error_code RawInstrProfReader::readHeader() { } template -error_code RawInstrProfReader::readNextHeader(const char *CurrentPos) { +std::error_code +RawInstrProfReader::readNextHeader(const char *CurrentPos) { const char *End = DataBuffer->getBufferEnd(); // Skip zero padding between profiles. while (CurrentPos != End && *CurrentPos == 0) @@ -200,7 +202,8 @@ static uint64_t getRawVersion() { } template -error_code RawInstrProfReader::readHeader(const RawHeader &Header) { +std::error_code +RawInstrProfReader::readHeader(const RawHeader &Header) { if (swap(Header.Version) != getRawVersion()) return error(instrprof_error::unsupported_version); @@ -229,10 +232,10 @@ error_code RawInstrProfReader::readHeader(const RawHeader &Header) { } template -error_code +std::error_code RawInstrProfReader::readNextRecord(InstrProfRecord &Record) { if (Data == DataEnd) - if (error_code EC = readNextHeader(ProfileEnd)) + if (std::error_code EC = readNextHeader(ProfileEnd)) return EC; // Get the raw data. 
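A usage sketch for the std::error_code-based InstrProfReader::create() factory shown above (not part of this patch; the driver function, path handling, and record iteration are illustrative assumptions based on the InstrProfIterator also touched in this file):

#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
#include <string>

using namespace llvm;

// Hypothetical driver snippet, not from this patch.
static int readProfile(std::string Path) {
  std::unique_ptr<InstrProfReader> Reader;
  // create() now reports failures through std::error_code, matching the
  // signature introduced above.
  if (std::error_code EC = InstrProfReader::create(Path, Reader)) {
    errs() << Path << ": " << EC.message() << "\n";
    return 1;
  }
  unsigned NumRecords = 0;
  // Assumes the reader exposes begin()/end() over its records, as the
  // InstrProfIterator in this file suggests.
  for (const auto &Record : *Reader) {
    (void)Record;
    ++NumRecords;
  }
  outs() << "read " << NumRecords << " records from " << Path << "\n";
  return 0;
}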
@@ -286,7 +289,7 @@ bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { return Magic == IndexedInstrProf::Magic; } -error_code IndexedInstrProfReader::readHeader() { +std::error_code IndexedInstrProfReader::readHeader() { const unsigned char *Start = (const unsigned char *)DataBuffer->getBufferStart(); const unsigned char *Cur = Start; @@ -324,7 +327,7 @@ error_code IndexedInstrProfReader::readHeader() { return success(); } -error_code IndexedInstrProfReader::getFunctionCounts( +std::error_code IndexedInstrProfReader::getFunctionCounts( StringRef FuncName, uint64_t &FuncHash, std::vector &Counts) { const auto &Iter = Index->find(FuncName); if (Iter == Index->end()) @@ -339,7 +342,8 @@ error_code IndexedInstrProfReader::getFunctionCounts( return success(); } -error_code IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) { +std::error_code +IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) { // Are we out of records? if (RecordIterator == Index->data_end()) return error(instrprof_error::eof); diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index 83c41d93bb18..e55c29918136 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -66,9 +66,10 @@ class InstrProfRecordTrait { }; } -error_code InstrProfWriter::addFunctionCounts(StringRef FunctionName, - uint64_t FunctionHash, - ArrayRef Counters) { +std::error_code +InstrProfWriter::addFunctionCounts(StringRef FunctionName, + uint64_t FunctionHash, + ArrayRef Counters) { auto Where = FunctionData.find(FunctionName); if (Where == FunctionData.end()) { // If this is the first time we've seen this function, just add it. diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index f9fe09541dbe..7989e30afae6 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -1372,7 +1372,9 @@ APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract) case PackCategoriesIntoKey(fcZero, fcNaN): case PackCategoriesIntoKey(fcNormal, fcNaN): case PackCategoriesIntoKey(fcInfinity, fcNaN): - sign = false; + // We need to be sure to flip the sign here for subtraction because we + // don't have a separate negate operation so -NaN becomes 0 - NaN here. + sign = rhs.sign ^ subtract; category = fcNaN; copySignificand(rhs); return opOK; diff --git a/lib/Support/ARMWinEH.cpp b/lib/Support/ARMWinEH.cpp new file mode 100644 index 000000000000..03c150f1150b --- /dev/null +++ b/lib/Support/ARMWinEH.cpp @@ -0,0 +1,38 @@ +//===-- ARMWinEH.cpp - Windows on ARM EH Support Functions ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ARMWinEH.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +namespace ARM { +namespace WinEH { +std::pair SavedRegisterMask(const RuntimeFunction &RF) { + uint8_t NumRegisters = RF.Reg(); + uint8_t RegistersVFP = RF.R(); + uint8_t LinkRegister = RF.L(); + uint8_t ChainedFrame = RF.C(); + + uint16_t GPRMask = (ChainedFrame << 11) | (LinkRegister << 14); + uint32_t VFPMask = 0; + + if (RegistersVFP) + VFPMask |= (((1 << ((NumRegisters + 1) % 8)) - 1) << 8); + else + GPRMask |= (((1 << (NumRegisters + 1)) - 1) << 4); + + if (PrologueFolding(RF)) + GPRMask |= (((1 << (NumRegisters + 1)) - 1) << (~RF.StackAdjust() & 0x3)); + + return std::make_pair(GPRMask, VFPMask); +} +} +} +} + diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index b4c674d32e27..2438d729d8de 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMSupport APInt.cpp APSInt.cpp ARMBuildAttrs.cpp + ARMWinEH.cpp Allocator.cpp BlockFrequency.cpp BranchProbability.cpp @@ -82,7 +83,6 @@ add_llvm_library(LLVMSupport RWMutex.cpp SearchForAddressOfSpecialSymbol.cpp Signals.cpp - system_error.cpp TargetRegistry.cpp ThreadLocal.cpp Threading.cpp @@ -99,7 +99,6 @@ add_llvm_library(LLVMSupport Unix/Program.inc Unix/RWMutex.inc Unix/Signals.inc - Unix/system_error.inc Unix/ThreadLocal.inc Unix/TimeValue.inc Unix/Watchdog.inc @@ -112,7 +111,6 @@ add_llvm_library(LLVMSupport Windows/Program.inc Windows/RWMutex.inc Windows/Signals.inc - Windows/system_error.inc Windows/ThreadLocal.inc Windows/TimeValue.inc Windows/Watchdog.inc diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 37bbf48303a6..e09d4b6e47b4 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -31,10 +31,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include #include #include +#include using namespace llvm; using namespace cl; @@ -1699,7 +1699,7 @@ class VersionPrinter { OS << "LLVM (http://llvm.org/):\n" << " " << PACKAGE_NAME << " version " << PACKAGE_VERSION; #ifdef LLVM_VERSION_INFO - OS << LLVM_VERSION_INFO; + OS << " " << LLVM_VERSION_INFO; #endif OS << "\n "; #ifndef __OPTIMIZE__ diff --git a/lib/Support/ConvertUTF.c b/lib/Support/ConvertUTF.c index 23f17ca25aea..128459a1d548 100644 --- a/lib/Support/ConvertUTF.c +++ b/lib/Support/ConvertUTF.c @@ -51,6 +51,7 @@ #ifdef CVTUTF_DEBUG #include #endif +#include static const int halfShift = 10; /* used for shifting by 10 bits */ @@ -392,6 +393,99 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { /* --------------------------------------------------------------------- */ +static unsigned +findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, + const UTF8 *sourceEnd) { + UTF8 b1, b2, b3; + + assert(!isLegalUTF8Sequence(source, sourceEnd)); + + /* + * Unicode 6.3.0, D93b: + * + * Maximal subpart of an ill-formed subsequence: The longest code unit + * subsequence starting at an unconvertible offset that is either: + * a. the initial subsequence of a well-formed code unit sequence, or + * b. a subsequence of length one. + */ + + if (source == sourceEnd) + return 0; + + /* + * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 + * Byte Sequences. 
+ */ + + b1 = *source; + ++source; + if (b1 >= 0xC2 && b1 <= 0xDF) { + /* + * First byte is valid, but we know that this code unit sequence is + * invalid, so the maximal subpart has to end after the first byte. + */ + return 1; + } + + if (source == sourceEnd) + return 1; + + b2 = *source; + ++source; + + if (b1 == 0xE0) { + return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; + } + if (b1 >= 0xE1 && b1 <= 0xEC) { + return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; + } + if (b1 == 0xED) { + return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; + } + if (b1 >= 0xEE && b1 <= 0xEF) { + return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; + } + if (b1 == 0xF0) { + if (b2 >= 0x90 && b2 <= 0xBF) { + if (source == sourceEnd) + return 2; + + b3 = *source; + return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; + } + return 1; + } + if (b1 >= 0xF1 && b1 <= 0xF3) { + if (b2 >= 0x80 && b2 <= 0xBF) { + if (source == sourceEnd) + return 2; + + b3 = *source; + return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; + } + return 1; + } + if (b1 == 0xF4) { + if (b2 >= 0x80 && b2 <= 0x8F) { + if (source == sourceEnd) + return 2; + + b3 = *source; + return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; + } + return 1; + } + + assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); + /* + * There are no valid sequences that start with these bytes. Maximal subpart + * is defined to have length 1 in these cases. + */ + return 1; +} + +/* --------------------------------------------------------------------- */ + /* * Exported function to return the total number of bytes in a codepoint * represented in UTF-8, given the value of the first byte. @@ -491,9 +585,10 @@ ConversionResult ConvertUTF8toUTF16 ( /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF8toUTF32 ( +static ConversionResult ConvertUTF8toUTF32Impl( const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, + Boolean InputIsPartial) { ConversionResult result = conversionOK; const UTF8* source = *sourceStart; UTF32* target = *targetStart; @@ -501,12 +596,42 @@ ConversionResult ConvertUTF8toUTF32 ( UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (extraBytesToRead >= sourceEnd - source) { - result = sourceExhausted; break; + if (flags == strictConversion || InputIsPartial) { + result = sourceExhausted; + break; + } else { + result = sourceIllegal; + + /* + * Replace the maximal subpart of ill-formed sequence with + * replacement character. + */ + source += findMaximalSubpartOfIllFormedUTF8Sequence(source, + sourceEnd); + *target++ = UNI_REPLACEMENT_CHAR; + continue; + } } + if (target >= targetEnd) { + result = targetExhausted; break; + } + /* Do this check whether lenient or strict */ if (!isLegalUTF8(source, extraBytesToRead+1)) { result = sourceIllegal; - break; + if (flags == strictConversion) { + /* Abort conversion. */ + break; + } else { + /* + * Replace the maximal subpart of ill-formed sequence with + * replacement character. + */ + source += findMaximalSubpartOfIllFormedUTF8Sequence(source, + sourceEnd); + *target++ = UNI_REPLACEMENT_CHAR; + continue; + } } /* * The cases all fall through. See "Note A" below. @@ -521,10 +646,6 @@ ConversionResult ConvertUTF8toUTF32 ( } ch -= offsetsFromUTF8[extraBytesToRead]; - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! 
*/ - result = targetExhausted; break; - } if (ch <= UNI_MAX_LEGAL_UTF32) { /* * UTF-16 surrogate values are illegal in UTF-32, and anything @@ -551,6 +672,22 @@ ConversionResult ConvertUTF8toUTF32 ( return result; } +ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, + const UTF8 *sourceEnd, + UTF32 **targetStart, + UTF32 *targetEnd, + ConversionFlags flags) { + return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, + flags, /*InputIsPartial=*/true); +} + +ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, + const UTF8 *sourceEnd, UTF32 **targetStart, + UTF32 *targetEnd, ConversionFlags flags) { + return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, + flags, /*InputIsPartial=*/false); +} + /* --------------------------------------------------------------------- Note A. diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index a426377042d3..016c805cc7ab 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -22,7 +22,8 @@ namespace { struct CrashRecoveryContextImpl; -static ManagedStatic > CurrentContext; +static ManagedStatic< + sys::ThreadLocal > CurrentContext; struct CrashRecoveryContextImpl { CrashRecoveryContext *CRC; @@ -231,7 +232,8 @@ void CrashRecoveryContext::Disable() { #include -static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; +static const int Signals[] = + { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]); static struct sigaction PrevActions[NumSignals]; diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp index 7b829215d29b..5d6d60a87fbf 100644 --- a/lib/Support/DataExtractor.cpp +++ b/lib/Support/DataExtractor.cpp @@ -21,7 +21,7 @@ static T getU(uint32_t *offset_ptr, const DataExtractor *de, if (de->isValidOffsetForDataOfSize(offset, sizeof(val))) { std::memcpy(&val, &Data[offset], sizeof(val)); if (sys::IsLittleEndianHost != isLittleEndian) - val = sys::SwapByteOrder(val); + sys::swapByteOrder(val); // Advance the offset *offset_ptr += sizeof(val); diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp index eec858416a8f..32653de51943 100644 --- a/lib/Support/DataStream.cpp +++ b/lib/Support/DataStream.cpp @@ -18,10 +18,10 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Program.h" -#include "llvm/Support/system_error.h" #include #include #include +#include #if !defined(_MSC_VER) && !defined(__MINGW32__) #include #else @@ -64,11 +64,11 @@ class DataFileStreamer : public DataStreamer { return read(Fd, buf, len); } - error_code OpenFile(const std::string &Filename) { + std::error_code OpenFile(const std::string &Filename) { if (Filename == "-") { Fd = 0; sys::ChangeStdinToBinary(); - return error_code::success(); + return std::error_code(); } return sys::fs::openFileForRead(Filename, Fd); @@ -81,7 +81,7 @@ namespace llvm { DataStreamer *getDataFileStreamer(const std::string &Filename, std::string *StrError) { DataFileStreamer *s = new DataFileStreamer(); - if (error_code e = s->OpenFile(Filename)) { + if (std::error_code e = s->OpenFile(Filename)) { *StrError = std::string("Could not open ") + Filename + ": " + e.message() + "\n"; return nullptr; diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index 342c4f05cc9d..c36007f55999 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ 
-18,8 +18,12 @@ #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/MutexGuard.h" #include "llvm/Support/Threading.h" +#include "llvm/Support/WindowsError.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -37,17 +41,20 @@ using namespace llvm; static fatal_error_handler_t ErrorHandler = nullptr; static void *ErrorHandlerUserData = nullptr; +static sys::Mutex ErrorHandlerMutex; + void llvm::install_fatal_error_handler(fatal_error_handler_t handler, void *user_data) { - assert(!llvm_is_multithreaded() && - "Cannot register error handlers after starting multithreaded mode!\n"); + llvm::MutexGuard Lock(ErrorHandlerMutex); assert(!ErrorHandler && "Error handler already registered!\n"); ErrorHandler = handler; ErrorHandlerUserData = user_data; } void llvm::remove_fatal_error_handler() { + llvm::MutexGuard Lock(ErrorHandlerMutex); ErrorHandler = nullptr; + ErrorHandlerUserData = nullptr; } void llvm::report_fatal_error(const char *Reason, bool GenCrashDiag) { @@ -63,8 +70,18 @@ void llvm::report_fatal_error(StringRef Reason, bool GenCrashDiag) { } void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { - if (ErrorHandler) { - ErrorHandler(ErrorHandlerUserData, Reason.str(), GenCrashDiag); + llvm::fatal_error_handler_t handler = nullptr; + void* handlerData = nullptr; + { + // Only acquire the mutex while reading the handler, so as not to invoke a + // user-supplied callback under a lock. + llvm::MutexGuard Lock(ErrorHandlerMutex); + handler = ErrorHandler; + handlerData = ErrorHandlerUserData; + } + + if (handler) { + handler(handlerData, Reason.str(), GenCrashDiag); } else { // Blast the result out to stderr. We don't try hard to make sure this // succeeds (e.g. handling EINTR) and we can't use errs() here because @@ -119,3 +136,70 @@ void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler) { void LLVMResetFatalErrorHandler() { remove_fatal_error_handler(); } + +#ifdef LLVM_ON_WIN32 + +#include + +// I'd rather not double the line count of the following. 
+#define MAP_ERR_TO_COND(x, y) \ + case x: \ + return make_error_code(errc::y) + +std::error_code llvm::mapWindowsError(unsigned EV) { + switch (EV) { + MAP_ERR_TO_COND(ERROR_ACCESS_DENIED, permission_denied); + MAP_ERR_TO_COND(ERROR_ALREADY_EXISTS, file_exists); + MAP_ERR_TO_COND(ERROR_BAD_UNIT, no_such_device); + MAP_ERR_TO_COND(ERROR_BUFFER_OVERFLOW, filename_too_long); + MAP_ERR_TO_COND(ERROR_BUSY, device_or_resource_busy); + MAP_ERR_TO_COND(ERROR_BUSY_DRIVE, device_or_resource_busy); + MAP_ERR_TO_COND(ERROR_CANNOT_MAKE, permission_denied); + MAP_ERR_TO_COND(ERROR_CANTOPEN, io_error); + MAP_ERR_TO_COND(ERROR_CANTREAD, io_error); + MAP_ERR_TO_COND(ERROR_CANTWRITE, io_error); + MAP_ERR_TO_COND(ERROR_CURRENT_DIRECTORY, permission_denied); + MAP_ERR_TO_COND(ERROR_DEV_NOT_EXIST, no_such_device); + MAP_ERR_TO_COND(ERROR_DEVICE_IN_USE, device_or_resource_busy); + MAP_ERR_TO_COND(ERROR_DIR_NOT_EMPTY, directory_not_empty); + MAP_ERR_TO_COND(ERROR_DIRECTORY, invalid_argument); + MAP_ERR_TO_COND(ERROR_DISK_FULL, no_space_on_device); + MAP_ERR_TO_COND(ERROR_FILE_EXISTS, file_exists); + MAP_ERR_TO_COND(ERROR_FILE_NOT_FOUND, no_such_file_or_directory); + MAP_ERR_TO_COND(ERROR_HANDLE_DISK_FULL, no_space_on_device); + MAP_ERR_TO_COND(ERROR_INVALID_ACCESS, permission_denied); + MAP_ERR_TO_COND(ERROR_INVALID_DRIVE, no_such_device); + MAP_ERR_TO_COND(ERROR_INVALID_FUNCTION, function_not_supported); + MAP_ERR_TO_COND(ERROR_INVALID_HANDLE, invalid_argument); + MAP_ERR_TO_COND(ERROR_INVALID_NAME, invalid_argument); + MAP_ERR_TO_COND(ERROR_LOCK_VIOLATION, no_lock_available); + MAP_ERR_TO_COND(ERROR_LOCKED, no_lock_available); + MAP_ERR_TO_COND(ERROR_NEGATIVE_SEEK, invalid_argument); + MAP_ERR_TO_COND(ERROR_NOACCESS, permission_denied); + MAP_ERR_TO_COND(ERROR_NOT_ENOUGH_MEMORY, not_enough_memory); + MAP_ERR_TO_COND(ERROR_NOT_READY, resource_unavailable_try_again); + MAP_ERR_TO_COND(ERROR_OPEN_FAILED, io_error); + MAP_ERR_TO_COND(ERROR_OPEN_FILES, device_or_resource_busy); + MAP_ERR_TO_COND(ERROR_OUTOFMEMORY, not_enough_memory); + MAP_ERR_TO_COND(ERROR_PATH_NOT_FOUND, no_such_file_or_directory); + MAP_ERR_TO_COND(ERROR_BAD_NETPATH, no_such_file_or_directory); + MAP_ERR_TO_COND(ERROR_READ_FAULT, io_error); + MAP_ERR_TO_COND(ERROR_RETRY, resource_unavailable_try_again); + MAP_ERR_TO_COND(ERROR_SEEK, io_error); + MAP_ERR_TO_COND(ERROR_SHARING_VIOLATION, permission_denied); + MAP_ERR_TO_COND(ERROR_TOO_MANY_OPEN_FILES, too_many_files_open); + MAP_ERR_TO_COND(ERROR_WRITE_FAULT, io_error); + MAP_ERR_TO_COND(ERROR_WRITE_PROTECT, permission_denied); + MAP_ERR_TO_COND(WSAEACCES, permission_denied); + MAP_ERR_TO_COND(WSAEBADF, bad_file_descriptor); + MAP_ERR_TO_COND(WSAEFAULT, bad_address); + MAP_ERR_TO_COND(WSAEINTR, interrupted); + MAP_ERR_TO_COND(WSAEINVAL, invalid_argument); + MAP_ERR_TO_COND(WSAEMFILE, too_many_files_open); + MAP_ERR_TO_COND(WSAENAMETOOLONG, filename_too_long); + default: + return std::error_code(EV, std::system_category()); + } +} + +#endif diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp index 49311c29d8bc..2e740ca04518 100644 --- a/lib/Support/FileOutputBuffer.cpp +++ b/lib/Support/FileOutputBuffer.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" +#include using llvm::sys::fs::mapped_file_region; @@ -30,13 +31,13 @@ 
FileOutputBuffer::~FileOutputBuffer() { sys::fs::remove(Twine(TempPath)); } -error_code FileOutputBuffer::create(StringRef FilePath, - size_t Size, - std::unique_ptr &Result, - unsigned Flags) { +std::error_code +FileOutputBuffer::create(StringRef FilePath, size_t Size, + std::unique_ptr &Result, + unsigned Flags) { // If file already exists, it must be a regular file (to be mappable). sys::fs::file_status Stat; - error_code EC = sys::fs::status(FilePath, Stat); + std::error_code EC = sys::fs::status(FilePath, Stat); switch (Stat.type()) { case sys::fs::file_type::file_not_found: // If file does not exist, we'll create one. @@ -81,16 +82,16 @@ error_code FileOutputBuffer::create(StringRef FilePath, if (Result) MappedFile.release(); - return error_code::success(); + return std::error_code(); } -error_code FileOutputBuffer::commit(int64_t NewSmallerSize) { +std::error_code FileOutputBuffer::commit(int64_t NewSmallerSize) { // Unmap buffer, letting OS flush dirty pages to file on disk. Region.reset(nullptr); // If requested, resize file as part of commit. if ( NewSmallerSize != -1 ) { - error_code EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize); + std::error_code EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize); if (EC) return EC; } diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp index b2dc47dc698a..0d26bafd7714 100644 --- a/lib/Support/FileUtilities.cpp +++ b/lib/Support/FileUtilities.cpp @@ -17,10 +17,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include #include #include +#include using namespace llvm; static bool isSignedChar(char C) { @@ -177,13 +177,13 @@ int llvm::DiffFilesWithTolerance(StringRef NameA, // Now its safe to mmap the files into memory because both files // have a non-zero size. std::unique_ptr F1; - if (error_code ec = MemoryBuffer::getFile(NameA, F1)) { + if (std::error_code ec = MemoryBuffer::getFile(NameA, F1)) { if (Error) *Error = ec.message(); return 2; } std::unique_ptr F2; - if (error_code ec = MemoryBuffer::getFile(NameB, F2)) { + if (std::error_code ec = MemoryBuffer::getFile(NameB, F2)) { if (Error) *Error = ec.message(); return 2; diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index f5b2943e86b2..e68ee434dd59 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -68,7 +68,7 @@ StringRef llvm::DOT::getColorString(unsigned ColorNumber) { std::string llvm::createGraphFilename(const Twine &Name, int &FD) { FD = -1; SmallString<128> Filename; - error_code EC = sys::fs::createTemporaryFile(Name, "dot", FD, Filename); + std::error_code EC = sys::fs::createTemporaryFile(Name, "dot", FD, Filename); if (EC) { errs() << "Error: " << EC.message() << "\n"; return ""; @@ -78,148 +78,165 @@ std::string llvm::createGraphFilename(const Twine &Name, int &FD) { return Filename.str(); } -// Execute the graph viewer. Return true if successful. -static bool LLVM_ATTRIBUTE_UNUSED -ExecGraphViewer(StringRef ExecPath, std::vector &args, - StringRef Filename, bool wait, std::string &ErrMsg) { +// Execute the graph viewer. Return true if there were errors. 
+static bool ExecGraphViewer(StringRef ExecPath, std::vector &args, + StringRef Filename, bool wait, + std::string &ErrMsg) { + assert(args.back() == nullptr); if (wait) { - if (sys::ExecuteAndWait(ExecPath, &args[0],nullptr,nullptr,0,0,&ErrMsg)) { + if (sys::ExecuteAndWait(ExecPath, args.data(), nullptr, nullptr, 0, 0, + &ErrMsg)) { errs() << "Error: " << ErrMsg << "\n"; - return false; + return true; } sys::fs::remove(Filename); errs() << " done. \n"; - } - else { - sys::ExecuteNoWait(ExecPath, &args[0],nullptr,nullptr,0,&ErrMsg); + } else { + sys::ExecuteNoWait(ExecPath, args.data(), nullptr, nullptr, 0, &ErrMsg); errs() << "Remember to erase graph file: " << Filename.str() << "\n"; } - return true; + return false; +} + +struct GraphSession { + std::string LogBuffer; + bool TryFindProgram(StringRef Names, std::string &ProgramPath) { + raw_string_ostream Log(LogBuffer); + SmallVector parts; + Names.split(parts, "|"); + for (auto Name : parts) { + ProgramPath = sys::FindProgramByName(Name); + if (!ProgramPath.empty()) + return true; + Log << " Tried '" << Name << "'\n"; + } + return false; + } +}; + +static const char *getProgramName(GraphProgram::Name program) { + switch (program) { + case GraphProgram::DOT: + return "dot"; + case GraphProgram::FDP: + return "fdp"; + case GraphProgram::NEATO: + return "neato"; + case GraphProgram::TWOPI: + return "twopi"; + case GraphProgram::CIRCO: + return "circo"; + } + llvm_unreachable("bad kind"); } -void llvm::DisplayGraph(StringRef FilenameRef, bool wait, +bool llvm::DisplayGraph(StringRef FilenameRef, bool wait, GraphProgram::Name program) { std::string Filename = FilenameRef; wait &= !ViewBackground; std::string ErrMsg; -#if HAVE_GRAPHVIZ - std::string Graphviz(LLVM_PATH_GRAPHVIZ); - - std::vector args; - args.push_back(Graphviz.c_str()); - args.push_back(Filename.c_str()); - args.push_back(nullptr); - - errs() << "Running 'Graphviz' program... "; - if (!ExecGraphViewer(Graphviz, args, Filename, wait, ErrMsg)) - return; - -#elif HAVE_XDOT - std::vector args; - args.push_back(LLVM_PATH_XDOT); - args.push_back(Filename.c_str()); - - switch (program) { - case GraphProgram::DOT: args.push_back("-f"); args.push_back("dot"); break; - case GraphProgram::FDP: args.push_back("-f"); args.push_back("fdp"); break; - case GraphProgram::NEATO: args.push_back("-f"); args.push_back("neato");break; - case GraphProgram::TWOPI: args.push_back("-f"); args.push_back("twopi");break; - case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break; + std::string ViewerPath; + GraphSession S; + + // Graphviz + if (S.TryFindProgram("Graphviz", ViewerPath)) { + std::vector args; + args.push_back(ViewerPath.c_str()); + args.push_back(Filename.c_str()); + args.push_back(nullptr); + + errs() << "Running 'Graphviz' program... "; + return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg); } - args.push_back(0); + // xdot + if (S.TryFindProgram("xdot|xdot.py", ViewerPath)) { + std::vector args; + args.push_back(ViewerPath.c_str()); + args.push_back(Filename.c_str()); - errs() << "Running 'xdot.py' program... 
"; - if (!ExecGraphViewer(LLVM_PATH_XDOT, args, Filename, wait, ErrMsg)) - return; + args.push_back("-f"); + args.push_back(getProgramName(program)); -#elif (HAVE_GV && (HAVE_DOT || HAVE_FDP || HAVE_NEATO || \ - HAVE_TWOPI || HAVE_CIRCO)) - std::string PSFilename = Filename + ".ps"; - std::string prog; + args.push_back(nullptr); - // Set default grapher -#if HAVE_CIRCO - prog = LLVM_PATH_CIRCO; -#endif -#if HAVE_TWOPI - prog = LLVM_PATH_TWOPI; -#endif -#if HAVE_NEATO - prog = LLVM_PATH_NEATO; -#endif -#if HAVE_FDP - prog = LLVM_PATH_FDP; -#endif -#if HAVE_DOT - prog = LLVM_PATH_DOT; -#endif + errs() << "Running 'xdot.py' program... "; + return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg); + } - // Find which program the user wants -#if HAVE_DOT - if (program == GraphProgram::DOT) - prog = LLVM_PATH_DOT; -#endif -#if (HAVE_FDP) - if (program == GraphProgram::FDP) - prog = LLVM_PATH_FDP; -#endif -#if (HAVE_NEATO) - if (program == GraphProgram::NEATO) - prog = LLVM_PATH_NEATO; -#endif -#if (HAVE_TWOPI) - if (program == GraphProgram::TWOPI) - prog = LLVM_PATH_TWOPI; -#endif -#if (HAVE_CIRCO) - if (program == GraphProgram::CIRCO) - prog = LLVM_PATH_CIRCO; + enum PSViewerKind { PSV_None, PSV_OSXOpen, PSV_XDGOpen, PSV_Ghostview }; + PSViewerKind PSViewer = PSV_None; +#ifdef __APPLE__ + if (!PSViewer && S.TryFindProgram("open", ViewerPath)) + PSViewer = PSV_OSXOpen; #endif + if (!PSViewer && S.TryFindProgram("gv", ViewerPath)) + PSViewer = PSV_Ghostview; + if (!PSViewer && S.TryFindProgram("xdg-open", ViewerPath)) + PSViewer = PSV_XDGOpen; + + // PostScript graph generator + PostScript viewer + std::string GeneratorPath; + if (PSViewer && + (S.TryFindProgram(getProgramName(program), GeneratorPath) || + S.TryFindProgram("dot|fdp|neato|twopi|circo", GeneratorPath))) { + std::string PSFilename = Filename + ".ps"; + + std::vector args; + args.push_back(GeneratorPath.c_str()); + args.push_back("-Tps"); + args.push_back("-Nfontname=Courier"); + args.push_back("-Gsize=7.5,10"); + args.push_back(Filename.c_str()); + args.push_back("-o"); + args.push_back(PSFilename.c_str()); + args.push_back(nullptr); + + errs() << "Running '" << GeneratorPath << "' program... "; + + if (ExecGraphViewer(GeneratorPath, args, Filename, wait, ErrMsg)) + return true; + + args.clear(); + args.push_back(ViewerPath.c_str()); + switch (PSViewer) { + case PSV_OSXOpen: + args.push_back("-W"); + args.push_back(PSFilename.c_str()); + break; + case PSV_XDGOpen: + wait = false; + args.push_back(PSFilename.c_str()); + break; + case PSV_Ghostview: + args.push_back("--spartan"); + args.push_back(PSFilename.c_str()); + break; + case PSV_None: + llvm_unreachable("Invalid viewer"); + } + args.push_back(nullptr); - std::vector args; - args.push_back(prog.c_str()); - args.push_back("-Tps"); - args.push_back("-Nfontname=Courier"); - args.push_back("-Gsize=7.5,10"); - args.push_back(Filename.c_str()); - args.push_back("-o"); - args.push_back(PSFilename.c_str()); - args.push_back(0); - - errs() << "Running '" << prog << "' program... 
"; - - if (!ExecGraphViewer(prog, args, Filename, wait, ErrMsg)) - return; - - std::string gv(LLVM_PATH_GV); - args.clear(); - args.push_back(gv.c_str()); - args.push_back(PSFilename.c_str()); - args.push_back("--spartan"); - args.push_back(0); - - ErrMsg.clear(); - if (!ExecGraphViewer(gv, args, PSFilename, wait, ErrMsg)) - return; - -#elif HAVE_DOTTY - std::string dotty(LLVM_PATH_DOTTY); + ErrMsg.clear(); + return ExecGraphViewer(ViewerPath, args, PSFilename, wait, ErrMsg); + } - std::vector args; - args.push_back(dotty.c_str()); - args.push_back(Filename.c_str()); - args.push_back(0); + // dotty + if (S.TryFindProgram("dotty", ViewerPath)) { + std::vector args; + args.push_back(ViewerPath.c_str()); + args.push_back(Filename.c_str()); + args.push_back(nullptr); // Dotty spawns another app and doesn't wait until it returns -#if defined (__MINGW32__) || defined (_WINDOWS) - wait = false; -#endif - errs() << "Running 'dotty' program... "; - if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg)) - return; -#else - (void)Filename; - (void)ErrMsg; +#ifdef LLVM_ON_WIN32 + wait = false; #endif + errs() << "Running 'dotty' program... "; + return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg); + } + + errs() << "Error: Couldn't find a usable graph viewer program:\n"; + errs() << S.LogBuffer << "\n"; + return true; } diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index e0e85630be6b..ce0a3b6bed77 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -686,7 +686,7 @@ StringRef sys::getHostCPUName() { } #endif -#if defined(__linux__) && defined(__arm__) +#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) bool sys::getHostCPUFeatures(StringMap &Features) { std::string Err; DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); @@ -715,8 +715,24 @@ bool sys::getHostCPUFeatures(StringMap &Features) { break; } +#if defined(__aarch64__) + // Keep track of which crypto features we have seen + enum { + CAP_AES = 0x1, + CAP_PMULL = 0x2, + CAP_SHA1 = 0x4, + CAP_SHA2 = 0x8 + }; + uint32_t crypto = 0; +#endif + for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) { StringRef LLVMFeatureStr = StringSwitch(CPUFeatures[I]) +#if defined(__aarch64__) + .Case("asimd", "neon") + .Case("fp", "fp-armv8") + .Case("crc32", "crc") +#else .Case("half", "fp16") .Case("neon", "neon") .Case("vfpv3", "vfp3") @@ -724,12 +740,32 @@ bool sys::getHostCPUFeatures(StringMap &Features) { .Case("vfpv4", "vfp4") .Case("idiva", "hwdiv-arm") .Case("idivt", "hwdiv") +#endif .Default(""); +#if defined(__aarch64__) + // We need to check crypto separately since we need all of the crypto + // extensions to enable the subtarget feature + if (CPUFeatures[I] == "aes") + crypto |= CAP_AES; + else if (CPUFeatures[I] == "pmull") + crypto |= CAP_PMULL; + else if (CPUFeatures[I] == "sha1") + crypto |= CAP_SHA1; + else if (CPUFeatures[I] == "sha2") + crypto |= CAP_SHA2; +#endif + if (LLVMFeatureStr != "") Features.GetOrCreateValue(LLVMFeatureStr).setValue(true); } +#if defined(__aarch64__) + // If we have all crypto bits we can add the feature + if (crypto == (CAP_AES | CAP_PMULL | CAP_SHA1 | CAP_SHA2)) + Features.GetOrCreateValue("crypto").setValue(true); +#endif + return true; } #else diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp index 9b4bfbe81981..681bae2ba183 100644 --- a/lib/Support/LockFileManager.cpp +++ b/lib/Support/LockFileManager.cpp @@ -9,6 +9,7 @@ #include "llvm/Support/LockFileManager.h" #include "llvm/ADT/STLExtras.h" #include 
"llvm/ADT/StringExtras.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -71,7 +72,7 @@ bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) { LockFileManager::LockFileManager(StringRef FileName) { this->FileName = FileName; - if (error_code EC = sys::fs::make_absolute(this->FileName)) { + if (std::error_code EC = sys::fs::make_absolute(this->FileName)) { Error = EC; return; } @@ -87,10 +88,8 @@ LockFileManager::LockFileManager(StringRef FileName) UniqueLockFileName = LockFileName; UniqueLockFileName += "-%%%%%%%%"; int UniqueLockFileID; - if (error_code EC - = sys::fs::createUniqueFile(UniqueLockFileName.str(), - UniqueLockFileID, - UniqueLockFileName)) { + if (std::error_code EC = sys::fs::createUniqueFile( + UniqueLockFileName.str(), UniqueLockFileID, UniqueLockFileName)) { Error = EC; return; } @@ -122,9 +121,9 @@ LockFileManager::LockFileManager(StringRef FileName) while (1) { // Create a link from the lock file name. If this succeeds, we're done. - error_code EC = + std::error_code EC = sys::fs::create_link(UniqueLockFileName.str(), LockFileName.str()); - if (EC == errc::success) + if (!EC) return; if (EC != errc::file_exists) { diff --git a/lib/Support/Makefile b/lib/Support/Makefile index 4a2185d589e5..39426aaaacee 100644 --- a/lib/Support/Makefile +++ b/lib/Support/Makefile @@ -17,3 +17,7 @@ include $(LEVEL)/Makefile.common CompileCommonOpts := $(filter-out -pedantic,$(CompileCommonOpts)) CompileCommonOpts := $(filter-out -Wno-long-long,$(CompileCommonOpts)) + +ifdef LLVM_VERSION_INFO +CompileCommonOpts += -DLLVM_VERSION_INFO='"$(LLVM_VERSION_INFO)"' +endif diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 629d8855f327..abf4f60b27a6 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -14,19 +14,20 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Errno.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" -#include "llvm/Support/system_error.h" #include #include #include #include #include #include +#include #if !defined(_MSC_VER) && !defined(__MINGW32__) #include #else @@ -156,9 +157,10 @@ MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { /// if the Filename is "-". If an error occurs, this returns null and fills /// in *ErrStr with a reason. If stdin is empty, this API (unlike getSTDIN) /// returns an empty buffer. 
-error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename, - std::unique_ptr &Result, - int64_t FileSize) { +std::error_code +MemoryBuffer::getFileOrSTDIN(StringRef Filename, + std::unique_ptr &Result, + int64_t FileSize) { if (Filename == "-") return getSTDIN(Result); return getFile(Filename, Result, FileSize); @@ -190,7 +192,7 @@ class MemoryBufferMMapFile : public MemoryBuffer { public: MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len, - uint64_t Offset, error_code EC) + uint64_t Offset, std::error_code EC) : MFR(FD, false, sys::fs::mapped_file_region::readonly, getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) { if (!EC) { @@ -210,9 +212,9 @@ class MemoryBufferMMapFile : public MemoryBuffer { }; } -static error_code getMemoryBufferForStream(int FD, - StringRef BufferName, - std::unique_ptr &Result) { +static std::error_code +getMemoryBufferForStream(int FD, StringRef BufferName, + std::unique_ptr &Result) { const ssize_t ChunkSize = 4096*4; SmallString Buffer; ssize_t ReadBytes; @@ -222,26 +224,25 @@ static error_code getMemoryBufferForStream(int FD, ReadBytes = read(FD, Buffer.end(), ChunkSize); if (ReadBytes == -1) { if (errno == EINTR) continue; - return error_code(errno, posix_category()); + return std::error_code(errno, std::generic_category()); } Buffer.set_size(Buffer.size() + ReadBytes); } while (ReadBytes != 0); Result.reset(MemoryBuffer::getMemBufferCopy(Buffer, BufferName)); - return error_code::success(); + return std::error_code(); } -static error_code getFileAux(const char *Filename, - std::unique_ptr &Result, - int64_t FileSize, - bool RequiresNullTerminator, - bool IsVolatileSize); - -error_code MemoryBuffer::getFile(Twine Filename, - std::unique_ptr &Result, - int64_t FileSize, - bool RequiresNullTerminator, - bool IsVolatileSize) { +static std::error_code getFileAux(const char *Filename, + std::unique_ptr &Result, + int64_t FileSize, bool RequiresNullTerminator, + bool IsVolatileSize); + +std::error_code MemoryBuffer::getFile(Twine Filename, + std::unique_ptr &Result, + int64_t FileSize, + bool RequiresNullTerminator, + bool IsVolatileSize) { // Ensure the path is null terminated. 
SmallString<256> PathBuf; StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf); @@ -249,23 +250,25 @@ error_code MemoryBuffer::getFile(Twine Filename, RequiresNullTerminator, IsVolatileSize); } -static error_code getOpenFileImpl(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, uint64_t MapSize, - int64_t Offset, bool RequiresNullTerminator, - bool IsVolatileSize); +static std::error_code getOpenFileImpl(int FD, const char *Filename, + std::unique_ptr &Result, + uint64_t FileSize, uint64_t MapSize, + int64_t Offset, + bool RequiresNullTerminator, + bool IsVolatileSize); -static error_code getFileAux(const char *Filename, - std::unique_ptr &Result, int64_t FileSize, - bool RequiresNullTerminator, - bool IsVolatileSize) { +static std::error_code getFileAux(const char *Filename, + std::unique_ptr &Result, + int64_t FileSize, bool RequiresNullTerminator, + bool IsVolatileSize) { int FD; - error_code EC = sys::fs::openFileForRead(Filename, FD); + std::error_code EC = sys::fs::openFileForRead(Filename, FD); if (EC) return EC; - error_code ret = getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, - RequiresNullTerminator, IsVolatileSize); + std::error_code ret = + getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, + RequiresNullTerminator, IsVolatileSize); close(FD); return ret; } @@ -318,11 +321,12 @@ static bool shouldUseMmap(int FD, return true; } -static error_code getOpenFileImpl(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, uint64_t MapSize, - int64_t Offset, bool RequiresNullTerminator, - bool IsVolatileSize) { +static std::error_code getOpenFileImpl(int FD, const char *Filename, + std::unique_ptr &Result, + uint64_t FileSize, uint64_t MapSize, + int64_t Offset, + bool RequiresNullTerminator, + bool IsVolatileSize) { static int PageSize = sys::process::get_self()->page_size(); // Default is to map the full file. @@ -331,7 +335,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename, // file descriptor is cheaper than stat on a random path. if (FileSize == uint64_t(-1)) { sys::fs::file_status Status; - error_code EC = sys::fs::status(FD, Status); + std::error_code EC = sys::fs::status(FD, Status); if (EC) return EC; @@ -350,11 +354,11 @@ static error_code getOpenFileImpl(int FD, const char *Filename, if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator, PageSize, IsVolatileSize)) { - error_code EC; + std::error_code EC; Result.reset(new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile( RequiresNullTerminator, FD, MapSize, Offset, EC)); if (!EC) - return error_code::success(); + return std::error_code(); } MemoryBuffer *Buf = MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename); @@ -370,7 +374,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename, size_t BytesLeft = MapSize; #ifndef HAVE_PREAD if (lseek(FD, Offset, SEEK_SET) == -1) - return error_code(errno, posix_category()); + return std::error_code(errno, std::generic_category()); #endif while (BytesLeft) { @@ -383,7 +387,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename, if (errno == EINTR) continue; // Error while reading. - return error_code(errno, posix_category()); + return std::error_code(errno, std::generic_category()); } if (NumRead == 0) { memset(BufPtr, 0, BytesLeft); // zero-initialize rest of the buffer. 
@@ -394,22 +398,21 @@ static error_code getOpenFileImpl(int FD, const char *Filename, } Result.swap(SB); - return error_code::success(); + return std::error_code(); } -error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t FileSize, - bool RequiresNullTerminator, - bool IsVolatileSize) { +std::error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, + std::unique_ptr &Result, + uint64_t FileSize, + bool RequiresNullTerminator, + bool IsVolatileSize) { return getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, RequiresNullTerminator, IsVolatileSize); } -error_code MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, - std::unique_ptr &Result, - uint64_t MapSize, int64_t Offset, - bool IsVolatileSize) { +std::error_code MemoryBuffer::getOpenFileSlice( + int FD, const char *Filename, std::unique_ptr &Result, + uint64_t MapSize, int64_t Offset, bool IsVolatileSize) { return getOpenFileImpl(FD, Filename, Result, -1, MapSize, Offset, false, IsVolatileSize); } @@ -418,7 +421,7 @@ error_code MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, // MemoryBuffer::getSTDIN implementation. //===----------------------------------------------------------------------===// -error_code MemoryBuffer::getSTDIN(std::unique_ptr &Result) { +std::error_code MemoryBuffer::getSTDIN(std::unique_ptr &Result) { // Read in all of the data from stdin, we cannot mmap stdin. // // FIXME: That isn't necessarily true, we should try to mmap stdin and diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index b8d676f286c9..15edf0ddbbd8 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Support/Errc.h" #include "llvm/Support/Path.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" @@ -164,12 +165,12 @@ enum FSEntity { }; // Implemented in Unix/Path.inc and Windows/Path.inc. -static error_code TempDir(SmallVectorImpl &result); +static std::error_code TempDir(SmallVectorImpl &result); -static error_code createUniqueEntity(const Twine &Model, int &ResultFD, - SmallVectorImpl &ResultPath, - bool MakeAbsolute, unsigned Mode, - FSEntity Type) { +static std::error_code createUniqueEntity(const Twine &Model, int &ResultFD, + SmallVectorImpl &ResultPath, + bool MakeAbsolute, unsigned Mode, + FSEntity Type) { SmallString<128> ModelStorage; Model.toVector(ModelStorage); @@ -177,7 +178,7 @@ static error_code createUniqueEntity(const Twine &Model, int &ResultFD, // Make model absolute by prepending a temp directory if it's not already. if (!sys::path::is_absolute(Twine(ModelStorage))) { SmallString<128> TDir; - if (error_code EC = TempDir(TDir)) + if (std::error_code EC = TempDir(TDir)) return EC; sys::path::append(TDir, Twine(ModelStorage)); ModelStorage.swap(TDir); @@ -201,7 +202,7 @@ static error_code createUniqueEntity(const Twine &Model, int &ResultFD, // Try to open + create the file. 
switch (Type) { case FS_File: { - if (error_code EC = + if (std::error_code EC = sys::fs::openFileForWrite(Twine(ResultPath.begin()), ResultFD, sys::fs::F_RW | sys::fs::F_Excl, Mode)) { if (EC == errc::file_exists) @@ -209,26 +210,27 @@ static error_code createUniqueEntity(const Twine &Model, int &ResultFD, return EC; } - return error_code::success(); + return std::error_code(); } case FS_Name: { bool Exists; - error_code EC = sys::fs::exists(ResultPath.begin(), Exists); + std::error_code EC = sys::fs::exists(ResultPath.begin(), Exists); if (EC) return EC; if (Exists) goto retry_random_path; - return error_code::success(); + return std::error_code(); } case FS_Dir: { - if (error_code EC = sys::fs::create_directory(ResultPath.begin(), false)) { + if (std::error_code EC = + sys::fs::create_directory(ResultPath.begin(), false)) { if (EC == errc::file_exists) goto retry_random_path; return EC; } - return error_code::success(); + return std::error_code(); } } llvm_unreachable("Invalid Type"); @@ -705,29 +707,30 @@ bool is_relative(const Twine &path) { namespace fs { -error_code getUniqueID(const Twine Path, UniqueID &Result) { +std::error_code getUniqueID(const Twine Path, UniqueID &Result) { file_status Status; - error_code EC = status(Path, Status); + std::error_code EC = status(Path, Status); if (EC) return EC; Result = Status.getUniqueID(); - return error_code::success(); + return std::error_code(); } -error_code createUniqueFile(const Twine &Model, int &ResultFd, - SmallVectorImpl &ResultPath, unsigned Mode) { +std::error_code createUniqueFile(const Twine &Model, int &ResultFd, + SmallVectorImpl &ResultPath, + unsigned Mode) { return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File); } -error_code createUniqueFile(const Twine &Model, - SmallVectorImpl &ResultPath) { +std::error_code createUniqueFile(const Twine &Model, + SmallVectorImpl &ResultPath) { int Dummy; return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name); } -static error_code createTemporaryFile(const Twine &Model, int &ResultFD, - llvm::SmallVectorImpl &ResultPath, - FSEntity Type) { +static std::error_code +createTemporaryFile(const Twine &Model, int &ResultFD, + llvm::SmallVectorImpl &ResultPath, FSEntity Type) { SmallString<128> Storage; StringRef P = Model.toNullTerminatedStringRef(Storage); assert(P.find_first_of(separators) == StringRef::npos && @@ -737,24 +740,22 @@ static error_code createTemporaryFile(const Twine &Model, int &ResultFD, true, owner_read | owner_write, Type); } -static error_code +static std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD, - llvm::SmallVectorImpl &ResultPath, - FSEntity Type) { + llvm::SmallVectorImpl &ResultPath, FSEntity Type) { const char *Middle = Suffix.empty() ? 
"-%%%%%%" : "-%%%%%%."; return createTemporaryFile(Prefix + Middle + Suffix, ResultFD, ResultPath, Type); } - -error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, - int &ResultFD, - SmallVectorImpl &ResultPath) { +std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, + int &ResultFD, + SmallVectorImpl &ResultPath) { return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File); } -error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, - SmallVectorImpl &ResultPath) { +std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, + SmallVectorImpl &ResultPath) { int Dummy; return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name); } @@ -762,14 +763,14 @@ error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, // This is a mkdtemp with a different pattern. We use createUniqueEntity mostly // for consistency. We should try using mkdtemp. -error_code createUniqueDirectory(const Twine &Prefix, - SmallVectorImpl &ResultPath) { +std::error_code createUniqueDirectory(const Twine &Prefix, + SmallVectorImpl &ResultPath) { int Dummy; return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath, true, 0, FS_Dir); } -error_code make_absolute(SmallVectorImpl &path) { +std::error_code make_absolute(SmallVectorImpl &path) { StringRef p(path.data(), path.size()); bool rootDirectory = path::has_root_directory(p), @@ -781,11 +782,12 @@ error_code make_absolute(SmallVectorImpl &path) { // Already absolute. if (rootName && rootDirectory) - return error_code::success(); + return std::error_code(); // All of the following conditions will need the current directory. SmallString<128> current_dir; - if (error_code ec = current_path(current_dir)) return ec; + if (std::error_code ec = current_path(current_dir)) + return ec; // Relative path. Prepend the current directory. if (!rootName && !rootDirectory) { @@ -793,7 +795,7 @@ error_code make_absolute(SmallVectorImpl &path) { path::append(current_dir, p); // Set path to the result. path.swap(current_dir); - return error_code::success(); + return std::error_code(); } if (!rootName && rootDirectory) { @@ -802,7 +804,7 @@ error_code make_absolute(SmallVectorImpl &path) { path::append(curDirRootName, p); // Set path to the result. path.swap(curDirRootName); - return error_code::success(); + return std::error_code(); } if (rootName && !rootDirectory) { @@ -814,19 +816,19 @@ error_code make_absolute(SmallVectorImpl &path) { SmallString<128> res; path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath); path.swap(res); - return error_code::success(); + return std::error_code(); } llvm_unreachable("All rootName and rootDirectory combinations should have " "occurred above!"); } -error_code create_directories(const Twine &Path, bool IgnoreExisting) { +std::error_code create_directories(const Twine &Path, bool IgnoreExisting) { SmallString<128> PathStorage; StringRef P = Path.toStringRef(PathStorage); // Be optimistic and try to create the directory - error_code EC = create_directory(P, IgnoreExisting); + std::error_code EC = create_directory(P, IgnoreExisting); // If we succeeded, or had any error other than the parent not existing, just // return it. 
if (EC != errc::no_such_file_or_directory) @@ -856,24 +858,24 @@ bool is_directory(file_status status) { return status.type() == file_type::directory_file; } -error_code is_directory(const Twine &path, bool &result) { +std::error_code is_directory(const Twine &path, bool &result) { file_status st; - if (error_code ec = status(path, st)) + if (std::error_code ec = status(path, st)) return ec; result = is_directory(st); - return error_code::success(); + return std::error_code(); } bool is_regular_file(file_status status) { return status.type() == file_type::regular_file; } -error_code is_regular_file(const Twine &path, bool &result) { +std::error_code is_regular_file(const Twine &path, bool &result) { file_status st; - if (error_code ec = status(path, st)) + if (std::error_code ec = status(path, st)) return ec; result = is_regular_file(st); - return error_code::success(); + return std::error_code(); } bool is_other(file_status status) { @@ -890,24 +892,6 @@ void directory_entry::replace_filename(const Twine &filename, file_status st) { Status = st; } -error_code has_magic(const Twine &path, const Twine &magic, bool &result) { - SmallString<32> MagicStorage; - StringRef Magic = magic.toStringRef(MagicStorage); - SmallString<32> Buffer; - - if (error_code ec = get_magic(path, Magic.size(), Buffer)) { - if (ec == errc::value_too_large) { - // Magic.size() > file_size(Path). - result = false; - return error_code::success(); - } - return ec; - } - - result = Magic == Buffer; - return error_code::success(); -} - /// @brief Identify the magic in magic. file_magic identify_magic(StringRef Magic) { if (Magic.size() < 4) @@ -1040,17 +1024,21 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) { return file_magic::unknown; } -error_code identify_magic(const Twine &path, file_magic &result) { - SmallString<32> Magic; - error_code ec = get_magic(path, Magic.capacity(), Magic); - if (ec && ec != errc::value_too_large) - return ec; +std::error_code identify_magic(const Twine &Path, file_magic &Result) { + int FD; + if (std::error_code EC = openFileForRead(Path, FD)) + return EC; + + char Buffer[32]; + int Length = read(FD, Buffer, sizeof(Buffer)); + if (Length < 0) + return std::error_code(errno, std::generic_category()); - result = identify_magic(Magic); - return error_code::success(); + Result = identify_magic(StringRef(Buffer, Length)); + return std::error_code(); } -error_code directory_entry::status(file_status &result) const { +std::error_code directory_entry::status(file_status &result) const { return fs::status(Path, result); } diff --git a/lib/Support/Program.cpp b/lib/Support/Program.cpp index 83f2ec4f0fc4..eb700e3a8591 100644 --- a/lib/Support/Program.cpp +++ b/lib/Support/Program.cpp @@ -13,7 +13,7 @@ #include "llvm/Support/Program.h" #include "llvm/Config/config.h" -#include "llvm/Support/system_error.h" +#include using namespace llvm; using namespace sys; @@ -34,7 +34,8 @@ int sys::ExecuteAndWait(StringRef Program, const char **args, const char **envp, if (Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg)) { if (ExecutionFailed) *ExecutionFailed = false; - ProcessInfo Result = Wait(PI, secondsToWait, true, ErrMsg); + ProcessInfo Result = Wait( + PI, secondsToWait, /*WaitUntilTerminates=*/secondsToWait == 0, ErrMsg); return Result.ReturnCode; } diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index acd75fbbd192..4d00d3baa417 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -20,7 +20,7 @@ #include 
"llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" +#include using namespace llvm; static const size_t TabStop = 8; @@ -49,9 +49,6 @@ SourceMgr::~SourceMgr() { } } -/// AddIncludeFile - Search for a file with the specified name in the current -/// directory or in one of the IncludeDirs. If no file is found, this returns -/// ~0, otherwise it returns the buffer ID of the stacked file. size_t SourceMgr::AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc, std::string &IncludedFile) { @@ -71,8 +68,6 @@ size_t SourceMgr::AddIncludeFile(const std::string &Filename, } -/// FindBufferContainingLoc - Return the ID of the buffer containing the -/// specified location, returning -1 if not found. int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { for (unsigned i = 0, e = Buffers.size(); i != e; ++i) if (Loc.getPointer() >= Buffers[i].Buffer->getBufferStart() && @@ -83,8 +78,6 @@ int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const { return -1; } -/// getLineAndColumn - Find the line and column number for the specified -/// location in the specified file. This is not a fast method. std::pair SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const { if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc); @@ -143,11 +136,6 @@ void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const { } -/// GetMessage - Return an SMDiagnostic at the specified location with the -/// specified string. -/// -/// @param Type - If non-null, the kind of message (e.g., "error") which is -/// prefixed to the message. SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, ArrayRef Ranges, @@ -211,20 +199,16 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, LineStr, ColRanges, FixIts); } -void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc, - SourceMgr::DiagKind Kind, - const Twine &Msg, ArrayRef Ranges, - ArrayRef FixIts, bool ShowColors) const { - SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts); - +void SourceMgr::PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic, + bool ShowColors) const { // Report the message with the diagnostic handler if present. 
if (DiagHandler) { DiagHandler(Diagnostic, DiagContext); return; } - if (Loc != SMLoc()) { - int CurBuf = FindBufferContainingLoc(Loc); + if (Diagnostic.getLoc().isValid()) { + int CurBuf = FindBufferContainingLoc(Diagnostic.getLoc()); assert(CurBuf != -1 && "Invalid or unspecified location!"); PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); } @@ -232,6 +216,13 @@ void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc, Diagnostic.print(nullptr, OS, ShowColors); } +void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc, + SourceMgr::DiagKind Kind, + const Twine &Msg, ArrayRef Ranges, + ArrayRef FixIts, bool ShowColors) const { + PrintMessage(OS, GetMessage(Loc, Kind, Msg, Ranges, FixIts), ShowColors); +} + void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, ArrayRef Ranges, ArrayRef FixIts, bool ShowColors) const { diff --git a/lib/Support/StringPool.cpp b/lib/Support/StringPool.cpp index ff607cf8c4ad..76faabc92bb5 100644 --- a/lib/Support/StringPool.cpp +++ b/lib/Support/StringPool.cpp @@ -27,7 +27,7 @@ PooledStringPtr StringPool::intern(StringRef Key) { if (I != InternTable.end()) return PooledStringPtr(&*I); - entry_t *S = entry_t::Create(Key.begin(), Key.end()); + entry_t *S = entry_t::Create(Key); S->getValue().Pool = this; InternTable.insert(S); diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc index 23b49b75fa51..c9d89a82474d 100644 --- a/lib/Support/Unix/Memory.inc +++ b/lib/Support/Unix/Memory.inc @@ -83,8 +83,8 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes, const MemoryBlock *const NearBlock, unsigned PFlags, - error_code &EC) { - EC = error_code::success(); + std::error_code &EC) { + EC = std::error_code(); if (NumBytes == 0) return MemoryBlock(); @@ -95,7 +95,7 @@ Memory::allocateMappedMemory(size_t NumBytes, #ifdef NEED_DEV_ZERO_FOR_MMAP static int zero_fd = open("/dev/zero", O_RDWR); if (zero_fd == -1) { - EC = error_code(errno, system_category()); + EC = std::error_code(errno, std::generic_category()); return MemoryBlock(); } fd = zero_fd; @@ -123,7 +123,7 @@ Memory::allocateMappedMemory(size_t NumBytes, if (NearBlock) //Try again without a near hint return allocateMappedMemory(NumBytes, nullptr, PFlags, EC); - EC = error_code(errno, system_category()); + EC = std::error_code(errno, std::generic_category()); return MemoryBlock(); } @@ -137,38 +137,38 @@ Memory::allocateMappedMemory(size_t NumBytes, return Result; } -error_code +std::error_code Memory::releaseMappedMemory(MemoryBlock &M) { if (M.Address == nullptr || M.Size == 0) - return error_code::success(); + return std::error_code(); if (0 != ::munmap(M.Address, M.Size)) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); M.Address = nullptr; M.Size = 0; - return error_code::success(); + return std::error_code(); } -error_code +std::error_code Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { if (M.Address == nullptr || M.Size == 0) - return error_code::success(); + return std::error_code(); if (!Flags) - return error_code(EINVAL, generic_category()); + return std::error_code(EINVAL, std::generic_category()); int Protect = getPosixProtectionFlags(Flags); int Result = ::mprotect(M.Address, M.Size, Protect); if (Result != 0) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); if (Flags & MF_EXEC) Memory::InvalidateInstructionCache(M.Address, M.Size); - return error_code::success(); + return std::error_code(); } /// 
AllocateRWX - Allocate a slab of memory with read/write/execute diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index 519a016a84c2..c9fae4298689 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -87,7 +87,7 @@ namespace { }; } -static error_code TempDir(SmallVectorImpl &result) { +static std::error_code TempDir(SmallVectorImpl &result) { // FIXME: Don't use TMPDIR if program is SUID or SGID enabled. const char *dir = nullptr; (dir = std::getenv("TMPDIR")) || (dir = std::getenv("TMP")) || @@ -100,7 +100,7 @@ static error_code TempDir(SmallVectorImpl &result) { result.clear(); StringRef d(dir); result.append(d.begin(), d.end()); - return error_code::success(); + return std::error_code(); } namespace llvm { @@ -225,7 +225,7 @@ UniqueID file_status::getUniqueID() const { return UniqueID(fs_st_dev, fs_st_ino); } -error_code current_path(SmallVectorImpl &result) { +std::error_code current_path(SmallVectorImpl &result) { result.clear(); const char *pwd = ::getenv("PWD"); @@ -235,7 +235,7 @@ error_code current_path(SmallVectorImpl &result) { !llvm::sys::fs::status(".", DotStatus) && PWDStatus.getUniqueID() == DotStatus.getUniqueID()) { result.append(pwd, pwd + strlen(pwd)); - return error_code::success(); + return std::error_code(); } #ifdef MAXPATHLEN @@ -248,8 +248,8 @@ error_code current_path(SmallVectorImpl &result) { while (true) { if (::getcwd(result.data(), result.capacity()) == nullptr) { // See if there was a real error. - if (errno != errc::not_enough_memory) - return error_code(errno, system_category()); + if (errno != ENOMEM) + return std::error_code(errno, std::generic_category()); // Otherwise there just wasn't enough space. result.reserve(result.capacity() * 2); } else @@ -257,22 +257,22 @@ error_code current_path(SmallVectorImpl &result) { } result.set_size(strlen(result.data())); - return error_code::success(); + return std::error_code(); } -error_code create_directory(const Twine &path, bool IgnoreExisting) { +std::error_code create_directory(const Twine &path, bool IgnoreExisting) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) { - if (errno != errc::file_exists || !IgnoreExisting) - return error_code(errno, system_category()); + if (errno != EEXIST || !IgnoreExisting) + return std::error_code(errno, std::generic_category()); } - return error_code::success(); + return std::error_code(); } -error_code normalize_separators(SmallVectorImpl &Path) { +std::error_code normalize_separators(SmallVectorImpl &Path) { for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) { if (*PI == '\\') { auto PN = PI + 1; @@ -282,12 +282,12 @@ error_code normalize_separators(SmallVectorImpl &Path) { *PI = '/'; } } - return error_code::success(); + return std::error_code(); } // Note that we are using symbolic link because hard links are not supported by // all filesystems (SMB doesn't). -error_code create_link(const Twine &to, const Twine &from) { +std::error_code create_link(const Twine &to, const Twine &from) { // Get arguments. 
SmallString<128> from_storage; SmallString<128> to_storage; @@ -295,20 +295,20 @@ error_code create_link(const Twine &to, const Twine &from) { StringRef t = to.toNullTerminatedStringRef(to_storage); if (::symlink(t.begin(), f.begin()) == -1) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); - return error_code::success(); + return std::error_code(); } -error_code remove(const Twine &path, bool IgnoreNonExisting) { +std::error_code remove(const Twine &path, bool IgnoreNonExisting) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); struct stat buf; if (lstat(p.begin(), &buf) != 0) { - if (errno != errc::no_such_file_or_directory || !IgnoreNonExisting) - return error_code(errno, system_category()); - return error_code::success(); + if (errno != ENOENT || !IgnoreNonExisting) + return std::error_code(errno, std::generic_category()); + return std::error_code(); } // Note: this check catches strange situations. In all cases, LLVM should @@ -320,14 +320,14 @@ error_code remove(const Twine &path, bool IgnoreNonExisting) { return make_error_code(errc::operation_not_permitted); if (::remove(p.begin()) == -1) { - if (errno != errc::no_such_file_or_directory || !IgnoreNonExisting) - return error_code(errno, system_category()); + if (errno != ENOENT || !IgnoreNonExisting) + return std::error_code(errno, std::generic_category()); } - return error_code::success(); + return std::error_code(); } -error_code rename(const Twine &from, const Twine &to) { +std::error_code rename(const Twine &from, const Twine &to) { // Get arguments. SmallString<128> from_storage; SmallString<128> to_storage; @@ -335,33 +335,33 @@ error_code rename(const Twine &from, const Twine &to) { StringRef t = to.toNullTerminatedStringRef(to_storage); if (::rename(f.begin(), t.begin()) == -1) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); - return error_code::success(); + return std::error_code(); } -error_code resize_file(const Twine &path, uint64_t size) { +std::error_code resize_file(const Twine &path, uint64_t size) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); if (::truncate(p.begin(), size) == -1) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); - return error_code::success(); + return std::error_code(); } -error_code exists(const Twine &path, bool &result) { +std::error_code exists(const Twine &path, bool &result) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); if (::access(p.begin(), F_OK) == -1) { - if (errno != errc::no_such_file_or_directory) - return error_code(errno, system_category()); + if (errno != ENOENT) + return std::error_code(errno, std::generic_category()); result = false; } else result = true; - return error_code::success(); + return std::error_code(); } bool can_write(const Twine &Path) { @@ -390,18 +390,20 @@ bool equivalent(file_status A, file_status B) { A.fs_st_ino == B.fs_st_ino; } -error_code equivalent(const Twine &A, const Twine &B, bool &result) { +std::error_code equivalent(const Twine &A, const Twine &B, bool &result) { file_status fsA, fsB; - if (error_code ec = status(A, fsA)) return ec; - if (error_code ec = status(B, fsB)) return ec; + if (std::error_code ec = status(A, fsA)) + return ec; + if (std::error_code ec = status(B, fsB)) + return ec; result = equivalent(fsA, fsB); - return 
error_code::success(); + return std::error_code(); } -static error_code fillStatus(int StatRet, const struct stat &Status, +static std::error_code fillStatus(int StatRet, const struct stat &Status, file_status &Result) { if (StatRet != 0) { - error_code ec(errno, system_category()); + std::error_code ec(errno, std::generic_category()); if (ec == errc::no_such_file_or_directory) Result = file_status(file_type::file_not_found); else @@ -429,10 +431,10 @@ static error_code fillStatus(int StatRet, const struct stat &Status, file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_mtime, Status.st_uid, Status.st_gid, Status.st_size); - return error_code::success(); + return std::error_code(); } -error_code status(const Twine &Path, file_status &Result) { +std::error_code status(const Twine &Path, file_status &Result) { SmallString<128> PathStorage; StringRef P = Path.toNullTerminatedStringRef(PathStorage); @@ -441,36 +443,36 @@ error_code status(const Twine &Path, file_status &Result) { return fillStatus(StatRet, Status, Result); } -error_code status(int FD, file_status &Result) { +std::error_code status(int FD, file_status &Result) { struct stat Status; int StatRet = ::fstat(FD, &Status); return fillStatus(StatRet, Status, Result); } -error_code setLastModificationAndAccessTime(int FD, TimeValue Time) { +std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) { #if defined(HAVE_FUTIMENS) timespec Times[2]; Times[0].tv_sec = Time.toEpochTime(); Times[0].tv_nsec = 0; Times[1] = Times[0]; if (::futimens(FD, Times)) - return error_code(errno, system_category()); - return error_code::success(); + return std::error_code(errno, std::generic_category()); + return std::error_code(); #elif defined(HAVE_FUTIMES) timeval Times[2]; Times[0].tv_sec = Time.toEpochTime(); Times[0].tv_usec = 0; Times[1] = Times[0]; if (::futimes(FD, Times)) - return error_code(errno, system_category()); - return error_code::success(); + return std::error_code(errno, std::generic_category()); + return std::error_code(); #else #warning Missing futimes() and futimens() return make_error_code(errc::not_supported); #endif } -error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { +std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { AutoFD ScopedFD(FD); if (!CloseFD) ScopedFD.take(); @@ -478,7 +480,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { // Figure out how large the file is. struct stat FileInfo; if (fstat(FD, &FileInfo) == -1) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); uint64_t FileSize = FileInfo.st_size; if (Size == 0) @@ -486,7 +488,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { else if (FileSize < Size) { // We need to grow the file. if (ftruncate(FD, Size) == -1) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); } int flags = (Mode == readwrite) ? 
MAP_SHARED : MAP_PRIVATE; @@ -496,15 +498,15 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { #endif Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset); if (Mapping == MAP_FAILED) - return error_code(errno, system_category()); - return error_code::success(); + return std::error_code(errno, std::generic_category()); + return std::error_code(); } mapped_file_region::mapped_file_region(const Twine &path, mapmode mode, uint64_t length, uint64_t offset, - error_code &ec) + std::error_code &ec) : Mode(mode) , Size(length) , Mapping() { @@ -519,7 +521,7 @@ mapped_file_region::mapped_file_region(const Twine &path, int oflags = (mode == readonly) ? O_RDONLY : O_RDWR; int ofd = ::open(name.begin(), oflags); if (ofd == -1) { - ec = error_code(errno, system_category()); + ec = std::error_code(errno, std::generic_category()); return; } @@ -533,7 +535,7 @@ mapped_file_region::mapped_file_region(int fd, mapmode mode, uint64_t length, uint64_t offset, - error_code &ec) + std::error_code &ec) : Mode(mode) , Size(length) , Mapping() { @@ -583,12 +585,12 @@ int mapped_file_region::alignment() { return process::get_self()->page_size(); } -error_code detail::directory_iterator_construct(detail::DirIterState &it, +std::error_code detail::directory_iterator_construct(detail::DirIterState &it, StringRef path){ SmallString<128> path_null(path); DIR *directory = ::opendir(path_null.c_str()); if (!directory) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); it.IterationHandle = reinterpret_cast(directory); // Add something for replace_filename to replace. @@ -597,19 +599,19 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it, return directory_iterator_increment(it); } -error_code detail::directory_iterator_destruct(detail::DirIterState &it) { +std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) { if (it.IterationHandle) ::closedir(reinterpret_cast(it.IterationHandle)); it.IterationHandle = 0; it.CurrentEntry = directory_entry(); - return error_code::success(); + return std::error_code(); } -error_code detail::directory_iterator_increment(detail::DirIterState &it) { +std::error_code detail::directory_iterator_increment(detail::DirIterState &it) { errno = 0; dirent *cur_dir = ::readdir(reinterpret_cast(it.IterationHandle)); if (cur_dir == nullptr && errno != 0) { - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); } else if (cur_dir != nullptr) { StringRef name(cur_dir->d_name, NAMLEN(cur_dir)); if ((name.size() == 1 && name[0] == '.') || @@ -619,80 +621,20 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) { } else return directory_iterator_destruct(it); - return error_code::success(); -} - -error_code get_magic(const Twine &path, uint32_t len, - SmallVectorImpl &result) { - SmallString<128> PathStorage; - StringRef Path = path.toNullTerminatedStringRef(PathStorage); - result.set_size(0); - - // Open path. - std::FILE *file = std::fopen(Path.data(), "rb"); - if (!file) - return error_code(errno, system_category()); - - // Reserve storage. - result.reserve(len); - - // Read magic! 
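// Not part of the patch: a standalone sketch of the readdir() loop that
// detail::directory_iterator_increment above relies on. errno is cleared
// before each readdir() call so a null return can be told apart from a
// real error, "." and ".." are skipped, and failures are wrapped into
// std::error_code exactly as in the surrounding code. Function name and
// the "." test path are illustrative.
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <dirent.h>
#include <system_error>

static std::error_code listDirectory(const char *Path) {
  DIR *D = ::opendir(Path);
  if (!D)
    return std::error_code(errno, std::generic_category());
  for (;;) {
    errno = 0;
    dirent *Entry = ::readdir(D);
    if (!Entry) {
      std::error_code EC = errno
                               ? std::error_code(errno, std::generic_category())
                               : std::error_code(); // plain end of directory
      ::closedir(D);
      return EC;
    }
    if (!std::strcmp(Entry->d_name, ".") || !std::strcmp(Entry->d_name, ".."))
      continue;                                      // skip dot entries, as above
    std::puts(Entry->d_name);
  }
}

int main() { return listDirectory(".") ? 1 : 0; }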
- size_t size = std::fread(result.data(), 1, len, file); - if (std::ferror(file) != 0) { - std::fclose(file); - return error_code(errno, system_category()); - } else if (size != len) { - if (std::feof(file) != 0) { - std::fclose(file); - result.set_size(size); - return make_error_code(errc::value_too_large); - } - } - std::fclose(file); - result.set_size(size); - return error_code::success(); -} - -error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, - bool map_writable, void *&result) { - SmallString<128> path_storage; - StringRef name = path.toNullTerminatedStringRef(path_storage); - int oflags = map_writable ? O_RDWR : O_RDONLY; - int ofd = ::open(name.begin(), oflags); - if ( ofd == -1 ) - return error_code(errno, system_category()); - AutoFD fd(ofd); - int flags = map_writable ? MAP_SHARED : MAP_PRIVATE; - int prot = map_writable ? (PROT_READ|PROT_WRITE) : PROT_READ; -#ifdef MAP_FILE - flags |= MAP_FILE; -#endif - result = ::mmap(nullptr, size, prot, flags, fd, file_offset); - if (result == MAP_FAILED) { - return error_code(errno, system_category()); - } - - return error_code::success(); -} - -error_code unmap_file_pages(void *base, size_t size) { - if ( ::munmap(base, size) == -1 ) - return error_code(errno, system_category()); - - return error_code::success(); + return std::error_code(); } -error_code openFileForRead(const Twine &Name, int &ResultFD) { +std::error_code openFileForRead(const Twine &Name, int &ResultFD) { SmallString<128> Storage; StringRef P = Name.toNullTerminatedStringRef(Storage); while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) { if (errno != EINTR) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); } - return error_code::success(); + return std::error_code(); } -error_code openFileForWrite(const Twine &Name, int &ResultFD, +std::error_code openFileForWrite(const Twine &Name, int &ResultFD, sys::fs::OpenFlags Flags, unsigned Mode) { // Verify that we don't have both "append" and "excl". 
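// Not part of the patch: a sketch of the EINTR retry loop used by
// openFileForRead/openFileForWrite above. open(2) is retried while it is
// interrupted by a signal, and any other errno becomes a std::error_code.
// Helper name and the /dev/null test path are illustrative.
#include <cerrno>
#include <fcntl.h>
#include <system_error>
#include <unistd.h>

static std::error_code openForRead(const char *Path, int &ResultFD) {
  while ((ResultFD = ::open(Path, O_RDONLY)) < 0) {
    if (errno != EINTR)                      // only EINTR is worth retrying
      return std::error_code(errno, std::generic_category());
  }
  return std::error_code();
}

int main() {
  int FD = -1;
  if (std::error_code EC = openForRead("/dev/null", FD))
    return 1;
  ::close(FD);
  return 0;
}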
assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) && @@ -717,9 +659,9 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD, StringRef P = Name.toNullTerminatedStringRef(Storage); while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) { if (errno != EINTR) - return error_code(errno, system_category()); + return std::error_code(errno, std::generic_category()); } - return error_code::success(); + return std::error_code(); } } // end namespace fs diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 8faa638ec8a3..d2c5dbcf6ebb 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -47,7 +47,6 @@ using namespace llvm; using namespace sys; - process::id_type self_process::get_id() { return getpid(); } @@ -190,12 +189,13 @@ Optional Process::GetEnv(StringRef Name) { return std::string(Val); } -error_code Process::GetArgumentVector(SmallVectorImpl &ArgsOut, - ArrayRef ArgsIn, - SpecificBumpPtrAllocator &) { +std::error_code +Process::GetArgumentVector(SmallVectorImpl &ArgsOut, + ArrayRef ArgsIn, + SpecificBumpPtrAllocator &) { ArgsOut.append(ArgsIn.begin(), ArgsIn.end()); - return error_code::success(); + return std::error_code(); } bool Process::StandardInIsUserInput() { diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index 1225a9c042e5..50f973a850b6 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -48,6 +48,7 @@ #endif namespace llvm { + using namespace sys; ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {} @@ -425,14 +426,14 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, return WaitResult; } -error_code sys::ChangeStdinToBinary(){ + std::error_code sys::ChangeStdinToBinary(){ // Do nothing, as Unix doesn't differentiate between text and binary. - return make_error_code(errc::success); + return std::error_code(); } -error_code sys::ChangeStdoutToBinary(){ + std::error_code sys::ChangeStdoutToBinary(){ // Do nothing, as Unix doesn't differentiate between text and binary. - return make_error_code(errc::success); + return std::error_code(); } bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef Args) { diff --git a/lib/Support/Unix/system_error.inc b/lib/Support/Unix/system_error.inc deleted file mode 100644 index 681e919edb4e..000000000000 --- a/lib/Support/Unix/system_error.inc +++ /dev/null @@ -1,34 +0,0 @@ -//===- llvm/Support/Unix/system_error.inc - Unix error_code ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides the Unix specific implementation of the error_code -// and error_condition classes. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only generic UNIX code that -//=== is guaranteed to work on *all* UNIX variants. 
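// Not part of the patch: a sketch of why the hand-rolled
// Unix/system_error.inc being deleted here is no longer needed. The
// standard library already maps generic_category() codes onto std::errc,
// so comparisons such as EC == errc::no_such_file_or_directory work
// without a custom default_error_condition(); LLVM's llvm::errc
// (Support/Errc.h) is assumed to mirror these std::errc values.
#include <cassert>
#include <cerrno>
#include <system_error>

int main() {
  std::error_code EC(ENOENT, std::generic_category());
  assert(EC == std::errc::no_such_file_or_directory); // the category does the mapping
  assert(!std::error_code());                         // default-constructed == success
  (void)EC;
  return 0;
}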
-//===----------------------------------------------------------------------===// - -using namespace llvm; - -std::string -_system_error_category::message(int ev) const { - return _do_message::message(ev); -} - -error_condition -_system_error_category::default_error_condition(int ev) const { -#ifdef ELAST - if (ev > ELAST) - return error_condition(ev, system_category()); -#endif // ELAST - return error_condition(ev, generic_category()); -} diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc index 5d0278fe3c30..5ed0b709fa68 100644 --- a/lib/Support/Windows/DynamicLibrary.inc +++ b/lib/Support/Windows/DynamicLibrary.inc @@ -85,7 +85,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, } SmallVector filenameUnicode; - if (error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) { + if (std::error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) { SetLastError(ec.value()); MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: "); return DynamicLibrary(); diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc index ebe78782b904..ae8371abf5b3 100644 --- a/lib/Support/Windows/Memory.inc +++ b/lib/Support/Windows/Memory.inc @@ -15,6 +15,7 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Process.h" +#include "llvm/Support/WindowsError.h" // The Windows.h header must be the last one included. #include "WindowsSupport.h" @@ -69,8 +70,8 @@ namespace sys { MemoryBlock Memory::allocateMappedMemory(size_t NumBytes, const MemoryBlock *const NearBlock, unsigned Flags, - error_code &EC) { - EC = error_code::success(); + std::error_code &EC) { + EC = std::error_code(); if (NumBytes == 0) return MemoryBlock(); @@ -99,7 +100,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes, // Try again without the NearBlock hint return allocateMappedMemory(NumBytes, NULL, Flags, EC); } - EC = error_code(::GetLastError(), system_category()); + EC = mapWindowsError(::GetLastError()); return MemoryBlock(); } @@ -113,34 +114,34 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes, return Result; } -error_code Memory::releaseMappedMemory(MemoryBlock &M) { + std::error_code Memory::releaseMappedMemory(MemoryBlock &M) { if (M.Address == 0 || M.Size == 0) - return error_code::success(); + return std::error_code(); if (!VirtualFree(M.Address, 0, MEM_RELEASE)) - return error_code(::GetLastError(), system_category()); + return mapWindowsError(::GetLastError()); M.Address = 0; M.Size = 0; - return error_code::success(); + return std::error_code(); } -error_code Memory::protectMappedMemory(const MemoryBlock &M, + std::error_code Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { if (M.Address == 0 || M.Size == 0) - return error_code::success(); + return std::error_code(); DWORD Protect = getWindowsProtectionFlags(Flags); DWORD OldFlags; if (!VirtualProtect(M.Address, M.Size, Protect, &OldFlags)) - return error_code(::GetLastError(), system_category()); + return mapWindowsError(::GetLastError()); if (Flags & MF_EXEC) Memory::InvalidateInstructionCache(M.Address, M.Size); - return error_code::success(); + return std::error_code(); } /// InvalidateInstructionCache - Before the JIT can run a block of code @@ -156,18 +157,18 @@ MemoryBlock Memory::AllocateRWX(size_t NumBytes, const MemoryBlock *NearBlock, std::string *ErrMsg) { MemoryBlock MB; - error_code EC; + std::error_code EC; MB = allocateMappedMemory(NumBytes, NearBlock, 
MF_READ|MF_WRITE|MF_EXEC, EC); - if (EC != error_code::success() && ErrMsg) { + if (EC != std::error_code() && ErrMsg) { MakeErrMsg(ErrMsg, EC.message()); } return MB; } bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) { - error_code EC = releaseMappedMemory(M); - if (EC == error_code::success()) + std::error_code EC = releaseMappedMemory(M); + if (EC == std::error_code()) return false; MakeErrMsg(ErrMsg, EC.message()); return true; diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index e59888e3858b..7a1bc0447fac 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/WindowsError.h" #include #include #include @@ -44,7 +45,11 @@ using namespace llvm; using llvm::sys::windows::UTF8ToUTF16; using llvm::sys::windows::UTF16ToUTF8; -static error_code TempDir(SmallVectorImpl &Result) { +static std::error_code windows_error(DWORD E) { + return mapWindowsError(E); +} + +static std::error_code TempDir(SmallVectorImpl &Result) { SmallVector Res; retry_temp_dir: DWORD Len = ::GetTempPathW(Res.capacity(), Res.begin()); @@ -119,7 +124,7 @@ TimeValue file_status::getLastModificationTime() const { return Ret; } -error_code current_path(SmallVectorImpl &result) { +std::error_code current_path(SmallVectorImpl &result) { SmallVector cur_path; DWORD len = MAX_PATH; @@ -141,30 +146,30 @@ error_code current_path(SmallVectorImpl &result) { return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result); } -error_code create_directory(const Twine &path, bool IgnoreExisting) { +std::error_code create_directory(const Twine &path, bool IgnoreExisting) { SmallString<128> path_storage; SmallVector path_utf16; - if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), - path_utf16)) + if (std::error_code ec = + UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)) return ec; if (!::CreateDirectoryW(path_utf16.begin(), NULL)) { - error_code ec = windows_error(::GetLastError()); - if (ec != windows_error::already_exists || !IgnoreExisting) - return ec; + DWORD LastError = ::GetLastError(); + if (LastError != ERROR_ALREADY_EXISTS || !IgnoreExisting) + return windows_error(LastError); } - return error_code::success(); + return std::error_code(); } -error_code normalize_separators(SmallVectorImpl &Path) { +std::error_code normalize_separators(SmallVectorImpl &Path) { (void) Path; - return error_code::success(); + return std::error_code(); } // We can't use symbolic links for windows. -error_code create_link(const Twine &to, const Twine &from) { +std::error_code create_link(const Twine &to, const Twine &from) { // Get arguments. SmallString<128> from_storage; SmallString<128> to_storage; @@ -174,47 +179,49 @@ error_code create_link(const Twine &to, const Twine &from) { // Convert to utf-16. 
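// Not part of the patch: a Windows-only sketch of the idiom the patch
// introduces above — read ::GetLastError() once into a DWORD, compare it
// against raw ERROR_* constants, and only then convert it to
// std::error_code via mapWindowsError() (assumed to be declared by
// llvm/Support/WindowsError.h, which the patch includes). Helper name and
// directory name are illustrative.
#ifdef _WIN32
#include <windows.h>
#include <system_error>
#include "llvm/Support/WindowsError.h"

static std::error_code createDir(const wchar_t *Path, bool IgnoreExisting) {
  if (!::CreateDirectoryW(Path, nullptr)) {
    DWORD LastError = ::GetLastError();        // read it exactly once
    if (LastError != ERROR_ALREADY_EXISTS || !IgnoreExisting)
      return llvm::mapWindowsError(LastError); // DWORD -> std::error_code
  }
  return std::error_code();
}
#endif

int main() {
#ifdef _WIN32
  return createDir(L"llvm-ec-demo", true) ? 1 : 0;
#else
  return 0; // the helper above is exercised on Windows builds only
#endif
}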
SmallVector wide_from; SmallVector wide_to; - if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec; - if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec; + if (std::error_code ec = UTF8ToUTF16(f, wide_from)) + return ec; + if (std::error_code ec = UTF8ToUTF16(t, wide_to)) + return ec; if (!::CreateHardLinkW(wide_from.begin(), wide_to.begin(), NULL)) return windows_error(::GetLastError()); - return error_code::success(); + return std::error_code(); } -error_code remove(const Twine &path, bool IgnoreNonExisting) { +std::error_code remove(const Twine &path, bool IgnoreNonExisting) { SmallString<128> path_storage; SmallVector path_utf16; file_status ST; - if (error_code EC = status(path, ST)) { + if (std::error_code EC = status(path, ST)) { if (EC != errc::no_such_file_or_directory || !IgnoreNonExisting) return EC; - return error_code::success(); + return std::error_code(); } - if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), - path_utf16)) + if (std::error_code ec = + UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)) return ec; if (ST.type() == file_type::directory_file) { if (!::RemoveDirectoryW(c_str(path_utf16))) { - error_code EC = windows_error(::GetLastError()); + std::error_code EC = windows_error(::GetLastError()); if (EC != errc::no_such_file_or_directory || !IgnoreNonExisting) return EC; } - return error_code::success(); + return std::error_code(); } if (!::DeleteFileW(c_str(path_utf16))) { - error_code EC = windows_error(::GetLastError()); + std::error_code EC = windows_error(::GetLastError()); if (EC != errc::no_such_file_or_directory || !IgnoreNonExisting) return EC; } - return error_code::success(); + return std::error_code(); } -error_code rename(const Twine &from, const Twine &to) { +std::error_code rename(const Twine &from, const Twine &to) { // Get arguments. SmallString<128> from_storage; SmallString<128> to_storage; @@ -224,16 +231,18 @@ error_code rename(const Twine &from, const Twine &to) { // Convert to utf-16. SmallVector wide_from; SmallVector wide_to; - if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec; - if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec; + if (std::error_code ec = UTF8ToUTF16(f, wide_from)) + return ec; + if (std::error_code ec = UTF8ToUTF16(t, wide_to)) + return ec; - error_code ec = error_code::success(); + std::error_code ec = std::error_code(); for (int i = 0; i < 2000; i++) { if (::MoveFileExW(wide_from.begin(), wide_to.begin(), MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) - return error_code::success(); - ec = windows_error(::GetLastError()); - if (ec != windows_error::access_denied) + return std::error_code(); + DWORD LastError = ::GetLastError(); + if (LastError != ERROR_ACCESS_DENIED) break; // Retry MoveFile() at ACCESS_DENIED. // System scanners (eg. 
indexer) might open the source file when @@ -244,46 +253,46 @@ error_code rename(const Twine &from, const Twine &to) { return ec; } -error_code resize_file(const Twine &path, uint64_t size) { +std::error_code resize_file(const Twine &path, uint64_t size) { SmallString<128> path_storage; SmallVector path_utf16; - if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), - path_utf16)) + if (std::error_code ec = + UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)) return ec; int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE); if (fd == -1) - return error_code(errno, generic_category()); + return std::error_code(errno, std::generic_category()); #ifdef HAVE__CHSIZE_S errno_t error = ::_chsize_s(fd, size); #else errno_t error = ::_chsize(fd, size); #endif ::close(fd); - return error_code(error, generic_category()); + return std::error_code(error, std::generic_category()); } -error_code exists(const Twine &path, bool &result) { +std::error_code exists(const Twine &path, bool &result) { SmallString<128> path_storage; SmallVector path_utf16; - if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), - path_utf16)) + if (std::error_code ec = + UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)) return ec; DWORD attributes = ::GetFileAttributesW(path_utf16.begin()); if (attributes == INVALID_FILE_ATTRIBUTES) { // See if the file didn't actually exist. - error_code ec = make_error_code(windows_error(::GetLastError())); - if (ec != windows_error::file_not_found && - ec != windows_error::path_not_found) - return ec; + DWORD LastError = ::GetLastError(); + if (LastError != ERROR_FILE_NOT_FOUND && + LastError != ERROR_PATH_NOT_FOUND) + return windows_error(LastError); result = false; } else result = true; - return error_code::success(); + return std::error_code(); } bool can_write(const Twine &Path) { @@ -320,12 +329,14 @@ bool equivalent(file_status A, file_status B) { A.VolumeSerialNumber == B.VolumeSerialNumber; } -error_code equivalent(const Twine &A, const Twine &B, bool &result) { +std::error_code equivalent(const Twine &A, const Twine &B, bool &result) { file_status fsA, fsB; - if (error_code ec = status(A, fsA)) return ec; - if (error_code ec = status(B, fsB)) return ec; + if (std::error_code ec = status(A, fsA)) + return ec; + if (std::error_code ec = status(B, fsB)) + return ec; result = equivalent(fsA, fsB); - return error_code::success(); + return std::error_code(); } static bool isReservedName(StringRef path) { @@ -351,7 +362,7 @@ static bool isReservedName(StringRef path) { return false; } -static error_code getStatus(HANDLE FileHandle, file_status &Result) { +static std::error_code getStatus(HANDLE FileHandle, file_status &Result) { if (FileHandle == INVALID_HANDLE_VALUE) goto handle_status_error; @@ -363,16 +374,16 @@ static error_code getStatus(HANDLE FileHandle, file_status &Result) { if (Err != NO_ERROR) return windows_error(Err); Result = file_status(file_type::type_unknown); - return error_code::success(); + return std::error_code(); } case FILE_TYPE_DISK: break; case FILE_TYPE_CHAR: Result = file_status(file_type::character_file); - return error_code::success(); + return std::error_code(); case FILE_TYPE_PIPE: Result = file_status(file_type::fifo_file); - return error_code::success(); + return std::error_code(); } BY_HANDLE_FILE_INFORMATION Info; @@ -388,32 +399,32 @@ static error_code getStatus(HANDLE FileHandle, file_status &Result) { Info.ftLastWriteTime.dwLowDateTime, Info.dwVolumeSerialNumber, Info.nFileSizeHigh, Info.nFileSizeLow, 
Info.nFileIndexHigh, Info.nFileIndexLow); - return error_code::success(); + return std::error_code(); } handle_status_error: - error_code EC = windows_error(::GetLastError()); - if (EC == windows_error::file_not_found || - EC == windows_error::path_not_found) + DWORD LastError = ::GetLastError(); + if (LastError == ERROR_FILE_NOT_FOUND || + LastError == ERROR_PATH_NOT_FOUND) Result = file_status(file_type::file_not_found); - else if (EC == windows_error::sharing_violation) + else if (LastError == ERROR_SHARING_VIOLATION) Result = file_status(file_type::type_unknown); else Result = file_status(file_type::status_error); - return EC; + return windows_error(LastError); } -error_code status(const Twine &path, file_status &result) { +std::error_code status(const Twine &path, file_status &result) { SmallString<128> path_storage; SmallVector path_utf16; StringRef path8 = path.toStringRef(path_storage); if (isReservedName(path8)) { result = file_status(file_type::character_file); - return error_code::success(); + return std::error_code(); } - if (error_code ec = UTF8ToUTF16(path8, path_utf16)) + if (std::error_code ec = UTF8ToUTF16(path8, path_utf16)) return ec; DWORD attr = ::GetFileAttributesW(path_utf16.begin()); @@ -444,12 +455,12 @@ error_code status(const Twine &path, file_status &result) { return getStatus(h, result); } -error_code status(int FD, file_status &Result) { +std::error_code status(int FD, file_status &Result) { HANDLE FileHandle = reinterpret_cast(_get_osfhandle(FD)); return getStatus(FileHandle, Result); } -error_code setLastModificationAndAccessTime(int FD, TimeValue Time) { +std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) { ULARGE_INTEGER UI; UI.QuadPart = Time.toWin32Time(); FILETIME FT; @@ -458,52 +469,10 @@ error_code setLastModificationAndAccessTime(int FD, TimeValue Time) { HANDLE FileHandle = reinterpret_cast(_get_osfhandle(FD)); if (!SetFileTime(FileHandle, NULL, &FT, &FT)) return windows_error(::GetLastError()); - return error_code::success(); -} - -error_code get_magic(const Twine &path, uint32_t len, - SmallVectorImpl &result) { - SmallString<128> path_storage; - SmallVector path_utf16; - result.set_size(0); - - // Convert path to UTF-16. - if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage), - path_utf16)) - return ec; - - // Open file. - HANDLE file = ::CreateFileW(c_str(path_utf16), - GENERIC_READ, - FILE_SHARE_READ, - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_READONLY, - NULL); - if (file == INVALID_HANDLE_VALUE) - return windows_error(::GetLastError()); - - // Allocate buffer. - result.reserve(len); - - // Get magic! - DWORD bytes_read = 0; - BOOL read_success = ::ReadFile(file, result.data(), len, &bytes_read, NULL); - error_code ec = windows_error(::GetLastError()); - ::CloseHandle(file); - if (!read_success || (bytes_read != len)) { - // Set result size to the number of bytes read if it's valid. - if (bytes_read <= len) - result.set_size(bytes_read); - // ERROR_HANDLE_EOF is mapped to errc::value_too_large. - return ec; - } - - result.set_size(len); - return error_code::success(); + return std::error_code(); } -error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { +std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { FileDescriptor = FD; // Make sure that the requested size fits within SIZE_T. 
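// Not part of the patch: a portable sketch of the two guards
// mapped_file_region::init below depends on — the requested size must fit
// in SIZE_T before mapping, and the 64-bit offset is split into high/low
// 32-bit halves because the Win32 mapping APIs take it as two DWORDs.
// Plain-integer illustration only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>

int main() {
  uint64_t Size = uint64_t(1) << 20;
  assert(Size <= std::numeric_limits<std::size_t>::max()); // fits-in-SIZE_T check
  uint64_t Offset = 0x123456789ULL;
  uint32_t High = static_cast<uint32_t>(Offset >> 32);
  uint32_t Low = static_cast<uint32_t>(Offset & 0xffffffff);
  assert(((uint64_t(High) << 32) | Low) == Offset);        // halves recombine losslessly
  return 0;
}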
if (Size > std::numeric_limits::max()) { @@ -528,7 +497,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { (Offset + Size) & 0xffffffff, 0); if (FileMappingHandle == NULL) { - error_code ec = windows_error(GetLastError()); + std::error_code ec = windows_error(GetLastError()); if (FileDescriptor) { if (CloseFD) _close(FileDescriptor); @@ -549,7 +518,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { Offset & 0xffffffff, Size); if (Mapping == NULL) { - error_code ec = windows_error(GetLastError()); + std::error_code ec = windows_error(GetLastError()); ::CloseHandle(FileMappingHandle); if (FileDescriptor) { if (CloseFD) @@ -563,7 +532,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { MEMORY_BASIC_INFORMATION mbi; SIZE_T Result = VirtualQuery(Mapping, &mbi, sizeof(mbi)); if (Result == 0) { - error_code ec = windows_error(GetLastError()); + std::error_code ec = windows_error(GetLastError()); ::UnmapViewOfFile(Mapping); ::CloseHandle(FileMappingHandle); if (FileDescriptor) { @@ -584,14 +553,14 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { _close(FileDescriptor); // Also closes FileHandle. } else ::CloseHandle(FileHandle); - return error_code::success(); + return std::error_code(); } mapped_file_region::mapped_file_region(const Twine &path, mapmode mode, uint64_t length, uint64_t offset, - error_code &ec) + std::error_code &ec) : Mode(mode) , Size(length) , Mapping() @@ -636,7 +605,7 @@ mapped_file_region::mapped_file_region(int fd, mapmode mode, uint64_t length, uint64_t offset, - error_code &ec) + std::error_code &ec) : Mode(mode) , Size(length) , Mapping() @@ -704,12 +673,11 @@ int mapped_file_region::alignment() { return SysInfo.dwAllocationGranularity; } -error_code detail::directory_iterator_construct(detail::DirIterState &it, +std::error_code detail::directory_iterator_construct(detail::DirIterState &it, StringRef path){ SmallVector path_utf16; - if (error_code ec = UTF8ToUTF16(path, - path_utf16)) + if (std::error_code ec = UTF8ToUTF16(path, path_utf16)) return ec; // Convert path to the format that Windows is happy with. @@ -733,19 +701,19 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it, (FilenameLen == 2 && FirstFind.cFileName[0] == L'.' && FirstFind.cFileName[1] == L'.')) if (!::FindNextFileW(FindHandle, &FirstFind)) { - error_code ec = windows_error(::GetLastError()); + DWORD LastError = ::GetLastError(); // Check for end. - if (ec == windows_error::no_more_files) + if (LastError == ERROR_NO_MORE_FILES) return detail::directory_iterator_destruct(it); - return ec; + return windows_error(LastError); } else FilenameLen = ::wcslen(FirstFind.cFileName); // Construct the current directory entry. 
SmallString<128> directory_entry_name_utf8; - if (error_code ec = UTF16ToUTF8(FirstFind.cFileName, - ::wcslen(FirstFind.cFileName), - directory_entry_name_utf8)) + if (std::error_code ec = + UTF16ToUTF8(FirstFind.cFileName, ::wcslen(FirstFind.cFileName), + directory_entry_name_utf8)) return ec; it.IterationHandle = intptr_t(FindHandle.take()); @@ -753,26 +721,26 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it, path::append(directory_entry_path, directory_entry_name_utf8.str()); it.CurrentEntry = directory_entry(directory_entry_path.str()); - return error_code::success(); + return std::error_code(); } -error_code detail::directory_iterator_destruct(detail::DirIterState &it) { +std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) { if (it.IterationHandle != 0) // Closes the handle if it's valid. ScopedFindHandle close(HANDLE(it.IterationHandle)); it.IterationHandle = 0; it.CurrentEntry = directory_entry(); - return error_code::success(); + return std::error_code(); } -error_code detail::directory_iterator_increment(detail::DirIterState &it) { +std::error_code detail::directory_iterator_increment(detail::DirIterState &it) { WIN32_FIND_DATAW FindData; if (!::FindNextFileW(HANDLE(it.IterationHandle), &FindData)) { - error_code ec = windows_error(::GetLastError()); + DWORD LastError = ::GetLastError(); // Check for end. - if (ec == windows_error::no_more_files) + if (LastError == ERROR_NO_MORE_FILES) return detail::directory_iterator_destruct(it); - return ec; + return windows_error(LastError); } size_t FilenameLen = ::wcslen(FindData.cFileName); @@ -782,60 +750,50 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) { return directory_iterator_increment(it); SmallString<128> directory_entry_path_utf8; - if (error_code ec = UTF16ToUTF8(FindData.cFileName, - ::wcslen(FindData.cFileName), - directory_entry_path_utf8)) + if (std::error_code ec = + UTF16ToUTF8(FindData.cFileName, ::wcslen(FindData.cFileName), + directory_entry_path_utf8)) return ec; it.CurrentEntry.replace_filename(Twine(directory_entry_path_utf8)); - return error_code::success(); + return std::error_code(); } -error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, - bool map_writable, void *&result) { - assert(0 && "NOT IMPLEMENTED"); - return windows_error::invalid_function; -} - -error_code unmap_file_pages(void *base, size_t size) { - assert(0 && "NOT IMPLEMENTED"); - return windows_error::invalid_function; -} - -error_code openFileForRead(const Twine &Name, int &ResultFD) { +std::error_code openFileForRead(const Twine &Name, int &ResultFD) { SmallString<128> PathStorage; SmallVector PathUTF16; - if (error_code EC = UTF8ToUTF16(Name.toStringRef(PathStorage), - PathUTF16)) + if (std::error_code EC = + UTF8ToUTF16(Name.toStringRef(PathStorage), PathUTF16)) return EC; HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if (H == INVALID_HANDLE_VALUE) { - error_code EC = windows_error(::GetLastError()); + DWORD LastError = ::GetLastError(); + std::error_code EC = windows_error(LastError); // Provide a better error message when trying to open directories. // This only runs if we failed to open the file, so there is probably // no performances issues. 
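// Not part of the patch: a sketch of the friendlier error chosen just
// below — when opening fails because the path is a directory, the code
// now returns errc::is_a_directory via make_error_code instead of the raw
// access-denied code. Shown here with std::errc; the patch uses LLVM's
// equivalent errc enumeration. Helper name is illustrative.
#include <cstdio>
#include <system_error>

static std::error_code classifyOpenFailure(bool PathIsDirectory,
                                           std::error_code Raw) {
  if (PathIsDirectory)
    return std::make_error_code(std::errc::is_a_directory); // nicer diagnosis
  return Raw;                                               // otherwise keep the OS error
}

int main() {
  std::error_code EC = classifyOpenFailure(
      true, std::make_error_code(std::errc::permission_denied));
  std::printf("%s\n", EC.message().c_str());
  return 0;
}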
- if (EC != windows_error::access_denied) + if (LastError != ERROR_ACCESS_DENIED) return EC; if (is_directory(Name)) - return error_code(errc::is_a_directory, posix_category()); + return make_error_code(errc::is_a_directory); return EC; } int FD = ::_open_osfhandle(intptr_t(H), 0); if (FD == -1) { ::CloseHandle(H); - return windows_error::invalid_handle; + return windows_error(ERROR_INVALID_HANDLE); } ResultFD = FD; - return error_code::success(); + return std::error_code(); } -error_code openFileForWrite(const Twine &Name, int &ResultFD, +std::error_code openFileForWrite(const Twine &Name, int &ResultFD, sys::fs::OpenFlags Flags, unsigned Mode) { // Verify that we don't have both "append" and "excl". assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) && @@ -844,8 +802,8 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD, SmallString<128> PathStorage; SmallVector PathUTF16; - if (error_code EC = UTF8ToUTF16(Name.toStringRef(PathStorage), - PathUTF16)) + if (std::error_code EC = + UTF8ToUTF16(Name.toStringRef(PathStorage), PathUTF16)) return EC; DWORD CreationDisposition; @@ -865,14 +823,15 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD, CreationDisposition, FILE_ATTRIBUTE_NORMAL, NULL); if (H == INVALID_HANDLE_VALUE) { - error_code EC = windows_error(::GetLastError()); + DWORD LastError = ::GetLastError(); + std::error_code EC = windows_error(LastError); // Provide a better error message when trying to open directories. // This only runs if we failed to open the file, so there is probably // no performances issues. - if (EC != windows_error::access_denied) + if (LastError != ERROR_ACCESS_DENIED) return EC; if (is_directory(Name)) - return error_code(errc::is_a_directory, posix_category()); + return make_error_code(errc::is_a_directory); return EC; } @@ -886,11 +845,11 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD, int FD = ::_open_osfhandle(intptr_t(H), OpenFlags); if (FD == -1) { ::CloseHandle(H); - return windows_error::invalid_handle; + return windows_error(ERROR_INVALID_HANDLE); } ResultFD = FD; - return error_code::success(); + return std::error_code(); } } // end namespace fs @@ -911,14 +870,14 @@ bool home_directory(SmallVectorImpl &result) { } // end namespace path namespace windows { -llvm::error_code UTF8ToUTF16(llvm::StringRef utf8, - llvm::SmallVectorImpl &utf16) { +std::error_code UTF8ToUTF16(llvm::StringRef utf8, + llvm::SmallVectorImpl &utf16) { if (!utf8.empty()) { int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8.begin(), utf8.size(), utf16.begin(), 0); if (len == 0) - return llvm::windows_error(::GetLastError()); + return windows_error(::GetLastError()); utf16.reserve(len + 1); utf16.set_size(len); @@ -927,25 +886,25 @@ llvm::error_code UTF8ToUTF16(llvm::StringRef utf8, utf8.size(), utf16.begin(), utf16.size()); if (len == 0) - return llvm::windows_error(::GetLastError()); + return windows_error(::GetLastError()); } // Make utf16 null terminated. utf16.push_back(0); utf16.pop_back(); - return llvm::error_code::success(); + return std::error_code(); } -llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, - llvm::SmallVectorImpl &utf8) { +std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, + llvm::SmallVectorImpl &utf8) { if (utf16_len) { // Get length. 
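// Not part of the patch: a Windows-only sketch of the two-pass conversion
// used by UTF8ToUTF16 above and UTF16ToUTF8 here — call the Win32
// converter once with a zero-sized output buffer to learn the required
// length, size the buffer, then convert for real. On failure this sketch
// wraps GetLastError() in std::system_category(); the patch routes it
// through windows_error()/mapWindowsError() instead. Helper name is
// illustrative.
#ifdef _WIN32
#include <windows.h>
#include <string>
#include <system_error>

static std::error_code utf8ToWide(const std::string &UTF8, std::wstring &Out) {
  Out.clear();
  if (UTF8.empty())
    return std::error_code();
  int Len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, UTF8.data(),
                                  (int)UTF8.size(), nullptr, 0); // length query
  if (Len == 0)
    return std::error_code(::GetLastError(), std::system_category());
  Out.resize(Len);
  Len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, UTF8.data(),
                              (int)UTF8.size(), &Out[0], Len);   // real conversion
  if (Len == 0)
    return std::error_code(::GetLastError(), std::system_category());
  return std::error_code();
}
#endif

int main() {
#ifdef _WIN32
  std::wstring W;
  return utf8ToWide("hello", W) ? 1 : 0;
#else
  return 0; // the helper above is exercised on Windows builds only
#endif
}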
int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(), 0, NULL, NULL); if (len == 0) - return llvm::windows_error(::GetLastError()); + return windows_error(::GetLastError()); utf8.reserve(len); utf8.set_size(len); @@ -955,14 +914,14 @@ llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, utf8.size(), NULL, NULL); if (len == 0) - return llvm::windows_error(::GetLastError()); + return windows_error(::GetLastError()); } // Make utf8 null terminated. utf8.push_back(0); utf8.pop_back(); - return llvm::error_code::success(); + return std::error_code(); } } // end namespace windows } // end namespace sys diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc index c3df801dc015..81aee0e1b6ac 100644 --- a/lib/Support/Windows/Process.inc +++ b/lib/Support/Windows/Process.inc @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Allocator.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/WindowsError.h" #include // The Windows.h header must be after LLVM and standard headers. @@ -47,7 +49,6 @@ using namespace llvm; using namespace sys; - process::id_type self_process::get_id() { return GetCurrentProcessId(); } @@ -178,12 +179,16 @@ Optional Process::GetEnv(StringRef Name) { return std::string(Res.data()); } -error_code +static std::error_code windows_error(DWORD E) { + return mapWindowsError(E); +} + +std::error_code Process::GetArgumentVector(SmallVectorImpl &Args, ArrayRef, SpecificBumpPtrAllocator &ArgAllocator) { int NewArgCount; - error_code ec; + std::error_code ec; wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(), &NewArgCount); @@ -208,7 +213,7 @@ Process::GetArgumentVector(SmallVectorImpl &Args, if (ec) return ec; - return error_code::success(); + return std::error_code(); } bool Process::StandardInIsUserInput() { @@ -363,12 +368,12 @@ unsigned Process::GetRandomNumber() { HCRYPTPROV HCPC; if (!::CryptAcquireContextW(&HCPC, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) - assert(false && "Could not acquire a cryptographic context"); + report_fatal_error("Could not acquire a cryptographic context"); ScopedCryptContext CryptoProvider(HCPC); unsigned Ret; if (!::CryptGenRandom(CryptoProvider, sizeof(Ret), reinterpret_cast(&Ret))) - assert(false && "Could not generate a random number"); + report_fatal_error("Could not generate a random number"); return Ret; } diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc index 5827c105afac..b2f71aed0c7d 100644 --- a/lib/Support/Windows/Program.inc +++ b/lib/Support/Windows/Program.inc @@ -226,7 +226,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, // an environment block by concatenating them. 
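// Not part of the patch: a sketch of why GetRandomNumber above now calls
// report_fatal_error instead of assert(false && ...) — assertions compile
// away under NDEBUG, so a failed CryptAcquireContextW would otherwise be
// ignored in release builds. fatalError here is a hypothetical stand-in
// for llvm::report_fatal_error, not the real function.
#include <cstdio>
#include <cstdlib>

[[noreturn]] static void fatalError(const char *Msg) {
  std::fprintf(stderr, "LLVM ERROR: %s\n", Msg); // report, then terminate
  std::abort();
}

static unsigned getRandomOrDie(bool AcquireSucceeded) {
  if (!AcquireSucceeded)
    fatalError("Could not acquire a cryptographic context");
  return 42; // placeholder for the value CryptGenRandom would produce
}

int main() { return getRandomOrDie(true) == 42 ? 0 : 1; }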
for (unsigned i = 0; envp[i]; ++i) { SmallVector EnvString; - if (error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) { + if (std::error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) { SetLastError(ec.value()); MakeErrMsg(ErrMsg, "Unable to convert environment variable to UTF-16"); return false; @@ -290,7 +290,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, fflush(stderr); SmallVector ProgramUtf16; - if (error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) { + if (std::error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) { SetLastError(ec.value()); MakeErrMsg(ErrMsg, std::string("Unable to convert application name to UTF-16")); @@ -298,7 +298,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, } SmallVector CommandUtf16; - if (error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) { + if (std::error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) { SetLastError(ec.value()); MakeErrMsg(ErrMsg, std::string("Unable to convert command-line to UTF-16")); @@ -422,18 +422,18 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, return WaitResult; } -error_code sys::ChangeStdinToBinary(){ + std::error_code sys::ChangeStdinToBinary(){ int result = _setmode( _fileno(stdin), _O_BINARY ); if (result == -1) - return error_code(errno, generic_category()); - return make_error_code(errc::success); + return std::error_code(errno, std::generic_category()); + return std::error_code(); } -error_code sys::ChangeStdoutToBinary(){ + std::error_code sys::ChangeStdoutToBinary(){ int result = _setmode( _fileno(stdout), _O_BINARY ); if (result == -1) - return error_code(errno, generic_category()); - return make_error_code(errc::success); + return std::error_code(errno, std::generic_category()); + return std::error_code(); } bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef Args) { diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h index 6bef44446023..f68835b1a71a 100644 --- a/lib/Support/Windows/WindowsSupport.h +++ b/lib/Support/Windows/WindowsSupport.h @@ -32,7 +32,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" // Get build system configuration settings #include "llvm/Support/Compiler.h" -#include "llvm/Support/system_error.h" +#include #include #include #include @@ -163,10 +163,9 @@ c_str(SmallVectorImpl &str) { namespace sys { namespace windows { -error_code UTF8ToUTF16(StringRef utf8, - SmallVectorImpl &utf16); -error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, - SmallVectorImpl &utf8); +std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl &utf16); +std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len, + SmallVectorImpl &utf8); } // end namespace windows } // end namespace sys } // end namespace llvm. diff --git a/lib/Support/Windows/system_error.inc b/lib/Support/Windows/system_error.inc deleted file mode 100644 index 37ec81dd363c..000000000000 --- a/lib/Support/Windows/system_error.inc +++ /dev/null @@ -1,142 +0,0 @@ -//===- llvm/Support/Win32/system_error.inc - Windows error_code --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides the Windows specific implementation of the error_code -// and error_condition classes. 
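// Not part of the patch: a Windows-only sketch of one reason the
// FormatMessageA-based message() in the Windows/system_error.inc being
// deleted here could go — std::system_category() already renders Win32
// error codes as readable text, and mapWindowsError() handles the mapping
// onto generic error conditions. Exact message text varies by system.
#include <cstdio>
#include <system_error>
#ifdef _WIN32
#include <windows.h>
#endif

int main() {
#ifdef _WIN32
  std::error_code EC(ERROR_FILE_NOT_FOUND, std::system_category());
  std::printf("%s\n", EC.message().c_str()); // e.g. "The system cannot find the file specified."
#endif
  return 0;
}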
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -//=== WARNING: Implementation here must contain only generic Windows code that -//=== is guaranteed to work on *all* Windows variants. -//===----------------------------------------------------------------------===// - -#include -#include - -using namespace llvm; - -std::string -_system_error_category::message(int ev) const { - LPVOID lpMsgBuf = 0; - DWORD retval = ::FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | - FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, - ev, - MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language - (LPSTR) &lpMsgBuf, - 0, - NULL); - if (retval == 0) { - ::LocalFree(lpMsgBuf); - return std::string("Unknown error"); - } - - std::string str( static_cast(lpMsgBuf) ); - ::LocalFree(lpMsgBuf); - - while (str.size() - && (str[str.size()-1] == '\n' || str[str.size()-1] == '\r')) - str.erase( str.size()-1 ); - if (str.size() && str[str.size()-1] == '.') - str.erase( str.size()-1 ); - return str; -} - -// I'd rather not double the line count of the following. -#define MAP_ERR_TO_COND(x, y) case x: return make_error_condition(errc::y) - -error_condition -_system_error_category::default_error_condition(int ev) const { - switch (ev) { - MAP_ERR_TO_COND(0, success); - // Windows system -> posix_errno decode table ---------------------------// - // see WinError.h comments for descriptions of errors - MAP_ERR_TO_COND(ERROR_ACCESS_DENIED, permission_denied); - MAP_ERR_TO_COND(ERROR_ALREADY_EXISTS, file_exists); - MAP_ERR_TO_COND(ERROR_BAD_UNIT, no_such_device); - MAP_ERR_TO_COND(ERROR_BUFFER_OVERFLOW, filename_too_long); - MAP_ERR_TO_COND(ERROR_BUSY, device_or_resource_busy); - MAP_ERR_TO_COND(ERROR_BUSY_DRIVE, device_or_resource_busy); - MAP_ERR_TO_COND(ERROR_CANNOT_MAKE, permission_denied); - MAP_ERR_TO_COND(ERROR_CANTOPEN, io_error); - MAP_ERR_TO_COND(ERROR_CANTREAD, io_error); - MAP_ERR_TO_COND(ERROR_CANTWRITE, io_error); - MAP_ERR_TO_COND(ERROR_CURRENT_DIRECTORY, permission_denied); - MAP_ERR_TO_COND(ERROR_DEV_NOT_EXIST, no_such_device); - MAP_ERR_TO_COND(ERROR_DEVICE_IN_USE, device_or_resource_busy); - MAP_ERR_TO_COND(ERROR_DIR_NOT_EMPTY, directory_not_empty); - MAP_ERR_TO_COND(ERROR_DIRECTORY, invalid_argument); - MAP_ERR_TO_COND(ERROR_DISK_FULL, no_space_on_device); - MAP_ERR_TO_COND(ERROR_FILE_EXISTS, file_exists); - MAP_ERR_TO_COND(ERROR_FILE_NOT_FOUND, no_such_file_or_directory); - MAP_ERR_TO_COND(ERROR_HANDLE_DISK_FULL, no_space_on_device); - MAP_ERR_TO_COND(ERROR_HANDLE_EOF, value_too_large); - MAP_ERR_TO_COND(ERROR_INVALID_ACCESS, permission_denied); - MAP_ERR_TO_COND(ERROR_INVALID_DRIVE, no_such_device); - MAP_ERR_TO_COND(ERROR_INVALID_FUNCTION, function_not_supported); - MAP_ERR_TO_COND(ERROR_INVALID_HANDLE, invalid_argument); - MAP_ERR_TO_COND(ERROR_INVALID_NAME, invalid_argument); - MAP_ERR_TO_COND(ERROR_LOCK_VIOLATION, no_lock_available); - MAP_ERR_TO_COND(ERROR_LOCKED, no_lock_available); - MAP_ERR_TO_COND(ERROR_NEGATIVE_SEEK, invalid_argument); - MAP_ERR_TO_COND(ERROR_NOACCESS, permission_denied); - MAP_ERR_TO_COND(ERROR_NOT_ENOUGH_MEMORY, not_enough_memory); - MAP_ERR_TO_COND(ERROR_NOT_READY, resource_unavailable_try_again); - MAP_ERR_TO_COND(ERROR_NOT_SAME_DEVICE, cross_device_link); - MAP_ERR_TO_COND(ERROR_OPEN_FAILED, io_error); - MAP_ERR_TO_COND(ERROR_OPEN_FILES, device_or_resource_busy); - MAP_ERR_TO_COND(ERROR_OPERATION_ABORTED, operation_canceled); 
- MAP_ERR_TO_COND(ERROR_OUTOFMEMORY, not_enough_memory); - MAP_ERR_TO_COND(ERROR_PATH_NOT_FOUND, no_such_file_or_directory); - MAP_ERR_TO_COND(ERROR_BAD_NETPATH, no_such_file_or_directory); - MAP_ERR_TO_COND(ERROR_READ_FAULT, io_error); - MAP_ERR_TO_COND(ERROR_RETRY, resource_unavailable_try_again); - MAP_ERR_TO_COND(ERROR_SEEK, io_error); - MAP_ERR_TO_COND(ERROR_SHARING_VIOLATION, permission_denied); - MAP_ERR_TO_COND(ERROR_TOO_MANY_OPEN_FILES, too_many_files_open); - MAP_ERR_TO_COND(ERROR_WRITE_FAULT, io_error); - MAP_ERR_TO_COND(ERROR_WRITE_PROTECT, permission_denied); - MAP_ERR_TO_COND(ERROR_SEM_TIMEOUT, timed_out); - MAP_ERR_TO_COND(WSAEACCES, permission_denied); - MAP_ERR_TO_COND(WSAEADDRINUSE, address_in_use); - MAP_ERR_TO_COND(WSAEADDRNOTAVAIL, address_not_available); - MAP_ERR_TO_COND(WSAEAFNOSUPPORT, address_family_not_supported); - MAP_ERR_TO_COND(WSAEALREADY, connection_already_in_progress); - MAP_ERR_TO_COND(WSAEBADF, bad_file_descriptor); - MAP_ERR_TO_COND(WSAECONNABORTED, connection_aborted); - MAP_ERR_TO_COND(WSAECONNREFUSED, connection_refused); - MAP_ERR_TO_COND(WSAECONNRESET, connection_reset); - MAP_ERR_TO_COND(WSAEDESTADDRREQ, destination_address_required); - MAP_ERR_TO_COND(WSAEFAULT, bad_address); - MAP_ERR_TO_COND(WSAEHOSTUNREACH, host_unreachable); - MAP_ERR_TO_COND(WSAEINPROGRESS, operation_in_progress); - MAP_ERR_TO_COND(WSAEINTR, interrupted); - MAP_ERR_TO_COND(WSAEINVAL, invalid_argument); - MAP_ERR_TO_COND(WSAEISCONN, already_connected); - MAP_ERR_TO_COND(WSAEMFILE, too_many_files_open); - MAP_ERR_TO_COND(WSAEMSGSIZE, message_size); - MAP_ERR_TO_COND(WSAENAMETOOLONG, filename_too_long); - MAP_ERR_TO_COND(WSAENETDOWN, network_down); - MAP_ERR_TO_COND(WSAENETRESET, network_reset); - MAP_ERR_TO_COND(WSAENETUNREACH, network_unreachable); - MAP_ERR_TO_COND(WSAENOBUFS, no_buffer_space); - MAP_ERR_TO_COND(WSAENOPROTOOPT, no_protocol_option); - MAP_ERR_TO_COND(WSAENOTCONN, not_connected); - MAP_ERR_TO_COND(WSAENOTSOCK, not_a_socket); - MAP_ERR_TO_COND(WSAEOPNOTSUPP, operation_not_supported); - MAP_ERR_TO_COND(WSAEPROTONOSUPPORT, protocol_not_supported); - MAP_ERR_TO_COND(WSAEPROTOTYPE, wrong_protocol_type); - MAP_ERR_TO_COND(WSAETIMEDOUT, timed_out); - MAP_ERR_TO_COND(WSAEWOULDBLOCK, operation_would_block); - default: return error_condition(ev, system_category()); - } -} diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index e5f94947ff8c..5212624f0cd3 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Support/Errc.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" @@ -56,9 +57,7 @@ Input::Input(StringRef InputContent, Input::~Input() { } -error_code Input::error() { - return EC; -} +std::error_code Input::error() { return EC; } // Pin the vtables to this file. 
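// Not part of the patch: a toy sketch of the error-reporting shape
// yaml::Input keeps above — the parser stores a std::error_code member,
// exposes it through error(), and callers test it with the usual boolean
// conversion after streaming in a document. MiniParser is hypothetical,
// not LLVM's yaml::Input.
#include <cstdio>
#include <system_error>

class MiniParser {
  std::error_code EC;

public:
  void setError() { EC = std::make_error_code(std::errc::invalid_argument); }
  std::error_code error() const { return EC; }
};

int main() {
  MiniParser P;
  P.setError();
  if (P.error())
    std::printf("parse failed: %s\n", P.error().message().c_str());
  return 0;
}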
void Input::HNode::anchor() {} @@ -90,8 +89,8 @@ bool Input::setCurrentDocument() { return false; } -void Input::nextDocument() { - ++DocIterator; +bool Input::nextDocument() { + return ++DocIterator != Strm->end(); } bool Input::mapTag(StringRef Tag, bool Default) { diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index f55838e660fe..f7c213ac2b85 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -22,10 +22,10 @@ #include "llvm/Support/Format.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" -#include "llvm/Support/system_error.h" #include #include #include +#include // may provide O_BINARY. #if defined(HAVE_FCNTL_H) @@ -450,7 +450,7 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, return; } - error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags); + std::error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags); if (EC) { ErrorInfo = "Error opening output file '" + std::string(Filename) + "': " + diff --git a/lib/Support/regcclass.h b/lib/Support/regcclass.h index 2cea3e4e5406..7fd66046cd87 100644 --- a/lib/Support/regcclass.h +++ b/lib/Support/regcclass.h @@ -37,6 +37,9 @@ * @(#)cclass.h 8.3 (Berkeley) 3/20/94 */ +#ifndef LLVM_SUPPORT_REGCCLASS_H +#define LLVM_SUPPORT_REGCCLASS_H + /* character-class table */ static struct cclass { const char *name; @@ -68,3 +71,5 @@ static struct cclass { ""} , { NULL, 0, "" } }; + +#endif diff --git a/lib/Support/regcname.h b/lib/Support/regcname.h index 3c0bb248ffa7..891d25573e8c 100644 --- a/lib/Support/regcname.h +++ b/lib/Support/regcname.h @@ -35,6 +35,9 @@ * @(#)cname.h 8.3 (Berkeley) 3/20/94 */ +#ifndef LLVM_SUPPORT_REGCNAME_H +#define LLVM_SUPPORT_REGCNAME_H + /* character-name table */ static struct cname { const char *name; @@ -137,3 +140,5 @@ static struct cname { { "DEL", '\177' }, { NULL, 0 } }; + +#endif diff --git a/lib/Support/regex2.h b/lib/Support/regex2.h index 21659c34449a..d81bfbc97d02 100644 --- a/lib/Support/regex2.h +++ b/lib/Support/regex2.h @@ -35,6 +35,9 @@ * @(#)regex2.h 8.4 (Berkeley) 3/20/94 */ +#ifndef LLVM_SUPPORT_REGEX2_H +#define LLVM_SUPPORT_REGEX2_H + /* * internals of regex_t */ @@ -155,3 +158,5 @@ struct re_guts { /* misc utilities */ #define OUT (CHAR_MAX+1) /* a non-character value */ #define ISWORD(c) (isalnum(c&0xff) || (c) == '_') + +#endif diff --git a/lib/Support/regutils.h b/lib/Support/regutils.h index d0ee100a382b..49a975cd2703 100644 --- a/lib/Support/regutils.h +++ b/lib/Support/regutils.h @@ -35,6 +35,9 @@ * @(#)utils.h 8.3 (Berkeley) 3/20/94 */ +#ifndef LLVM_SUPPORT_REGUTILS_H +#define LLVM_SUPPORT_REGUTILS_H + /* utility definitions */ #define NC (CHAR_MAX - CHAR_MIN + 1) typedef unsigned char uch; @@ -51,3 +54,5 @@ typedef unsigned char uch; #ifdef USEBCOPY #define memmove(d, s, c) bcopy(s, d, c) #endif + +#endif diff --git a/lib/Support/system_error.cpp b/lib/Support/system_error.cpp deleted file mode 100644 index 299f54abb15c..000000000000 --- a/lib/Support/system_error.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//===---------------------- system_error.cpp ------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This was lifted from libc++ and modified for C++03. 
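// Not part of the patch: a sketch of what <system_error> offers in place
// of the llvm::error_category machinery in the system_error.cpp being
// deleted here — a library can derive its own category from
// std::error_category when it needs domain-specific errors. demo_category
// is a hypothetical toy, not an LLVM category.
#include <cassert>
#include <string>
#include <system_error>

namespace {
class demo_category_t : public std::error_category {
public:
  const char *name() const noexcept override { return "demo"; }
  std::string message(int EV) const override {
    return EV == 1 ? "demo failure" : "unknown demo error";
  }
};
} // namespace

static const std::error_category &demo_category() {
  static demo_category_t C; // categories are singletons with stable addresses
  return C;
}

int main() {
  std::error_code EC(1, demo_category());
  assert(EC.message() == "demo failure");
  assert(std::string(EC.category().name()) == "demo");
  return 0;
}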
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/system_error.h" -#include "llvm/Support/Errno.h" -#include -#include - -namespace llvm { - -// class error_category - -error_category::error_category() { -} - -error_category::~error_category() { -} - -error_condition -error_category::default_error_condition(int ev) const { - return error_condition(ev, *this); -} - -bool -error_category::equivalent(int code, const error_condition& condition) const { - return default_error_condition(code) == condition; -} - -bool -error_category::equivalent(const error_code& code, int condition) const { - return *this == code.category() && code.value() == condition; -} - -std::string -_do_message::message(int ev) const { - return std::string(sys::StrError(ev)); -} - -class _generic_error_category : public _do_message { -public: - const char* name() const override; - std::string message(int ev) const override; -}; - -const char* -_generic_error_category::name() const { - return "generic"; -} - -std::string -_generic_error_category::message(int ev) const { -#ifdef ELAST - if (ev > ELAST) - return std::string("unspecified generic_category error"); -#endif // ELAST - return _do_message::message(ev); -} - -const error_category& -generic_category() { - static _generic_error_category s; - return s; -} - -class _system_error_category : public _do_message { -public: - const char* name() const override; - std::string message(int ev) const override; - error_condition default_error_condition(int ev) const override; -}; - -const char* -_system_error_category::name() const { - return "system"; -} - -// std::string _system_error_category::message(int ev) const { -// Is in Platform/system_error.inc - -// error_condition _system_error_category::default_error_condition(int ev) const -// Is in Platform/system_error.inc - -const error_category& -system_category() { - static _system_error_category s; - return s; -} - -const error_category& -posix_category() { -#ifdef LLVM_ON_WIN32 - return generic_category(); -#else - return system_category(); -#endif -} - -// error_condition - -std::string -error_condition::message() const { - return _cat_->message(_val_); -} - -// error_code - -std::string -error_code::message() const { - return _cat_->message(_val_); -} - -} // end namespace llvm - -// Include the truly platform-specific parts of this class. -#if defined(LLVM_ON_UNIX) -#include "Unix/system_error.inc" -#endif -#if defined(LLVM_ON_WIN32) -#include "Windows/system_error.inc" -#endif diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt index 935d674a3603..fb702187d13a 100644 --- a/lib/TableGen/CMakeLists.txt +++ b/lib/TableGen/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMTableGen Error.cpp Main.cpp Record.cpp + SetTheory.cpp StringMatcher.cpp TableGenBackend.cpp TGLexer.cpp diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index 476026dfd767..191307a66bc2 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -20,12 +20,12 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" -#include "llvm/Support/system_error.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include #include +#include using namespace llvm; namespace { @@ -82,8 +82,7 @@ int TableGenMain(char *argv0, TableGenMainFn *MainFn) { // Parse the input file. 
std::unique_ptr File; - if (error_code ec = - MemoryBuffer::getFileOrSTDIN(InputFilename, File)) { + if (std::error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename, File)) { errs() << "Could not open input file '" << InputFilename << "': " << ec.message() <<"\n"; return 1; diff --git a/utils/TableGen/SetTheory.cpp b/lib/TableGen/SetTheory.cpp similarity index 99% rename from utils/TableGen/SetTheory.cpp rename to lib/TableGen/SetTheory.cpp index 5ead7ed16fda..c99c2bab45ab 100644 --- a/utils/TableGen/SetTheory.cpp +++ b/lib/TableGen/SetTheory.cpp @@ -12,10 +12,10 @@ // //===----------------------------------------------------------------------===// -#include "SetTheory.h" #include "llvm/Support/Format.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" +#include "llvm/TableGen/SetTheory.h" using namespace llvm; diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp index 038e0180fbbe..0550692ebce7 100644 --- a/lib/TableGen/TGParser.cpp +++ b/lib/TableGen/TGParser.cpp @@ -360,8 +360,13 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ } if (Records.getDef(IterRec->getNameInitAsString())) { - Error(Loc, "def already exists: " + IterRec->getNameInitAsString()); - return true; + // If this record is anonymous, it's no problem, just generate a new name + if (IterRec->isAnonymous()) + IterRec->setName(GetNewAnonymousName()); + else { + Error(Loc, "def already exists: " + IterRec->getNameInitAsString()); + return true; + } } Records.addDef(IterRec); @@ -782,7 +787,7 @@ Init *TGParser::ParseIDValue(Record *CurRec, /// /// Operation ::= XOperator ['<' Type '>'] '(' Args ')' /// -Init *TGParser::ParseOperation(Record *CurRec) { +Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { switch (Lex.getCode()) { default: TokError("unknown operation"); @@ -845,7 +850,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { ListRecTy *LType = dyn_cast(LHSt->getType()); StringRecTy *SType = dyn_cast(LHSt->getType()); if (!LType && !SType) { - TokError("expected list or string type argumnet in unary operator"); + TokError("expected list or string type argument in unary operator"); return nullptr; } } @@ -853,7 +858,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL) { if (!LHSl && !LHSt) { - TokError("expected list type argumnet in unary operator"); + TokError("expected list type argument in unary operator"); return nullptr; } @@ -877,7 +882,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { assert(LHSt && "expected list type argument in unary operator"); ListRecTy *LType = dyn_cast(LHSt->getType()); if (!LType) { - TokError("expected list type argumnet in unary operator"); + TokError("expected list type argument in unary operator"); return nullptr; } if (Code == UnOpInit::HEAD) { @@ -1021,8 +1026,9 @@ Init *TGParser::ParseOperation(Record *CurRec) { } Lex.Lex(); // eat the ',' - Init *MHS = ParseValue(CurRec); - if (!MHS) return nullptr; + Init *MHS = ParseValue(CurRec, ItemType); + if (!MHS) + return nullptr; if (Lex.getCode() != tgtok::comma) { TokError("expected ',' in ternary operator"); @@ -1030,8 +1036,9 @@ Init *TGParser::ParseOperation(Record *CurRec) { } Lex.Lex(); // eat the ',' - Init *RHS = ParseValue(CurRec); - if (!RHS) return nullptr; + Init *RHS = ParseValue(CurRec, ItemType); + if (!RHS) + return nullptr; if (Lex.getCode() != tgtok::r_paren) { TokError("expected ')' in binary operator"); @@ -1441,7 +1448,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, 
RecTy *ItemType, case tgtok::XIf: case tgtok::XForEach: case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')' - return ParseOperation(CurRec); + return ParseOperation(CurRec, ItemType); } } diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index 6fd442a7c211..9f4b7e90826a 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -181,7 +181,7 @@ class TGParser { std::vector ParseRangeList(); bool ParseRangePiece(std::vector &Ranges); RecTy *ParseType(); - Init *ParseOperation(Record *CurRec); + Init *ParseOperation(Record *CurRec, RecTy *ItemType); RecTy *ParseOperatorType(); Init *ParseObjectName(MultiClass *CurMultiClass); Record *ParseClassID(); diff --git a/lib/TableGen/module.modulemap b/lib/TableGen/module.modulemap new file mode 100644 index 000000000000..8dac0a22c142 --- /dev/null +++ b/lib/TableGen/module.modulemap @@ -0,0 +1 @@ +module TableGen { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 0297de120cb8..1c022aaf86bd 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -1,4 +1,4 @@ -//==-- AArch64.h - Top-level interface for AArch64 representation -*- C++ -*-=// +//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -12,35 +12,38 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_H -#define LLVM_TARGET_AARCH64_H +#ifndef TARGET_AArch64_H +#define TARGET_AArch64_H +#include "Utils/AArch64BaseInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/DataTypes.h" namespace llvm { -class AArch64AsmPrinter; -class FunctionPass; class AArch64TargetMachine; -class MachineInstr; -class MCInst; - -FunctionPass *createAArch64ISelDAG(AArch64TargetMachine &TM, - CodeGenOpt::Level OptLevel); +class FunctionPass; +class MachineFunctionPass; + +FunctionPass *createAArch64DeadRegisterDefinitions(); +FunctionPass *createAArch64ConditionalCompares(); +FunctionPass *createAArch64AdvSIMDScalar(); +FunctionPass *createAArch64BranchRelaxation(); +FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAArch64StorePairSuppressPass(); +FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64LoadStoreOptimizationPass(); +ModulePass *createAArch64PromoteConstantPass(); +FunctionPass *createAArch64AddressTypePromotionPass(); +/// \brief Creates an ARM-specific Target Transformation Info pass. +ImmutablePass * +createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); -FunctionPass *createAArch64BranchFixupPass(); - -/// \brief Creates an AArch64-specific Target Transformation Info pass. 
-ImmutablePass *createAArch64TargetTransformInfoPass( - const AArch64TargetMachine *TM); - -void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AArch64AsmPrinter &AP); - - -} +FunctionPass *createAArch64CollectLOHPass(); +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e49afd60c8e3..e6a27c386b0e 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -1,4 +1,4 @@ -//===- AArch64.td - Describe the AArch64 Target Machine -------*- tblgen -*-==// +//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,11 @@ // //===----------------------------------------------------------------------===// // -// This is the top level entry point for the AArch64 target. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Target-independent interfaces +// Target-independent interfaces which we are implementing //===----------------------------------------------------------------------===// include "llvm/Target/Target.td" @@ -22,7 +21,7 @@ include "llvm/Target/Target.td" // def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", - "Enable ARMv8 FP">; + "Enable ARMv8 FP">; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; @@ -30,54 +29,107 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable cryptographic instructions">; +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable ARMv8 CRC-32 checksum instructions">; + +/// Cyclone has register move instructions which are "free". +def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", + "Has zero-cycle register moves">; + +/// Cyclone has instructions which zero registers for "free". +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" +include "AArch64CallingConvention.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions //===----------------------------------------------------------------------===// -// AArch64 Processors -// include "AArch64Schedule.td" +include "AArch64InstrInfo.td" -class ProcNoItin Features> - : Processor; +def AArch64InstrInfo : InstrInfo; -def : Processor<"generic", GenericItineraries, [FeatureFPARMv8, FeatureNEON]>; +//===----------------------------------------------------------------------===// +// AArch64 Processors supported. 
+// +include "AArch64SchedA53.td" +include "AArch64SchedA57.td" +include "AArch64SchedCyclone.td" def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, FeatureNEON, - FeatureCrypto]>; + FeatureCrypto, + FeatureCRC]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [FeatureFPARMv8, FeatureNEON, - FeatureCrypto]>; + FeatureCrypto, + FeatureCRC]>; + +def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", + "Cyclone", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeatureZCRegMove, FeatureZCZeroing]>; + +def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, + FeatureNEON, + FeatureCRC]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : Processor<"cortex-a57", NoItineraries, [ProcA57]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; +def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; //===----------------------------------------------------------------------===// -// Register File Description +// Assembly parser //===----------------------------------------------------------------------===// -include "AArch64RegisterInfo.td" +def GenericAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "generic"; +} -include "AArch64CallingConv.td" +def AppleAsmParserVariant : AsmParserVariant { + int Variant = 1; + string Name = "apple-neon"; +} //===----------------------------------------------------------------------===// -// Instruction Descriptions +// Assembly printer //===----------------------------------------------------------------------===// +// AArch64 Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def GenericAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} -include "AArch64InstrInfo.td" - -def AArch64InstrInfo : InstrInfo { - let noNamedPositionallyEncodedOperands = 1; +def AppleAsmWriter : AsmWriter { + let AsmWriterClassName = "AppleInstPrinter"; + int Variant = 1; + int isMCAsmWriter = 1; } //===----------------------------------------------------------------------===// -// Declare the target which we are implementing +// Target Declaration //===----------------------------------------------------------------------===// def AArch64 : Target { let InstructionSet = AArch64InstrInfo; + let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; + let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; } diff --git a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp similarity index 90% rename from lib/Target/ARM64/ARM64AddressTypePromotion.cpp rename to lib/Target/AArch64/AArch64AddressTypePromotion.cpp index be2b5eed2ad2..04906f6078f8 100644 --- a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -1,5 +1,4 @@ - -//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===// +//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==// // // The LLVM Compiler Infrastructure // @@ -29,7 +28,7 @@ // FIXME: This pass may be useful for other targets too. 
// ===---------------------------------------------------------------------===// -#include "ARM64.h" +#include "AArch64.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -45,38 +44,38 @@ using namespace llvm; -#define DEBUG_TYPE "arm64-type-promotion" +#define DEBUG_TYPE "aarch64-type-promotion" static cl::opt -EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden, +EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden, cl::desc("Enable the type promotion pass"), cl::init(true)); static cl::opt -EnableMerge("arm64-type-promotion-merge", cl::Hidden, +EnableMerge("aarch64-type-promotion-merge", cl::Hidden, cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); //===----------------------------------------------------------------------===// -// ARM64AddressTypePromotion +// AArch64AddressTypePromotion //===----------------------------------------------------------------------===// namespace llvm { -void initializeARM64AddressTypePromotionPass(PassRegistry &); +void initializeAArch64AddressTypePromotionPass(PassRegistry &); } namespace { -class ARM64AddressTypePromotion : public FunctionPass { +class AArch64AddressTypePromotion : public FunctionPass { public: static char ID; - ARM64AddressTypePromotion() + AArch64AddressTypePromotion() : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { - initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); + initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); } const char *getPassName() const override { - return "ARM64 Address Type Promotion"; + return "AArch64 Address Type Promotion"; } /// Iterate over the functions and promote the computation of interesting @@ -140,19 +139,19 @@ class ARM64AddressTypePromotion : public FunctionPass { }; } // end anonymous namespace. -char ARM64AddressTypePromotion::ID = 0; +char AArch64AddressTypePromotion::ID = 0; -INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) +INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) +INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) -FunctionPass *llvm::createARM64AddressTypePromotionPass() { - return new ARM64AddressTypePromotion(); +FunctionPass *llvm::createAArch64AddressTypePromotionPass() { + return new AArch64AddressTypePromotion(); } -bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) { +bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) { if (isa(Inst)) return true; @@ -175,7 +174,7 @@ bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) { return false; } -bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { +bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { // If the type of the sext is the same as the considered one, this sext // will become useless. 
// Otherwise, we will have to do something to preserve the original value, @@ -211,7 +210,7 @@ static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { } bool -ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { +AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { if (SExt->getType() != ConsideredSExtType) return false; @@ -249,7 +248,7 @@ ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { // = a // Iterate on 'c'. bool -ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { +AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); bool LocalChange = false; @@ -375,8 +374,8 @@ ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { return LocalChange; } -void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove) { +void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove) { DominatorTree &DT = getAnalysis().getDomTree(); for (auto &Entry : ValToSExtendedUses) { @@ -414,7 +413,7 @@ void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, } } -void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { +void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); DenseMap SeenChains; @@ -479,7 +478,7 @@ void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { } } -bool ARM64AddressTypePromotion::runOnFunction(Function &F) { +bool AArch64AddressTypePromotion::runOnFunction(Function &F) { if (!EnableAddressTypePromotion || F.isDeclaration()) return false; Func = &F; diff --git a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp similarity index 85% rename from lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp rename to lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 5950a8f18e1e..734fb215e6ee 100644 --- a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -1,4 +1,4 @@ -//===-- ARM64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// +//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// // // The LLVM Compiler Infrastructure // @@ -33,9 +33,9 @@ // solution. //===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64RegisterInfo.h" +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" @@ -47,12 +47,12 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define DEBUG_TYPE "arm64-simd-scalar" +#define DEBUG_TYPE "aarch64-simd-scalar" // Allow forcing all i64 operations with equivalent SIMD instructions to use // them. For stress-testing the transformation function. 
static cl::opt -TransformAll("arm64-simd-scalar-force-all", +TransformAll("aarch64-simd-scalar-force-all", cl::desc("Force use of AdvSIMD scalar instructions everywhere"), cl::init(false), cl::Hidden); @@ -61,9 +61,9 @@ STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); namespace { -class ARM64AdvSIMDScalar : public MachineFunctionPass { +class AArch64AdvSIMDScalar : public MachineFunctionPass { MachineRegisterInfo *MRI; - const ARM64InstrInfo *TII; + const AArch64InstrInfo *TII; private: // isProfitableToTransform - Predicate function to determine whether an @@ -81,7 +81,7 @@ class ARM64AdvSIMDScalar : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid. - explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {} + explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; @@ -94,7 +94,7 @@ class ARM64AdvSIMDScalar : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } }; -char ARM64AdvSIMDScalar::ID = 0; +char AArch64AdvSIMDScalar::ID = 0; } // end anonymous namespace static bool isGPR64(unsigned Reg, unsigned SubReg, @@ -102,20 +102,20 @@ static bool isGPR64(unsigned Reg, unsigned SubReg, if (SubReg) return false; if (TargetRegisterInfo::isVirtualRegister(Reg)) - return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass); - return ARM64::GPR64RegClass.contains(Reg); + return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); + return AArch64::GPR64RegClass.contains(Reg); } static bool isFPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { if (TargetRegisterInfo::isVirtualRegister(Reg)) - return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) && + return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && SubReg == 0) || - (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) && - SubReg == ARM64::dsub); + (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && + SubReg == AArch64::dsub); // Physical register references just check the register class directly. - return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) || - (ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub); + return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || + (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); } // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 @@ -125,17 +125,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI, unsigned &SubReg) { SubReg = 0; // The "FMOV Xd, Dn" instruction is the typical form. - if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr) + if (MI->getOpcode() == AArch64::FMOVDXr || + MI->getOpcode() == AArch64::FMOVXDr) return MI->getOperand(1).getReg(); // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see // these at this stage, but it's easy to check for. - if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { - SubReg = ARM64::dsub; + if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { + SubReg = AArch64::dsub; return MI->getOperand(1).getReg(); } // Or just a plain COPY instruction. This can be directly to/from FPR64, // or it can be a dsub subreg reference to an FPR128. 
- if (MI->getOpcode() == ARM64::COPY) { + if (MI->getOpcode() == AArch64::COPY) { if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) @@ -161,10 +162,10 @@ static int getTransformOpcode(unsigned Opc) { default: break; // FIXME: Lots more possibilities. - case ARM64::ADDXrr: - return ARM64::ADDv1i64; - case ARM64::SUBXrr: - return ARM64::SUBv1i64; + case AArch64::ADDXrr: + return AArch64::ADDv1i64; + case AArch64::SUBXrr: + return AArch64::SUBv1i64; } // No AdvSIMD equivalent, so just return the original opcode. return Opc; @@ -178,7 +179,8 @@ static bool isTransformable(const MachineInstr *MI) { // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { +bool +AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { // If this instruction isn't eligible to be transformed (no SIMD equivalent), // early exit since that's the common case. if (!isTransformable(MI)) @@ -238,8 +240,8 @@ bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { // preferable to have it use the FPR64 in most cases, as if the source // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. // Ditto for a lane insert. - else if (Use->getOpcode() == ARM64::INSERT_SUBREG || - Use->getOpcode() == ARM64::INSvi64gpr) + else if (Use->getOpcode() == AArch64::INSERT_SUBREG || + Use->getOpcode() == AArch64::INSvi64gpr) ; else AllUsesAreCopies = false; @@ -259,10 +261,10 @@ bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { return TransformAll; } -static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI, +static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI, unsigned Dst, unsigned Src, bool IsKill) { MachineInstrBuilder MIB = - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY), + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), Dst) .addReg(Src, getKillRegState(IsKill)); DEBUG(dbgs() << " adding copy: " << *MIB); @@ -273,7 +275,7 @@ static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI, // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. -void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { DEBUG(dbgs() << "Scalar transform: " << *MI); MachineBasicBlock *MBB = MI->getParent(); @@ -316,19 +318,19 @@ void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { // copy. if (!Src0) { SubReg0 = 0; - Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass); + Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); insertCopy(TII, MI, Src0, OrigSrc0, true); } if (!Src1) { SubReg1 = 0; - Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass); + Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); insertCopy(TII, MI, Src1, OrigSrc1, true); } // Create a vreg for the destination. // FIXME: No need to do this if the ultimate user expects an FPR64. // Check for that and avoid the copy if possible. 
- unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass); + unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); // For now, all of the new instructions have the same simple three-register // form, so no need to special case based on what instruction we're @@ -349,7 +351,7 @@ void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { } // processMachineBasicBlock - Main optimzation loop. -bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { +bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { bool Changed = false; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { MachineInstr *MI = I; @@ -363,13 +365,13 @@ bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { } // runOnMachineFunction - Pass entry point from PassManager. -bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { +bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { bool Changed = false; - DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n"); + DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); const TargetMachine &TM = mf.getTarget(); MRI = &mf.getRegInfo(); - TII = static_cast(TM.getInstrInfo()); + TII = static_cast(TM.getInstrInfo()); // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) @@ -378,8 +380,8 @@ bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { return Changed; } -// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine +// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine // to add the pass to the PassManager. -FunctionPass *llvm::createARM64AdvSIMDScalar() { - return new ARM64AdvSIMDScalar(); +FunctionPass *llvm::createAArch64AdvSIMDScalar() { + return new AArch64AdvSIMDScalar(); } diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 5b5148351fac..cd94e244dc38 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1,4 +1,4 @@ -//===-- AArch64AsmPrinter.cpp - Print machine code to an AArch64 .s file --===// +//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===// // // The LLVM Compiler Infrastructure // @@ -8,237 +8,337 @@ //===----------------------------------------------------------------------===// // // This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to GAS-format AArch64 assembly language. +// of machine-dependent LLVM code to the AArch64 assembly language. 
// //===----------------------------------------------------------------------===// -#include "AArch64AsmPrinter.h" +#include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64MCInstLower.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" #include "InstPrinter/AArch64InstPrinter.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" -#include "llvm/IR/Mangler.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" - using namespace llvm; #define DEBUG_TYPE "asm-printer" -/// Try to print a floating-point register as if it belonged to a specified -/// register-class. For example the inline asm operand modifier "b" requires its -/// argument to be printed as "bN". -static bool printModifiedFPRAsmOperand(const MachineOperand &MO, - const TargetRegisterInfo *TRI, - char RegType, raw_ostream &O) { - if (!MO.isReg()) - return true; - - for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) { - if (AArch64::FPR8RegClass.contains(*AR)) { - O << RegType << TRI->getEncodingValue(MO.getReg()); - return false; +namespace { + +class AArch64AsmPrinter : public AsmPrinter { + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when printing asm code for different targets. + const AArch64Subtarget *Subtarget; + + AArch64MCInstLower MCInstLowering; + StackMaps SM; + +public: + AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), + Subtarget(&TM.getSubtarget()), + MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr), + LOHLabelCounter(0) {} + + const char *getPassName() const override { + return "AArch64 Assembly Printer"; + } + + /// \brief Wrapper for MCInstLowering.lowerOperand() for the + /// tblgen'erated pseudo lowering. + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { + return MCInstLowering.lowerOperand(MO, MCOp); + } + + void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + /// \brief tblgen'erated driver function for lowering simple MI->MC + /// pseudo instructions. 
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + + void EmitInstruction(const MachineInstr *MI) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AsmPrinter::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + + bool runOnMachineFunction(MachineFunction &F) override { + AArch64FI = F.getInfo(); + return AsmPrinter::runOnMachineFunction(F); + } + +private: + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); + bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); + bool printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, bool isVector, + raw_ostream &O); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + void EmitFunctionBodyEnd() override; + + MCSymbol *GetCPISymbol(unsigned CPID) const override; + void EmitEndOfAsmFile(Module &M) override; + AArch64FunctionInfo *AArch64FI; + + /// \brief Emit the LOHs contained in AArch64FI. + void EmitLOHs(); + + typedef std::map MInstToMCSymbol; + MInstToMCSymbol LOHInstToLabel; + unsigned LOHLabelCounter; +}; + +} // end of anonymous namespace + +//===----------------------------------------------------------------------===// + +void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetMachO()) { + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + SM.serializeToStackMapSection(); + } + + // Emit a .data.rel section containing any stubs that were created. + if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const DataLayout *TD = TM.getDataLayout(); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + TD->getPointerSize(0)); + } + Stubs.clear(); } } - // The register doesn't correspond to anything floating-point like. - return true; } -/// Implements the 'w' and 'x' inline asm operand modifiers, which print a GPR -/// with the obvious type and an immediate 0 as either wzr or xzr. -static bool printModifiedGPRAsmOperand(const MachineOperand &MO, - const TargetRegisterInfo *TRI, - const TargetRegisterClass &RegClass, - raw_ostream &O) { - char Prefix = &RegClass == &AArch64::GPR32RegClass ? 'w' : 'x'; +MachineLocation +AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. 
Currently handles register +- offset only. + if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} - if (MO.isImm() && MO.getImm() == 0) { - O << Prefix << "zr"; - return false; - } else if (MO.isReg()) { - if (MO.getReg() == AArch64::XSP || MO.getReg() == AArch64::WSP) { - O << (Prefix == 'x' ? "sp" : "wsp"); - return false; - } +void AArch64AsmPrinter::EmitLOHs() { + SmallVector MCArgs; - for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) { - if (RegClass.contains(*AR)) { - O << AArch64InstPrinter::getRegisterName(*AR); - return false; - } + for (const auto &D : AArch64FI->getLOHContainer()) { + for (const MachineInstr *MI : D.getArgs()) { + MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); + assert(LabelIt != LOHInstToLabel.end() && + "Label hasn't been inserted for LOH related instruction"); + MCArgs.push_back(LabelIt->second); } + OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); + MCArgs.clear(); } +} + +void AArch64AsmPrinter::EmitFunctionBodyEnd() { + if (!AArch64FI->getLOHRelated().empty()) + EmitLOHs(); +} - return true; +/// GetCPISymbol - Return the symbol for the specified constant pool entry. +MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const { + // Darwin uses a linker-private symbol name for constant-pools (to + // avoid addends on the relocation?), ELF has no such concept and + // uses a normal private symbol. + if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) + return OutContext.GetOrCreateSymbol( + Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); + + return OutContext.GetOrCreateSymbol( + Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); } -bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO, - bool PrintImmediatePrefix, - StringRef Suffix, raw_ostream &O) { - StringRef Name; - StringRef Modifier; +void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); switch (MO.getType()) { default: - return true; - case MachineOperand::MO_GlobalAddress: - Name = getSymbol(MO.getGlobal())->getName(); - - // Global variables may be accessed either via a GOT or in various fun and - // interesting TLS-model specific ways. Set the prefix modifier as - // appropriate here. 
- if (const GlobalVariable *GV = dyn_cast(MO.getGlobal())) { - Reloc::Model RelocM = TM.getRelocationModel(); - if (GV->isThreadLocal()) { - switch (TM.getTLSModel(GV)) { - case TLSModel::GeneralDynamic: - Modifier = "tlsdesc"; - break; - case TLSModel::LocalDynamic: - Modifier = "dtprel"; - break; - case TLSModel::InitialExec: - Modifier = "gottprel"; - break; - case TLSModel::LocalExec: - Modifier = "tprel"; - break; - } - } else if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) { - Modifier = "got"; - } - } + llvm_unreachable(""); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << AArch64InstPrinter::getRegisterName(Reg); break; - case MachineOperand::MO_BlockAddress: - Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName(); + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#' << Imm; break; - case MachineOperand::MO_ConstantPoolIndex: - Name = GetCPISymbol(MO.getIndex())->getName(); + } + } +} + +bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, + raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: + return true; // Unknown mode. + case 'w': + Reg = getWRegFromXReg(Reg); + break; + case 'x': + Reg = getXRegFromWReg(Reg); break; } - // Some instructions (notably ADRP) don't take the # prefix for - // immediates. Only print it if asked to. - if (PrintImmediatePrefix) - O << '#'; - - // Only need the joining "_" if both the prefix and the suffix are - // non-null. This little block simply takes care of the four possibly - // combinations involved there. - if (Modifier == "" && Suffix == "") - O << Name; - else if (Modifier == "" && Suffix != "") - O << ":" << Suffix << ':' << Name; - else if (Modifier != "" && Suffix == "") - O << ":" << Modifier << ':' << Name; - else - O << ":" << Modifier << '_' << Suffix << ':' << Name; + O << AArch64InstPrinter::getRegisterName(Reg); + return false; +} +// Prints the register in MO using class RC using the offset in the +// new register class. This should not be used for cross class +// printing. +bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, + bool isVector, raw_ostream &O) { + assert(MO.isReg() && "Should only get here with a register!"); + const AArch64RegisterInfo *RI = + static_cast(TM.getRegisterInfo()); + unsigned Reg = MO.getReg(); + unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); + assert(RI->regsOverlap(RegToPrint, Reg)); + O << AArch64InstPrinter::getRegisterName( + RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName); return false; } bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + const MachineOperand &MO = MI->getOperand(OpNum); - if (!ExtraCode) - ExtraCode = ""; + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + return false; - switch(ExtraCode[0]) { - default: - if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. 
+ case 'w': // Print W register + case 'x': // Print X register + if (MO.isReg()) + return printAsmMRegister(MO, ExtraCode[0], O); + if (MO.isImm() && MO.getImm() == 0) { + unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR; + O << AArch64InstPrinter::getRegisterName(Reg); return false; - break; - case 'w': - // Output 32-bit general register operand, constant zero as wzr, or stack - // pointer as wsp. Ignored when used with other operand types. - if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR32RegClass, O)) - return false; - break; - case 'x': - // Output 64-bit general register operand, constant zero as xzr, or stack - // pointer as sp. Ignored when used with other operand types. - if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR64RegClass, O)) - return false; - break; - case 'H': - // Output higher numbered of a 64-bit general register pair - case 'Q': - // Output least significant register of a 64-bit general register pair - case 'R': - // Output most significant register of a 64-bit general register pair - - // FIXME note: these three operand modifiers will require, to some extent, - // adding a paired GPR64 register class. Initial investigation suggests that - // assertions are hit unless it has a type and is made legal for that type - // in ISelLowering. After that step is made, the number of modifications - // needed explodes (operation legality, calling conventions, stores, reg - // copies ...). - llvm_unreachable("FIXME: Unimplemented register pairs"); - case 'b': - case 'h': - case 's': - case 'd': - case 'q': - if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - ExtraCode[0], O)) - return false; - break; - case 'A': - // Output symbolic address with appropriate relocation modifier (also - // suitable for ADRP). - if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O)) - return false; - break; - case 'L': - // Output bits 11:0 of symbolic address with appropriate :lo12: relocation - // modifier. - if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O)) + } + printOperand(MI, OpNum, O); return false; - break; - case 'G': - // Output bits 23:12 of symbolic address with appropriate :hi12: relocation - // modifier (currently only for TLS local exec). - if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O)) + case 'b': // Print B register. + case 'h': // Print H register. + case 's': // Print S register. + case 'd': // Print D register. + case 'q': // Print Q register. + if (MO.isReg()) { + const TargetRegisterClass *RC; + switch (ExtraCode[0]) { + case 'b': + RC = &AArch64::FPR8RegClass; + break; + case 'h': + RC = &AArch64::FPR16RegClass; + break; + case 's': + RC = &AArch64::FPR32RegClass; + break; + case 'd': + RC = &AArch64::FPR64RegClass; + break; + case 'q': + RC = &AArch64::FPR128RegClass; + break; + default: + return true; + } + return printAsmRegInClass(MO, RC, false /* vector */, O); + } + printOperand(MI, OpNum, O); return false; - break; - case 'a': - return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); + } } - // There's actually no operand modifier, which leads to a slightly eclectic - // set of behaviour which we have to handle here. - const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - llvm_unreachable("Unexpected operand for inline assembly"); - case MachineOperand::MO_Register: - // GCC prints the unmodified operand of a 'w' constraint as the vector - // register. 
Technically, we could allocate the argument as a VPR128, but - // that leads to extremely dodgy copies being generated to get the data - // there. - if (printModifiedFPRAsmOperand(MO, TRI, 'v', O)) - O << AArch64InstPrinter::getRegisterName(MO.getReg()); - break; - case MachineOperand::MO_Immediate: - O << '#' << MO.getImm(); - break; - case MachineOperand::MO_FPImmediate: - assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected"); - O << "#0.0"; - break; - case MachineOperand::MO_BlockAddress: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_GlobalAddress: - return printSymbolicAddress(MO, false, "", O); + // According to ARM, we should emit x and v registers unless we have a + // modifier. + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + + // If this is a w or x register, print an x register. + if (AArch64::GPR32allRegClass.contains(Reg) || + AArch64::GPR64allRegClass.contains(Reg)) + return printAsmMRegister(MO, 'x', O); + + // If this is a b, h, s, d, or q register, print it as a v register. + return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */, + O); } + printOperand(MI, OpNum, O); return false; } @@ -247,15 +347,90 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - // Currently both the memory constraints (m and Q) behave the same and amount - // to the address as a single register. In future, we may allow "m" to provide - // both a base and an offset. + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isReg() && "unexpected inline assembly memory operand"); - O << '[' << AArch64InstPrinter::getRegisterName(MO.getReg()) << ']'; + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]"; return false; } +void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps == 4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; + printOperand(MI, 0, OS); + OS << '+'; + printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps - 2, OS); +} + +void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + unsigned NumNOPBytes = MI.getOperand(1).getImm(); + + SM.recordStackMap(MI); + // Emit padding. 
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + for (unsigned i = 0; i < NumNOPBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Lower a patchpoint of the form: +// [], , , , +void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + SM.recordPatchPoint(MI); + + PatchPointOpers Opers(&MI); + + int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + unsigned EncodedBytes = 0; + if (CallTarget) { + assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && + "High 16 bits of call target should be zero."); + unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + EncodedBytes = 16; + // Materialize the jump address: + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + .addReg(ScratchReg) + .addImm((CallTarget >> 32) & 0xFFFF) + .addImm(32)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm((CallTarget >> 16) & 0xFFFF) + .addImm(16)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(CallTarget & 0xFFFF) + .addImm(0)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg)); + } + // Emit padding. + unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + assert((NumBytes - EncodedBytes) % 4 == 0 && + "Invalid number of NOP bytes requested!"); + for (unsigned i = EncodedBytes; i < NumBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Simple pseudo-instructions have their lowering (with expansion to real +// instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { @@ -263,41 +438,87 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(OutStreamer, MI)) return; - MCInst TmpInst; - LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this); - EmitToStreamer(OutStreamer, TmpInst); -} + if (AArch64FI->getLOHRelated().count(MI)) { + // Generate a label for LOH related instruction + MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); + // Associate the instruction with the label + LOHInstToLabel[MI] = LOHLabel; + OutStreamer.EmitLabel(LOHLabel); + } -void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetELF()) { - const TargetLoweringObjectFileELF &TLOFELF = - static_cast(getObjFileLowering()); + // Do any manual lowerings. + switch (MI->getOpcode()) { + default: + break; + case AArch64::DBG_VALUE: { + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + } - MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); + // Tail calls use pseudo instructions so they have the proper code-gen + // attributes (isCall, isReturn, etc.). We lower them to the real + // instruction here. 
+ case AArch64::TCRETURNri: { + MCInst TmpInst; + TmpInst.setOpcode(AArch64::BR); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case AArch64::TCRETURNdi: { + MCOperand Dest; + MCInstLowering.lowerOperand(MI->getOperand(0), Dest); + MCInst TmpInst; + TmpInst.setOpcode(AArch64::B); + TmpInst.addOperand(Dest); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case AArch64::TLSDESC_BLR: { + MCOperand Callee, Sym; + MCInstLowering.lowerOperand(MI->getOperand(0), Callee); + MCInstLowering.lowerOperand(MI->getOperand(1), Sym); + + // First emit a relocation-annotation. This expands to no code, but requests + // the following instruction gets an R_AARCH64_TLSDESC_CALL. + MCInst TLSDescCall; + TLSDescCall.setOpcode(AArch64::TLSDESCCALL); + TLSDescCall.addOperand(Sym); + EmitToStreamer(OutStreamer, TLSDescCall); + + // Other than that it's just a normal indirect call to the function loaded + // from the descriptor. + MCInst BLR; + BLR.setOpcode(AArch64::BLR); + BLR.addOperand(Callee); + EmitToStreamer(OutStreamer, BLR); - // Output stubs for external and common global variables. - MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); + return; + } - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(0)); - } - Stubs.clear(); - } + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(OutStreamer, SM, *MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(OutStreamer, SM, *MI); } -} -bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) { - return AsmPrinter::runOnMachineFunction(MF); + // Finally, do the automated lowerings for everything else. + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(OutStreamer, TmpInst); } // Force static initialization. extern "C" void LLVMInitializeAArch64AsmPrinter() { - RegisterAsmPrinter X(TheAArch64leTarget); - RegisterAsmPrinter Y(TheAArch64beTarget); -} + RegisterAsmPrinter X(TheAArch64leTarget); + RegisterAsmPrinter Y(TheAArch64beTarget); + RegisterAsmPrinter Z(TheARM64leTarget); + RegisterAsmPrinter W(TheARM64beTarget); +} diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h deleted file mode 100644 index f77553c7b8b6..000000000000 --- a/lib/Target/AArch64/AArch64AsmPrinter.h +++ /dev/null @@ -1,76 +0,0 @@ -// AArch64AsmPrinter.h - Print machine code to an AArch64 .s file -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the AArch64 assembly printer class. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AARCH64ASMPRINTER_H -#define LLVM_AARCH64ASMPRINTER_H - -#include "AArch64.h" -#include "AArch64TargetMachine.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { - -class MCOperand; - -class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { - - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can - /// make the right decision when printing asm code for different targets. - const AArch64Subtarget *Subtarget; - - // emitPseudoExpansionLowering - tblgen'erated. - bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, - const MachineInstr *MI); - - public: - explicit AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) { - Subtarget = &TM.getSubtarget(); - } - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - - MCOperand lowerSymbolOperand(const MachineOperand &MO, - const MCSymbol *Sym) const; - - void EmitInstruction(const MachineInstr *MI) override; - void EmitEndOfAsmFile(Module &M) override; - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - - /// printSymbolicAddress - Given some kind of reasonably bare symbolic - /// reference, print out the appropriate asm string to represent it. If - /// appropriate, a relocation-specifier will be produced, composed of a - /// general class derived from the MO parameter and an instruction-specific - /// suffix, provided in Suffix. E.g. ":got_lo12:" if a Suffix of "lo12" is - /// given. - bool printSymbolicAddress(const MachineOperand &MO, - bool PrintImmediatePrefix, - StringRef Suffix, raw_ostream &O); - - const char *getPassName() const override { - return "AArch64 Assembly Printer"; - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp deleted file mode 100644 index 585cbee9966b..000000000000 --- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp +++ /dev/null @@ -1,601 +0,0 @@ -//===-- AArch64BranchFixupPass.cpp - AArch64 branch fixup -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that fixes AArch64 branches which have ended up out -// of range for their immediate operands. 
-// -//===----------------------------------------------------------------------===// - -#include "AArch64.h" -#include "AArch64InstrInfo.h" -#include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "aarch64-branch-fixup" - -STATISTIC(NumSplit, "Number of uncond branches inserted"); -STATISTIC(NumCBrFixed, "Number of cond branches fixed"); - -/// Return the worst case padding that could result from unknown offset bits. -/// This does not include alignment padding caused by known offset bits. -/// -/// @param LogAlign log2(alignment) -/// @param KnownBits Number of known low offset bits. -static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { - if (KnownBits < LogAlign) - return (1u << LogAlign) - (1u << KnownBits); - return 0; -} - -namespace { - /// Due to limited PC-relative displacements, conditional branches to distant - /// blocks may need converting into an unconditional equivalent. For example: - /// tbz w1, #0, far_away - /// becomes - /// tbnz w1, #0, skip - /// b far_away - /// skip: - class AArch64BranchFixup : public MachineFunctionPass { - /// Information about the offset and size of a single basic block. - struct BasicBlockInfo { - /// Distance from the beginning of the function to the beginning of this - /// basic block. - /// - /// Offsets are computed assuming worst case padding before an aligned - /// block. This means that subtracting basic block offsets always gives a - /// conservative estimate of the real distance which may be smaller. - /// - /// Because worst case padding is used, the computed offset of an aligned - /// block may not actually be aligned. - unsigned Offset; - - /// Size of the basic block in bytes. If the block contains inline - /// assembly, this is a worst case estimate. - /// - /// The size does not include any alignment padding whether from the - /// beginning of the block, or from an aligned jump table at the end. - unsigned Size; - - /// The number of low bits in Offset that are known to be exact. The - /// remaining bits of Offset are an upper bound. - uint8_t KnownBits; - - /// When non-zero, the block contains instructions (inline asm) of unknown - /// size. The real size may be smaller than Size bytes by a multiple of 1 - /// << Unalign. - uint8_t Unalign; - - BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0) {} - - /// Compute the number of known offset bits internally to this block. - /// This number should be used to predict worst case padding when - /// splitting the block. - unsigned internalKnownBits() const { - unsigned Bits = Unalign ? Unalign : KnownBits; - // If the block size isn't a multiple of the known bits, assume the - // worst case padding. - if (Size & ((1u << Bits) - 1)) - Bits = countTrailingZeros(Size); - return Bits; - } - - /// Compute the offset immediately following this block. If LogAlign is - /// specified, return the offset the successor block will get if it has - /// this alignment. - unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - if (!LogAlign) - return PO; - // Add alignment padding from the terminator. - return PO + UnknownPadding(LogAlign, internalKnownBits()); - } - - /// Compute the number of known low bits of postOffset. 
If this block - /// contains inline asm, the number of known bits drops to the - /// instruction alignment. An aligned terminator may increase the number - /// of know bits. - /// If LogAlign is given, also consider the alignment of the next block. - unsigned postKnownBits(unsigned LogAlign = 0) const { - return std::max(LogAlign, internalKnownBits()); - } - }; - - std::vector BBInfo; - - /// One per immediate branch, keeping the machine instruction pointer, - /// conditional or unconditional, the max displacement, and (if IsCond is - /// true) the corresponding inverted branch opcode. - struct ImmBranch { - MachineInstr *MI; - unsigned OffsetBits : 31; - bool IsCond : 1; - ImmBranch(MachineInstr *mi, unsigned offsetbits, bool cond) - : MI(mi), OffsetBits(offsetbits), IsCond(cond) {} - }; - - /// Keep track of all the immediate branch instructions. - /// - std::vector ImmBranches; - - MachineFunction *MF; - const AArch64InstrInfo *TII; - public: - static char ID; - AArch64BranchFixup() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "AArch64 branch fixup pass"; - } - - private: - void initializeFunctionInfo(); - MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); - void adjustBBOffsetsAfter(MachineBasicBlock *BB); - bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, - unsigned OffsetBits); - bool fixupImmediateBr(ImmBranch &Br); - bool fixupConditionalBr(ImmBranch &Br); - - void computeBlockSize(MachineBasicBlock *MBB); - unsigned getOffsetOf(MachineInstr *MI) const; - void dumpBBs(); - void verify(); - }; - char AArch64BranchFixup::ID = 0; -} - -/// check BBOffsets -void AArch64BranchFixup::verify() { -#ifndef NDEBUG - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - unsigned MBBId = MBB->getNumber(); - assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); - } -#endif -} - -/// print block size and offset information - debugging -void AArch64BranchFixup::dumpBBs() { - DEBUG({ - for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) { - const BasicBlockInfo &BBI = BBInfo[J]; - dbgs() << format("%08x BB#%u\t", BBI.Offset, J) - << " kb=" << unsigned(BBI.KnownBits) - << " ua=" << unsigned(BBI.Unalign) - << format(" size=%#x\n", BBInfo[J].Size); - } - }); -} - -/// Returns an instance of the branch fixup pass. -FunctionPass *llvm::createAArch64BranchFixupPass() { - return new AArch64BranchFixup(); -} - -bool AArch64BranchFixup::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - DEBUG(dbgs() << "***** AArch64BranchFixup ******"); - TII = (const AArch64InstrInfo*)MF->getTarget().getInstrInfo(); - - // This pass invalidates liveness information when it splits basic blocks. - MF->getRegInfo().invalidateLiveness(); - - // Renumber all of the machine basic blocks in the function, guaranteeing that - // the numbers agree with the position of the block in the function. - MF->RenumberBlocks(); - - // Do the initial scan of the function, building up information about the - // sizes of each block and location of each immediate branch. - initializeFunctionInfo(); - - // Iteratively fix up branches until there is no change. 
- unsigned NoBRIters = 0; - bool MadeChange = false; - while (true) { - DEBUG(dbgs() << "Beginning iteration #" << NoBRIters << '\n'); - bool BRChange = false; - for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) - BRChange |= fixupImmediateBr(ImmBranches[i]); - if (BRChange && ++NoBRIters > 30) - report_fatal_error("Branch Fix Up pass failed to converge!"); - DEBUG(dumpBBs()); - - if (!BRChange) - break; - MadeChange = true; - } - - // After a while, this might be made debug-only, but it is not expensive. - verify(); - - DEBUG(dbgs() << '\n'; dumpBBs()); - - BBInfo.clear(); - ImmBranches.clear(); - - return MadeChange; -} - -/// Return true if the specified basic block can fallthrough into the block -/// immediately after it. -static bool BBHasFallthrough(MachineBasicBlock *MBB) { - // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; - // Can't fall off end of function. - if (std::next(MBBI) == MBB->getParent()->end()) - return false; - - MachineBasicBlock *NextBB = std::next(MBBI); - for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) - if (*I == NextBB) - return true; - - return false; -} - -/// Do the initial scan of the function, building up information about the sizes -/// of each block, and each immediate branch. -void AArch64BranchFixup::initializeFunctionInfo() { - BBInfo.clear(); - BBInfo.resize(MF->getNumBlockIDs()); - - // First thing, compute the size of all basic blocks, and see if the function - // has any inline assembly in it. If so, we have to be conservative about - // alignment assumptions, as we don't know for sure the size of any - // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); - - // The known bits of the entry block offset are determined by the function - // alignment. - BBInfo.front().KnownBits = MF->getAlignment(); - - // Compute block offsets and known bits. - adjustBBOffsetsAfter(MF->begin()); - - // Now go back through the instructions and build up our data structures. - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (I->isDebugValue()) - continue; - - int Opc = I->getOpcode(); - if (I->isBranch()) { - bool IsCond = false; - - // The offsets encoded in instructions here scale by the instruction - // size (4 bytes), effectively increasing their range by 2 bits. - unsigned Bits = 0; - switch (Opc) { - default: - continue; // Ignore other JT branches - case AArch64::TBZxii: - case AArch64::TBZwii: - case AArch64::TBNZxii: - case AArch64::TBNZwii: - IsCond = true; - Bits = 14 + 2; - break; - case AArch64::Bcc: - case AArch64::CBZx: - case AArch64::CBZw: - case AArch64::CBNZx: - case AArch64::CBNZw: - IsCond = true; - Bits = 19 + 2; - break; - case AArch64::Bimm: - Bits = 26 + 2; - break; - } - - // Record this immediate branch. - ImmBranches.push_back(ImmBranch(I, Bits, IsCond)); - } - } - } -} - -/// Compute the size and some alignment information for MBB. This function -/// updates BBInfo directly. 
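To follow the offset bookkeeping above: a block's start offset is only known conservatively, so alignment padding has to be assumed pessimistically from the number of exactly-known low offset bits. A minimal standalone sketch of that computation (the names echo the deleted pass, but this is illustrative rather than its code):

// Worst-case padding in front of a block aligned to (1 << logAlign) when
// only the low `knownBits` bits of the current offset are exact.
static unsigned unknownPadding(unsigned logAlign, unsigned knownBits) {
  return knownBits < logAlign ? (1u << logAlign) - (1u << knownBits) : 0;
}

// Conservative offset of the next block: this block's offset plus its size,
// plus whatever padding might be needed to reach the next block's alignment.
// For example, a block whose offset is only known to be 4-byte aligned
// (knownBits = 2) followed by a 16-byte aligned block (logAlign = 4) must
// assume 16 - 4 = 12 bytes of padding.
static unsigned postOffset(unsigned offset, unsigned size, unsigned knownBits,
                           unsigned nextLogAlign) {
  return offset + size + unknownPadding(nextLogAlign, knownBits);
}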
-void AArch64BranchFixup::computeBlockSize(MachineBasicBlock *MBB) { - BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; - BBI.Size = 0; - BBI.Unalign = 0; - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - BBI.Size += TII->getInstSizeInBytes(*I); - // For inline asm, GetInstSizeInBytes returns a conservative estimate. - // The actual size may be smaller, but still a multiple of the instr size. - if (I->isInlineAsm()) - BBI.Unalign = 2; - } -} - -/// Return the current offset of the specified machine instruction from the -/// start of the function. This offset changes as stuff is moved around inside -/// the function. -unsigned AArch64BranchFixup::getOffsetOf(MachineInstr *MI) const { - MachineBasicBlock *MBB = MI->getParent(); - - // The offset is composed of two things: the sum of the sizes of all MBB's - // before this instruction's block, and the offset from the start of the block - // it is in. - unsigned Offset = BBInfo[MBB->getNumber()].Offset; - - // Sum instructions before MI in MBB. - for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { - assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->getInstSizeInBytes(*I); - } - return Offset; -} - -/// Split the basic block containing MI into two blocks, which are joined by -/// an unconditional branch. Update data structures and renumber blocks to -/// account for this change and returns the newly created block. -MachineBasicBlock * -AArch64BranchFixup::splitBlockBeforeInstr(MachineInstr *MI) { - MachineBasicBlock *OrigBB = MI->getParent(); - - // Create a new MBB for the code after the OrigBB. - MachineBasicBlock *NewBB = - MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; - MF->insert(MBBI, NewBB); - - // Splice the instructions starting with MI over to NewBB. - NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); - - // Add an unconditional branch from OrigBB to NewBB. - // Note the new unconditional branch is not being recorded. - // There doesn't seem to be meaningful DebugInfo available; this doesn't - // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::Bimm)).addMBB(NewBB); - ++NumSplit; - - // Update the CFG. All succs of OrigBB are now succs of NewBB. - NewBB->transferSuccessors(OrigBB); - - // OrigBB branches to NewBB. - OrigBB->addSuccessor(NewBB); - - // Update internal data structures to account for the newly inserted MBB. - MF->RenumberBlocks(NewBB); - - // Insert an entry into BBInfo to align it properly with the (newly - // renumbered) block numbers. - BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); - - // Figure out how large the OrigBB is. As the first half of the original - // block, it cannot contain a tablejump. The size includes - // the new jump we added. (It should be possible to do this without - // recounting everything, but it's very confusing, and this is rarely - // executed.) - computeBlockSize(OrigBB); - - // Figure out how large the NewMBB is. As the second half of the original - // block, it may contain a tablejump. - computeBlockSize(NewBB); - - // All BBOffsets following these blocks must be modified. - adjustBBOffsetsAfter(OrigBB); - - return NewBB; -} - -void AArch64BranchFixup::adjustBBOffsetsAfter(MachineBasicBlock *BB) { - unsigned BBNum = BB->getNumber(); - for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) { - // Get the offset and known bits at the end of the layout predecessor. 
- // Include the alignment of the current block. - unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment(); - unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); - unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); - - // This is where block i begins. Stop if the offset is already correct, - // and we have updated 2 blocks. This is the maximum number of blocks - // changed before calling this function. - if (i > BBNum + 2 && - BBInfo[i].Offset == Offset && - BBInfo[i].KnownBits == KnownBits) - break; - - BBInfo[i].Offset = Offset; - BBInfo[i].KnownBits = KnownBits; - } -} - -/// Returns true if the distance between specific MI and specific BB can fit in -/// MI's displacement field. -bool AArch64BranchFixup::isBBInRange(MachineInstr *MI, - MachineBasicBlock *DestBB, - unsigned OffsetBits) { - int64_t BrOffset = getOffsetOf(MI); - int64_t DestOffset = BBInfo[DestBB->getNumber()].Offset; - - DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() - << " from BB#" << MI->getParent()->getNumber() - << " bits available=" << OffsetBits - << " from " << getOffsetOf(MI) << " to " << DestOffset - << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); - - return isIntN(OffsetBits, DestOffset - BrOffset); -} - -/// Fix up an immediate branch whose destination is too far away to fit in its -/// displacement field. -bool AArch64BranchFixup::fixupImmediateBr(ImmBranch &Br) { - MachineInstr *MI = Br.MI; - MachineBasicBlock *DestBB = nullptr; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isMBB()) { - DestBB = MI->getOperand(i).getMBB(); - break; - } - } - assert(DestBB && "Branch with no destination BB?"); - - // Check to see if the DestBB is already in-range. - if (isBBInRange(MI, DestBB, Br.OffsetBits)) - return false; - - assert(Br.IsCond && "Only conditional branches should need fixup"); - return fixupConditionalBr(Br); -} - -/// Fix up a conditional branch whose destination is too far away to fit in its -/// displacement field. It is converted to an inverse conditional branch + an -/// unconditional branch to the destination. -bool -AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) { - MachineInstr *MI = Br.MI; - MachineBasicBlock *MBB = MI->getParent(); - unsigned CondBrMBBOperand = 0; - - // The general idea is to add an unconditional branch to the destination and - // invert the conditional branch to jump over it. Complications occur around - // fallthrough and unreachable ends to the block. - // b.lt L1 - // => - // b.ge L2 - // b L1 - // L2: - - // First we invert the conditional branch, by creating a replacement if - // necessary. This if statement contains all the special handling of different - // branch types. 
- if (MI->getOpcode() == AArch64::Bcc) { - // The basic block is operand number 1 for Bcc - CondBrMBBOperand = 1; - - A64CC::CondCodes CC = (A64CC::CondCodes)MI->getOperand(0).getImm(); - CC = A64InvertCondCode(CC); - MI->getOperand(0).setImm(CC); - } else { - MachineInstrBuilder InvertedMI; - int InvertedOpcode; - switch (MI->getOpcode()) { - default: llvm_unreachable("Unknown branch type"); - case AArch64::TBZxii: InvertedOpcode = AArch64::TBNZxii; break; - case AArch64::TBZwii: InvertedOpcode = AArch64::TBNZwii; break; - case AArch64::TBNZxii: InvertedOpcode = AArch64::TBZxii; break; - case AArch64::TBNZwii: InvertedOpcode = AArch64::TBZwii; break; - case AArch64::CBZx: InvertedOpcode = AArch64::CBNZx; break; - case AArch64::CBZw: InvertedOpcode = AArch64::CBNZw; break; - case AArch64::CBNZx: InvertedOpcode = AArch64::CBZx; break; - case AArch64::CBNZw: InvertedOpcode = AArch64::CBZw; break; - } - - InvertedMI = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(InvertedOpcode)); - for (unsigned i = 0, e= MI->getNumOperands(); i != e; ++i) { - InvertedMI.addOperand(MI->getOperand(i)); - if (MI->getOperand(i).isMBB()) - CondBrMBBOperand = i; - } - - MI->eraseFromParent(); - MI = Br.MI = InvertedMI; - } - - // If the branch is at the end of its MBB and that has a fall-through block, - // direct the updated conditional branch to the fall-through - // block. Otherwise, split the MBB before the next instruction. - MachineInstr *BMI = &MBB->back(); - bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - - ++NumCBrFixed; - if (BMI != MI) { - if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) && - BMI->getOpcode() == AArch64::Bimm) { - // Last MI in the BB is an unconditional branch. We can swap destinations: - // b.eq L1 (temporarily b.ne L1 after first change) - // b L2 - // => - // b.ne L2 - // b L1 - MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); - if (isBBInRange(MI, NewDest, Br.OffsetBits)) { - DEBUG(dbgs() << " Invert Bcc condition and swap its destination with " - << *BMI); - MachineBasicBlock *DestBB = MI->getOperand(CondBrMBBOperand).getMBB(); - BMI->getOperand(0).setMBB(DestBB); - MI->getOperand(CondBrMBBOperand).setMBB(NewDest); - return true; - } - } - } - - if (NeedSplit) { - MachineBasicBlock::iterator MBBI = MI; ++MBBI; - splitBlockBeforeInstr(MBBI); - // No need for the branch to the next block. We're adding an unconditional - // branch to the destination. - int delta = TII->getInstSizeInBytes(MBB->back()); - BBInfo[MBB->getNumber()].Size -= delta; - MBB->back().eraseFromParent(); - // BBInfo[SplitBB].Offset is wrong temporarily, fixed below - } - - // After splitting and removing the unconditional branch from the original BB, - // the structure is now: - // oldbb: - // [things] - // b.invertedCC L1 - // splitbb/fallthroughbb: - // [old b L2/real continuation] - // - // We now have to change the conditional branch to point to splitbb and add an - // unconditional branch after it to L1, giving the final structure: - // oldbb: - // [things] - // b.invertedCC splitbb - // b L1 - // splitbb/fallthroughbb: - // [old b L2/real continuation] - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); - - DEBUG(dbgs() << " Insert B to BB#" - << MI->getOperand(CondBrMBBOperand).getMBB()->getNumber() - << " also invert condition and change dest. to BB#" - << NextBB->getNumber() << "\n"); - - // Insert a new unconditional branch and fixup the destination of the - // conditional one. 
Also update the ImmBranch as well as adding a new entry - // for the new branch. - BuildMI(MBB, DebugLoc(), TII->get(AArch64::Bimm)) - .addMBB(MI->getOperand(CondBrMBBOperand).getMBB()); - MI->getOperand(CondBrMBBOperand).setMBB(NextBB); - - BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back()); - - // 26 bits written down in Bimm, specifying a multiple of 4. - unsigned OffsetBits = 26 + 2; - ImmBranches.push_back(ImmBranch(&MBB->back(), OffsetBits, false)); - - adjustBBOffsetsAfter(MBB); - return true; -} diff --git a/lib/Target/ARM64/ARM64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp similarity index 78% rename from lib/Target/ARM64/ARM64BranchRelaxation.cpp rename to lib/Target/AArch64/AArch64BranchRelaxation.cpp index 73be3504790e..484e7e8ffce4 100644 --- a/lib/Target/ARM64/ARM64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -1,4 +1,4 @@ -//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===// +//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===// // // The LLVM Compiler Infrastructure // @@ -9,9 +9,9 @@ // //===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -23,29 +23,29 @@ #include "llvm/Support/CommandLine.h" using namespace llvm; -#define DEBUG_TYPE "arm64-branch-relax" +#define DEBUG_TYPE "aarch64-branch-relax" static cl::opt -BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true), +BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true), cl::desc("Relax out of range conditional branches")); static cl::opt -TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14), +TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); static cl::opt -CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19), +CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); static cl::opt -BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19), +BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)")); STATISTIC(NumSplit, "Number of basic blocks split"); STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); namespace { -class ARM64BranchRelaxation : public MachineFunctionPass { +class AArch64BranchRelaxation : public MachineFunctionPass { /// BasicBlockInfo - Information about the offset and size of a single /// basic block. 
struct BasicBlockInfo { @@ -77,7 +77,7 @@ class ARM64BranchRelaxation : public MachineFunctionPass { SmallVector BlockInfo; MachineFunction *MF; - const ARM64InstrInfo *TII; + const AArch64InstrInfo *TII; bool relaxBranchInstructions(); void scanFunction(); @@ -92,19 +92,19 @@ class ARM64BranchRelaxation : public MachineFunctionPass { public: static char ID; - ARM64BranchRelaxation() : MachineFunctionPass(ID) {} + AArch64BranchRelaxation() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "ARM64 branch relaxation pass"; + return "AArch64 branch relaxation pass"; } }; -char ARM64BranchRelaxation::ID = 0; +char AArch64BranchRelaxation::ID = 0; } /// verify - check BBOffsets, BBSizes, alignment of islands -void ARM64BranchRelaxation::verify() { +void AArch64BranchRelaxation::verify() { #ifndef NDEBUG unsigned PrevNum = MF->begin()->getNumber(); for (MachineBasicBlock &MBB : *MF) { @@ -118,7 +118,7 @@ void ARM64BranchRelaxation::verify() { } /// print block size and offset information - debugging -void ARM64BranchRelaxation::dumpBBs() { +void AArch64BranchRelaxation::dumpBBs() { for (auto &MBB : *MF) { const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) @@ -145,7 +145,7 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) { /// scanFunction - Do the initial scan of the function, building up /// information about each block. -void ARM64BranchRelaxation::scanFunction() { +void AArch64BranchRelaxation::scanFunction() { BlockInfo.clear(); BlockInfo.resize(MF->getNumBlockIDs()); @@ -162,7 +162,7 @@ void ARM64BranchRelaxation::scanFunction() { /// computeBlockSize - Compute the size for MBB. /// This function updates BlockInfo directly. -void ARM64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { +void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { unsigned Size = 0; for (const MachineInstr &MI : MBB) Size += TII->GetInstSizeInBytes(&MI); @@ -172,7 +172,7 @@ void ARM64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { /// getInstrOffset - Return the current offset of the specified machine /// instruction from the start of the function. This offset changes as stuff is /// moved around inside the function. -unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { +unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); // The offset is composed of two things: the sum of the sizes of all MBB's @@ -188,7 +188,7 @@ unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { return Offset; } -void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { +void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { unsigned PrevNum = Start.getNumber(); for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { unsigned Num = MBB.getNumber(); @@ -209,7 +209,7 @@ void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { /// and must be updated by the caller! Other transforms follow using this /// utility function, so no point updating now rather than waiting. MachineBasicBlock * -ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { +AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { MachineBasicBlock *OrigBB = MI->getParent(); // Create a new MBB for the code after the OrigBB. 
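For context on the isBlockInRange check in the following hunk: a conditional branch immediate addresses 4-byte instruction units, so its reach is a function of the encoded bit width, which is what the relaxation decision boils down to. A small self-contained summary (constants taken from the options above; not the pass's own API):

#include <cstdint>

// A branch with `bits` signed immediate bits encodes a displacement in
// 4-byte units, so the reachable byte range is
// [-(1 << (bits + 1)), (1 << (bits + 1)) - 4].
static bool fitsInBranch(int64_t srcOffset, int64_t dstOffset, unsigned bits) {
  int64_t delta = dstOffset - srcOffset;
  int64_t limit = int64_t(1) << (bits + 1);
  return delta >= -limit && delta <= limit - 4;
}

// With the defaults above: TB(N)Z has 14 bits (about +/-32 KiB), CB(N)Z and
// B.cc have 19 bits (about +/-1 MiB), while an unconditional B has 26 bits
// (about +/-128 MiB), which is why an out-of-range conditional branch is
// rewritten as an inverted condition plus an unconditional branch.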
@@ -226,7 +226,7 @@ ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { // Note the new unconditional branch is not being recorded. // There doesn't seem to be meaningful DebugInfo available; this doesn't // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB); + BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB); // Insert an entry into BlockInfo to align it properly with the block numbers. BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); @@ -252,9 +252,9 @@ ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { /// isBlockInRange - Returns true if the distance between specific MI and /// specific BB can fit in MI's displacement field. -bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI, - MachineBasicBlock *DestBB, - unsigned Bits) { +bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned Bits) { unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; unsigned BrOffset = getInstrOffset(MI); unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; @@ -275,15 +275,15 @@ static bool isConditionalBranch(unsigned Opc) { switch (Opc) { default: return false; - case ARM64::TBZW: - case ARM64::TBNZW: - case ARM64::TBZX: - case ARM64::TBNZX: - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: return true; } } @@ -291,17 +291,17 @@ static bool isConditionalBranch(unsigned Opc) { static MachineBasicBlock *getDestBlock(MachineInstr *MI) { switch (MI->getOpcode()) { default: - assert(0 && "unexpected opcode!"); - case ARM64::TBZW: - case ARM64::TBNZW: - case ARM64::TBZX: - case ARM64::TBNZX: + llvm_unreachable("unexpected opcode!"); + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: return MI->getOperand(2).getMBB(); - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: return MI->getOperand(1).getMBB(); } } @@ -309,49 +309,49 @@ static MachineBasicBlock *getDestBlock(MachineInstr *MI) { static unsigned getOppositeConditionOpcode(unsigned Opc) { switch (Opc) { default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZW: return ARM64::TBZW; - case ARM64::TBNZX: return ARM64::TBZX; - case ARM64::TBZW: return ARM64::TBNZW; - case ARM64::TBZX: return ARM64::TBNZX; - case ARM64::CBNZW: return ARM64::CBZW; - case ARM64::CBNZX: return ARM64::CBZX; - case ARM64::CBZW: return ARM64::CBNZW; - case ARM64::CBZX: return ARM64::CBNZX; - case ARM64::Bcc: return ARM64::Bcc; // Condition is an operand for Bcc. + llvm_unreachable("unexpected opcode!"); + case AArch64::TBNZW: return AArch64::TBZW; + case AArch64::TBNZX: return AArch64::TBZX; + case AArch64::TBZW: return AArch64::TBNZW; + case AArch64::TBZX: return AArch64::TBNZX; + case AArch64::CBNZW: return AArch64::CBZW; + case AArch64::CBNZX: return AArch64::CBZX; + case AArch64::CBZW: return AArch64::CBNZW; + case AArch64::CBZX: return AArch64::CBNZX; + case AArch64::Bcc: return AArch64::Bcc; // Condition is an operand for Bcc. 
} } static unsigned getBranchDisplacementBits(unsigned Opc) { switch (Opc) { default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZW: - case ARM64::TBZW: - case ARM64::TBNZX: - case ARM64::TBZX: + llvm_unreachable("unexpected opcode!"); + case AArch64::TBNZW: + case AArch64::TBZW: + case AArch64::TBNZX: + case AArch64::TBZX: return TBZDisplacementBits; - case ARM64::CBNZW: - case ARM64::CBZW: - case ARM64::CBNZX: - case ARM64::CBZX: + case AArch64::CBNZW: + case AArch64::CBZW: + case AArch64::CBNZX: + case AArch64::CBZX: return CBZDisplacementBits; - case ARM64::Bcc: + case AArch64::Bcc: return BCCDisplacementBits; } } static inline void invertBccCondition(MachineInstr *MI) { - assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!"); - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm(); - CC = ARM64CC::getInvertedCondCode(CC); + assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!"); + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm(); + CC = AArch64CC::getInvertedCondCode(CC); MI->getOperand(0).setImm((int64_t)CC); } /// fixupConditionalBranch - Fix up a conditional branch whose destination is /// too far away to fit in its displacement field. It is converted to an inverse /// conditional branch + an unconditional branch to the destination. -bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { +bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { MachineBasicBlock *DestBB = getDestBlock(MI); // Add an unconditional branch to the destination and invert the branch @@ -372,7 +372,7 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { if (BMI != MI) { if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->getLastNonDebugInstr()) && - BMI->getOpcode() == ARM64::B) { + BMI->getOpcode() == AArch64::B) { // Last MI in the BB is an unconditional branch. Can we simply invert the // condition and swap destinations: // beq L1 @@ -386,14 +386,15 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { DEBUG(dbgs() << " Invert condition and swap its destination with " << *BMI); BMI->getOperand(0).setMBB(DestBB); - unsigned OpNum = - (MI->getOpcode() == ARM64::TBZW || MI->getOpcode() == ARM64::TBNZW || - MI->getOpcode() == ARM64::TBZX || MI->getOpcode() == ARM64::TBNZX) - ? 2 - : 1; + unsigned OpNum = (MI->getOpcode() == AArch64::TBZW || + MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || + MI->getOpcode() == AArch64::TBNZX) + ? 
2 + : 1; MI->getOperand(OpNum).setMBB(NewDest); MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); - if (MI->getOpcode() == ARM64::Bcc) + if (MI->getOpcode() == AArch64::Bcc) invertBccCondition(MI); return true; } @@ -429,14 +430,14 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { MachineInstrBuilder MIB = BuildMI( MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) .addOperand(MI->getOperand(0)); - if (MI->getOpcode() == ARM64::TBZW || MI->getOpcode() == ARM64::TBNZW || - MI->getOpcode() == ARM64::TBZX || MI->getOpcode() == ARM64::TBNZX) + if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX) MIB.addOperand(MI->getOperand(1)); - if (MI->getOpcode() == ARM64::Bcc) + if (MI->getOpcode() == AArch64::Bcc) invertBccCondition(MIB); MIB.addMBB(NextBB); BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB); + BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); // Remove the old conditional branch. It may or may not still be in MBB. @@ -448,7 +449,7 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { return true; } -bool ARM64BranchRelaxation::relaxBranchInstructions() { +bool AArch64BranchRelaxation::relaxBranchInstructions() { bool Changed = false; // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. @@ -465,16 +466,16 @@ bool ARM64BranchRelaxation::relaxBranchInstructions() { return Changed; } -bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { +bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { MF = &mf; // If the pass is disabled, just bail early. if (!BranchRelaxation) return false; - DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n"); + DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n"); - TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo(); + TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo(); // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. @@ -502,8 +503,8 @@ bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { return MadeChange; } -/// createARM64BranchRelaxation - returns an instance of the constpool +/// createAArch64BranchRelaxation - returns an instance of the constpool /// island pass. -FunctionPass *llvm::createARM64BranchRelaxation() { - return new ARM64BranchRelaxation(); +FunctionPass *llvm::createAArch64BranchRelaxation() { + return new AArch64BranchRelaxation(); } diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td deleted file mode 100644 index 9fe6aae2e32f..000000000000 --- a/lib/Target/AArch64/AArch64CallingConv.td +++ /dev/null @@ -1,197 +0,0 @@ -//==-- AArch64CallingConv.td - Calling Conventions for ARM ----*- tblgen -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This describes the calling conventions for AArch64 architecture. 
-//===----------------------------------------------------------------------===// - - -// The AArch64 Procedure Call Standard is unfortunately specified at a slightly -// higher level of abstraction than LLVM's target interface presents. In -// particular, it refers (like other ABIs, in fact) directly to -// structs. However, generic LLVM code takes the liberty of lowering structure -// arguments to the component fields before we see them. -// -// As a result, the obvious direct map from LLVM IR to PCS concepts can't be -// implemented, so the goals of this calling convention are, in decreasing -// priority order: -// 1. Expose *some* way to express the concepts required to implement the -// generic PCS from a front-end. -// 2. Provide a sane ABI for pure LLVM. -// 3. Follow the generic PCS as closely as is naturally possible. -// -// The suggested front-end implementation of PCS features is: -// * Integer, float and vector arguments of all sizes which end up in -// registers are passed and returned via the natural LLVM type. -// * Structure arguments with size <= 16 bytes are passed and returned in -// registers as similar integer or composite types. For example: -// [1 x i64], [2 x i64] or [1 x i128] (if alignment 16 needed). -// * HFAs in registers follow rules similar to small structs: appropriate -// composite types. -// * Structure arguments with size > 16 bytes are passed via a pointer, -// handled completely by the front-end. -// * Structure return values > 16 bytes via an sret pointer argument. -// * Other stack-based arguments (not large structs) are passed using byval -// pointers. Padding arguments are added beforehand to guarantee a large -// struct doesn't later use integer registers. -// -// N.b. this means that it is the front-end's responsibility (if it cares about -// PCS compliance) to check whether enough registers are available for an -// argument when deciding how to pass it. - -class CCIfAlign: - CCIf<"ArgFlags.getOrigAlign() == " # Align, A>; - -def CC_A64_APCS : CallingConv<[ - // SRet is an LLVM-specific concept, so it takes precedence over general ABI - // concerns. However, this rule will be used by C/C++ frontends to implement - // structure return. - CCIfSRet>, - - // Put ByVal arguments directly on the stack. Minimum size and alignment of a - // slot is 64-bit. - CCIfByVal>, - - // Canonicalise the various types that live in different floating-point - // registers. This makes sense because the PCS does not distinguish Short - // Vectors and Floating-point types. - CCIfType<[v1i16, v2i8], CCBitConvertToType>, - CCIfType<[v1i32, v4i8, v2i16], CCBitConvertToType>, - CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType>, - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCBitConvertToType>, - - // PCS: "C.1: If the argument is a Half-, Single-, Double- or Quad- precision - // Floating-point or Short Vector Type and the NSRN is less than 8, then the - // argument is allocated to the least significant bits of register - // v[NSRN]. The NSRN is incremented by one. The argument has now been - // allocated." 
- CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, - CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, - CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // PCS: "C.2: If the argument is an HFA and there are sufficient unallocated - // SIMD and Floating-point registers (NSRN - number of elements < 8), then the - // argument is allocated to SIMD and Floating-point registers (with one - // register per element of the HFA). The NSRN is incremented by the number of - // registers used. The argument has now been allocated." - // - // N.b. As above, this rule is the responsibility of the front-end. - - // "C.3: If the argument is an HFA then the NSRN is set to 8 and the size of - // the argument is rounded up to the nearest multiple of 8 bytes." - // - // "C.4: If the argument is an HFA, a Quad-precision Floating-point or Short - // Vector Type then the NSAA is rounded up to the larger of 8 or the Natural - // Alignment of the Argument's type." - // - // It is expected that these will be satisfied by adding dummy arguments to - // the prototype. - - // PCS: "C.5: If the argument is a Half- or Single- precision Floating-point - // type then the size of the argument is set to 8 bytes. The effect is as if - // the argument had been copied to the least significant bits of a 64-bit - // register and the remaining bits filled with unspecified values." - CCIfType<[f16, f32], CCPromoteToType>, - - // PCS: "C.6: If the argument is an HFA, a Half-, Single-, Double- or Quad- - // precision Floating-point or Short Vector Type, then the argument is copied - // to memory at the adjusted NSAA. The NSAA is incremented by the size of the - // argument. The argument has now been allocated." - CCIfType<[f64], CCAssignToStack<8, 8>>, - CCIfType<[f128], CCAssignToStack<16, 16>>, - - // PCS: "C.7: If the argument is an Integral Type, the size of the argument is - // less than or equal to 8 bytes and the NGRN is less than 8, the argument is - // copied to the least significant bits of x[NGRN]. The NGRN is incremented by - // one. The argument has now been allocated." - - // First we implement C.8 and C.9 (128-bit types get even registers). i128 is - // represented as two i64s, the first one being split. If we delayed this - // operation C.8 would never be reached. - CCIfType<[i64], - CCIfSplit>>, - - // Note: the promotion also implements C.14. - CCIfType<[i8, i16, i32], CCPromoteToType>, - - // And now the real implementation of C.7 - CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, - - // PCS: "C.8: If the argument has an alignment of 16 then the NGRN is rounded - // up to the next even number." - // - // "C.9: If the argument is an Integral Type, the size of the argument is - // equal to 16 and the NGRN is less than 7, the argument is copied to x[NGRN] - // and x[NGRN+1], x[NGRN] shall contain the lower addressed double-word of the - // memory representation of the argument. The NGRN is incremented by two. The - // argument has now been allocated." - // - // Subtlety here: what if alignment is 16 but it is not an integral type? All - // floating-point types have been allocated already, which leaves composite - // types: this is why a front-end may need to produce i128 for a struct <= 16 - // bytes. 
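Before the composite-type rules below, the integer path just described (C.7 together with C.11 through C.15) can be modelled in a few lines; this is purely illustrative and not how the generated calling-convention tables are consumed:

#include <cstdint>
#include <vector>

struct ArgLoc {
  bool inReg;
  unsigned regIndex;     // x0..x7 when inReg is true
  uint64_t stackOffset;  // byte offset into the stacked-argument area otherwise
};

// Assign 8-byte integer arguments: first to x0..x7 (C.7), then to 8-byte
// aligned stack slots (C.12/C.15). Narrower integers are widened to 8 bytes
// beforehand (C.14), which is what the CCPromoteToType entries express.
static std::vector<ArgLoc> allocateIntArgs(unsigned numArgs) {
  std::vector<ArgLoc> locs;
  unsigned ngrn = 0;   // next general-purpose register number
  uint64_t nsaa = 0;   // next stacked argument address, relative to the base
  for (unsigned i = 0; i < numArgs; ++i) {
    if (ngrn < 8) {
      locs.push_back({true, ngrn++, 0});
    } else {
      locs.push_back({false, 0, nsaa});
      nsaa += 8;
    }
  }
  return locs;
}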
- - // PCS: "C.10 If the argument is a Composite Type and the size in double-words - // of the argument is not more than 8 minus NGRN, then the argument is copied - // into consecutive general-purpose registers, starting at x[NGRN]. The - // argument is passed as though it had been loaded into the registers from a - // double-word aligned address with an appropriate sequence of LDR - // instructions loading consecutive registers from memory (the contents of any - // unused parts of the registers are unspecified by this standard). The NGRN - // is incremented by the number of registers used. The argument has now been - // allocated." - // - // Another one that's the responsibility of the front-end (sigh). - - // PCS: "C.11: The NGRN is set to 8." - CCCustom<"CC_AArch64NoMoreRegs">, - - // PCS: "C.12: The NSAA is rounded up to the larger of 8 or the Natural - // Alignment of the argument's type." - // - // PCS: "C.13: If the argument is a composite type then the argument is copied - // to memory at the adjusted NSAA. The NSAA is by the size of the - // argument. The argument has now been allocated." - // - // Note that the effect of this corresponds to a memcpy rather than register - // stores so that the struct ends up correctly addressable at the adjusted - // NSAA. - - // PCS: "C.14: If the size of the argument is less than 8 bytes then the size - // of the argument is set to 8 bytes. The effect is as if the argument was - // copied to the least significant bits of a 64-bit register and the remaining - // bits filled with unspecified values." - // - // Integer types were widened above. Floating-point and composite types have - // already been allocated completely. Nothing to do. - - // PCS: "C.15: The argument is copied to memory at the adjusted NSAA. The NSAA - // is incremented by the size of the argument. The argument has now been - // allocated." - CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64], CCAssignToStack<8, 8>> - -]>; - -// According to the PCS, X19-X30 are callee-saved, however only the low 64-bits -// of vector registers (8-15) are callee-saved. The order here is is picked up -// by PrologEpilogInserter.cpp to allocate stack slots, starting from top of -// stack upon entry. This gives the customary layout of x30 at [sp-8], x29 at -// [sp-16], ... -def CSR_PCS : CalleeSavedRegs<(add (sequence "X%u", 30, 19), - (sequence "D%u", 15, 8))>; - - -// TLS descriptor calls are extremely restricted in their changes, to allow -// optimisations in the (hopefully) more common fast path where no real action -// is needed. They actually have to preserve all registers, except for the -// unavoidable X30 and the return register X0. -def TLSDesc : CalleeSavedRegs<(add (sequence "X%u", 29, 1), - (sequence "Q%u", 31, 0))>; diff --git a/lib/Target/ARM64/ARM64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td similarity index 92% rename from lib/Target/ARM64/ARM64CallingConvention.td rename to lib/Target/AArch64/AArch64CallingConvention.td index 0ef5601718d2..8e8bd3d0bcd8 100644 --- a/lib/Target/ARM64/ARM64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -1,4 +1,4 @@ -//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===// +//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This describes the calling conventions for ARM64 architecture. 
+// This describes the calling conventions for AArch64 architecture. // //===----------------------------------------------------------------------===// @@ -22,7 +22,7 @@ class CCIfBigEndian : // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// -def CC_ARM64_AAPCS : CallingConv<[ +def CC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, @@ -42,7 +42,7 @@ def CC_ARM64_AAPCS : CallingConv<[ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, + CCIfType<[i1, i8, i16], CCPromoteToType>, CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], [X0, X1, X2, X3, X4, X5, X6, X7]>>, // i128 is split to two i64s, we can't fit half to register X7. @@ -73,7 +73,7 @@ def CC_ARM64_AAPCS : CallingConv<[ CCAssignToStack<16, 16>> ]>; -def RetCC_ARM64_AAPCS : CallingConv<[ +def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, @@ -104,7 +104,7 @@ def RetCC_ARM64_AAPCS : CallingConv<[ // from the standard one at this level: // + i128s (i.e. split i64s) don't need even registers. // + Stack slots are sized as needed rather than being at least 64-bit. -def CC_ARM64_DarwinPCS : CallingConv<[ +def CC_AArch64_DarwinPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -117,7 +117,7 @@ def CC_ARM64_DarwinPCS : CallingConv<[ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, + CCIfType<[i1, i8, i16], CCPromoteToType>, CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], [X0, X1, X2, X3, X4, X5, X6, X7]>>, // i128 is split to two i64s, we can't fit half to register X7. @@ -140,14 +140,15 @@ def CC_ARM64_DarwinPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>, + CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, + CCIf<"ValVT == MVT::i16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], CCAssignToStack<8, 8>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> ]>; -def CC_ARM64_DarwinPCS_VarArg : CallingConv<[ +def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -166,19 +167,18 @@ def CC_ARM64_DarwinPCS_VarArg : CallingConv<[ // in register and the remaining arguments on stack. We allow 32bit stack slots, // so that WebKit can write partial values in the stack and define the other // 32bit quantity as undef. -def CC_ARM64_WebKit_JS : CallingConv<[ +def CC_AArch64_WebKit_JS : CallingConv<[ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>, + CCIfType<[i1, i8, i16], CCPromoteToType>, CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, // Pass the remaining arguments on the stack instead. 
- CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 8>> ]>; -def RetCC_ARM64_WebKit_JS : CallingConv<[ +def RetCC_AArch64_WebKit_JS : CallingConv<[ CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], [X0, X1, X2, X3, X4, X5, X6, X7]>>, CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], @@ -197,7 +197,7 @@ def RetCC_ARM64_WebKit_JS : CallingConv<[ // It would be better to model its preservation semantics properly (create a // vreg on entry, use it in RET & tail call generation; make that vreg def if we // end up saving LR as part of a call frame). Watch this space... -def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, D8, D9, D10, D11, D12, D13, D14, D15)>; @@ -210,24 +210,24 @@ def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // (For generic ARM 64-bit ABI code, clang will not generate constructors or // destructors with 'this' returns, so this RegMask will not be used in that // case) -def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>; +def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. x16 and x17 are used on the // fast path for calculation, but other registers except X0 (argument/return) // and LR (it is a call, after all) are preserved. -def CSR_ARM64_TLS_Darwin +def CSR_AArch64_TLS_Darwin : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), FP, (sequence "Q%u", 0, 31))>; // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. -def CSR_ARM64_TLS_ELF +def CSR_AArch64_TLS_ELF : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, (sequence "Q%u", 0, 31))>; -def CSR_ARM64_AllRegs +def CSR_AArch64_AllRegs : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, (sequence "X%u", 0, 28), FP, LR, SP, (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), diff --git a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp similarity index 81% rename from lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp rename to lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index dce1301b92e1..4d23dc59d7ac 100644 --- a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -1,4 +1,4 @@ -//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=// +//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -22,10 +22,10 @@ // pass looks through a function and performs such combinations. 
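The situation this pass targets is easy to see at the source level: two thread-local accesses in one function both need the module's TLS base, so the second _TLS_MODULE_BASE_ computation is redundant. The snippet below is illustrative; whether the local-dynamic model is actually chosen depends on compiler flags such as -fPIC and -ftls-model=local-dynamic.

// Both accesses below need the module's TLS base under the local-dynamic
// model; the cleanup pass lets the second access reuse the base address
// computed for the first instead of issuing another descriptor call.
static thread_local int Counter = 0;
static thread_local int HighWater = 0;

int bump(int n) {
  Counter += n;
  if (Counter > HighWater)
    HighWater = Counter;
  return HighWater;
}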
// //===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -39,7 +39,7 @@ struct LDTLSCleanup : public MachineFunctionPass { LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { - ARM64FunctionInfo *AFI = MF.getInfo(); + AArch64FunctionInfo *AFI = MF.getInfo(); if (AFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. return false; @@ -62,7 +62,7 @@ struct LDTLSCleanup : public MachineFunctionPass { for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { switch (I->getOpcode()) { - case ARM64::TLSDESC_BLR: + case AArch64::TLSDESC_BLR: // Make sure it's a local dynamic access. if (!I->getOperand(1).isSymbol() || strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) @@ -92,15 +92,15 @@ struct LDTLSCleanup : public MachineFunctionPass { MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, unsigned TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); + const AArch64TargetMachine *TM = + static_cast(&MF->getTarget()); + const AArch64InstrInfo *TII = TM->getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the // code sequence assumes the address will be. - MachineInstr *Copy = - BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg); + MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + AArch64::X0).addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. I->eraseFromParent(); @@ -112,19 +112,19 @@ struct LDTLSCleanup : public MachineFunctionPass { // inserting a copy instruction after I. Returns the new instruction. MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); + const AArch64TargetMachine *TM = + static_cast(&MF->getTarget()); + const AArch64InstrInfo *TII = TM->getInstrInfo(); // Create a virtual register for the TLS base address. MachineRegisterInfo &RegInfo = MF->getRegInfo(); - *TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); // Insert a copy from X0 to TLSBaseAddrReg for later. 
MachineInstr *Next = I->getNextNode(); MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(ARM64::X0); + *TLSBaseAddrReg).addReg(AArch64::X0); return Copy; } @@ -142,6 +142,6 @@ struct LDTLSCleanup : public MachineFunctionPass { } char LDTLSCleanup::ID = 0; -FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() { +FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } diff --git a/lib/Target/ARM64/ARM64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp similarity index 91% rename from lib/Target/ARM64/ARM64CollectLOH.cpp rename to lib/Target/AArch64/AArch64CollectLOH.cpp index 8b48f3ae9b2a..6b1f09678e9a 100644 --- a/lib/Target/ARM64/ARM64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -1,4 +1,4 @@ -//===-------------- ARM64CollectLOH.cpp - ARM64 collect LOH pass --*- C++ -*-=// +//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -85,8 +85,8 @@ // This LOH aims at getting rid of redundant ADRP instructions. // // The overall design for emitting the LOHs is: -// 1. ARM64CollectLOH (this pass) records the LOHs in the ARM64FunctionInfo. -// 2. ARM64AsmPrinter reads the LOHs from ARM64FunctionInfo and it: +// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo. +// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it: // 1. Associates them a label. // 2. Emits them in a MCStreamer (EmitLOHDirective). // - The MCMachOStreamer records them into the MCAssembler. @@ -98,10 +98,10 @@ // - Other ObjectWriters ignore them. //===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" @@ -122,16 +122,16 @@ #include "llvm/ADT/Statistic.h" using namespace llvm; -#define DEBUG_TYPE "arm64-collect-loh" +#define DEBUG_TYPE "aarch64-collect-loh" static cl::opt -PreCollectRegister("arm64-collect-loh-pre-collect-register", cl::Hidden, +PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden, cl::desc("Restrict analysis to registers invovled" " in LOHs"), cl::init(true)); static cl::opt -BasicBlockScopeOnly("arm64-collect-loh-bb-only", cl::Hidden, +BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden, cl::desc("Restrict analysis at basic block scope"), cl::init(true)); @@ -164,20 +164,20 @@ STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); namespace llvm { -void initializeARM64CollectLOHPass(PassRegistry &); +void initializeAArch64CollectLOHPass(PassRegistry &); } namespace { -struct ARM64CollectLOH : public MachineFunctionPass { +struct AArch64CollectLOH : public MachineFunctionPass { static char ID; - ARM64CollectLOH() : MachineFunctionPass(ID) { - initializeARM64CollectLOHPass(*PassRegistry::getPassRegistry()); + AArch64CollectLOH() : MachineFunctionPass(ID) { + initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "ARM64 Collect 
Linker Optimization Hint (LOH)"; + return "AArch64 Collect Linker Optimization Hint (LOH)"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -214,14 +214,14 @@ typedef DenseMap MapRegToId; typedef SmallVector MapIdToReg; } // end anonymous namespace. -char ARM64CollectLOH::ID = 0; +char AArch64CollectLOH::ID = 0; -INITIALIZE_PASS_BEGIN(ARM64CollectLOH, "arm64-collect-loh", - "ARM64 Collect Linker Optimization Hint (LOH)", false, +INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", + "AArch64 Collect Linker Optimization Hint (LOH)", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh", - "ARM64 Collect Linker Optimization Hint (LOH)", false, +INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", + "AArch64 Collect Linker Optimization Hint (LOH)", false, false) /// Given a couple (MBB, reg) get the corresponding set of instruction from @@ -295,7 +295,7 @@ static void initReachingDef(MachineFunction &MF, BitVector &BBKillSet = Kill[&MBB]; BBKillSet.resize(NbReg); for (const MachineInstr &MI : MBB) { - bool IsADRP = MI.getOpcode() == ARM64::ADRP; + bool IsADRP = MI.getOpcode() == AArch64::ADRP; // Process uses first. if (IsADRP || !ADRPMode) @@ -509,9 +509,9 @@ static bool canDefBePartOfLOH(const MachineInstr *Def) { switch (Opc) { default: return false; - case ARM64::ADRP: + case AArch64::ADRP: return true; - case ARM64::ADDXri: + case AArch64::ADDXri: // Check immediate to see if the immediate is an address. switch (Def->getOperand(2).getType()) { default: @@ -522,7 +522,7 @@ static bool canDefBePartOfLOH(const MachineInstr *Def) { case MachineOperand::MO_BlockAddress: return true; } - case ARM64::LDRXui: + case AArch64::LDRXui: // Check immediate to see if the immediate is an address. switch (Def->getOperand(2).getType()) { default: @@ -541,13 +541,13 @@ static bool isCandidateStore(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: // In case we have str xA, [xA, #imm], this is two different uses // of xA and we cannot fold, otherwise the xA stored may be wrong, // even if #imm == 0. @@ -582,7 +582,7 @@ static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, MapRegToId::const_iterator It; // if all the reaching defs are not adrp, this use will not be // simplifiable. - if ((ADRPMode && Def->getOpcode() != ARM64::ADRP) || + if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || (!ADRPMode && !canDefBePartOfLOH(Def)) || (!ADRPMode && isCandidateStore(MI) && // store are LOH candidate iff the end of the chain is used as @@ -615,7 +615,7 @@ static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, /// Based on the use to defs information (in ADRPMode), compute the /// opportunities of LOH ADRP-related. 
static void computeADRP(const InstrToInstrs &UseToDefs, - ARM64FunctionInfo &ARM64FI, + AArch64FunctionInfo &AArch64FI, const MachineDominatorTree *MDT) { DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); for (const auto &Entry : UseToDefs) { @@ -634,7 +634,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs, SmallVector Args; Args.push_back(L2); Args.push_back(L1); - ARM64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); ++NumADRPSimpleCandidate; } #ifdef DEBUG @@ -656,19 +656,19 @@ static bool isCandidateLoad(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; - case ARM64::LDRSBWui: - case ARM64::LDRSBXui: - case ARM64::LDRSHWui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (Instr->getOperand(2).getTargetFlags() & ARM64II::MO_GOT) + case AArch64::LDRSBWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHWui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) return false; return true; } @@ -681,12 +681,12 @@ static bool supportLoadFromLiteral(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; - case ARM64::LDRSWui: - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: + case AArch64::LDRSWui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: return true; } // Unreachable. @@ -705,7 +705,7 @@ static bool isCandidate(const MachineInstr *Instr, return false; const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); - if (Def->getOpcode() != ARM64::ADRP) { + if (Def->getOpcode() != AArch64::ADRP) { // At this point, Def is ADDXri or LDRXui of the right type of // symbol, because we filtered out the uses that were not defined // by these kind of instructions (+ ADRP). @@ -728,7 +728,7 @@ static bool isCandidate(const MachineInstr *Instr, // - top is ADRP. // - check the simple chain property: each intermediate node must // dominates the next one. - if (Def->getOpcode() == ARM64::ADRP) + if (Def->getOpcode() == AArch64::ADRP) return MDT->dominates(Def, Instr); return false; } @@ -736,22 +736,22 @@ static bool isCandidate(const MachineInstr *Instr, static bool registerADRCandidate(const MachineInstr &Use, const InstrToInstrs &UseToDefs, const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, + AArch64FunctionInfo &AArch64FI, SetOfMachineInstr *InvolvedInLOHs, const MapRegToId &RegToId) { // Look for opportunities to turn ADRP -> ADD or // ADRP -> LDR GOTPAGEOFF into ADR. // If ADRP has more than one use. Give up. - if (Use.getOpcode() != ARM64::ADDXri && - (Use.getOpcode() != ARM64::LDRXui || - !(Use.getOperand(2).getTargetFlags() & ARM64II::MO_GOT))) + if (Use.getOpcode() != AArch64::ADDXri && + (Use.getOpcode() != AArch64::LDRXui || + !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) return false; InstrToInstrs::const_iterator It = UseToDefs.find(&Use); // The map may contain garbage that we need to ignore. 
if (It == UseToDefs.end() || It->second.empty()) return false; const MachineInstr &Def = **It->second.begin(); - if (Def.getOpcode() != ARM64::ADRP) + if (Def.getOpcode() != AArch64::ADRP) return false; // Check the number of users of ADRP. const SetOfMachineInstr *Users = @@ -772,7 +772,7 @@ static bool registerADRCandidate(const MachineInstr &Use, Args.push_back(&Def); Args.push_back(&Use); - ARM64FI.addLOHDirective(Use.getOpcode() == ARM64::ADDXri ? MCLOH_AdrpAdd + AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, Args); return true; @@ -782,7 +782,7 @@ static bool registerADRCandidate(const MachineInstr &Use, /// opportunities of LOH non-ADRP-related static void computeOthers(const InstrToInstrs &UseToDefs, const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, const MapRegToId &RegToId, + AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, const MachineDominatorTree *MDT) { SetOfMachineInstr *InvolvedInLOHs = nullptr; #ifdef DEBUG @@ -839,7 +839,7 @@ static void computeOthers(const InstrToInstrs &UseToDefs, const MachineInstr *L1 = Def; const MachineInstr *L2 = nullptr; unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != ARM64::ADRP) { + if (Def->getOpcode() != AArch64::ADRP) { // Check the number of users of this node. const SetOfMachineInstr *Users = getUses(DefsPerColorToUses, @@ -899,10 +899,10 @@ static void computeOthers(const InstrToInstrs &UseToDefs, continue; } - bool IsL2Add = (ImmediateDefOpc == ARM64::ADDXri); + bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); // If the chain is three instructions long and ldr is the second element, // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != ARM64II::MO_GOT) + if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) continue; SmallVector Args; MCLOHType Kind; @@ -944,18 +944,18 @@ static void computeOthers(const InstrToInstrs &UseToDefs, #ifdef DEBUG // get the immediate of the load if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) + if (ImmediateDefOpc == AArch64::ADDXri) ++NumADDToLDR; else ++NumLDRToLDR; - else if (ImmediateDefOpc == ARM64::ADDXri) + else if (ImmediateDefOpc == AArch64::ADDXri) ++NumADDToLDRWithImm; else ++NumLDRToLDRWithImm; #endif // DEBUG } } else { - if (ImmediateDefOpc == ARM64::ADRP) + if (ImmediateDefOpc == AArch64::ADRP) continue; else { @@ -978,23 +978,23 @@ static void computeOthers(const InstrToInstrs &UseToDefs, #ifdef DEBUG // get the immediate of the store if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) + if (ImmediateDefOpc == AArch64::ADDXri) ++NumADDToSTR; else ++NumLDRToSTR; - else if (ImmediateDefOpc == ARM64::ADDXri) + else if (ImmediateDefOpc == AArch64::ADDXri) ++NumADDToSTRWithImm; else ++NumLDRToSTRWithImm; #endif // DEBUG } } - ARM64FI.addLOHDirective(Kind, Args); + AArch64FI.addLOHDirective(Kind, Args); } // Now, we grabbed all the big patterns, check ADR opportunities. 
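The page arithmetic behind these hint kinds can be modelled in a few lines of standalone C++ (an illustration only, not LLVM API; the addresses are invented): AdrpAdrp flags a second ADRP whose target lands on the same 4 KiB page as an earlier one, and AdrpAdd flags an ADRP+ADD pair whose target is close enough for the linker to collapse it into a single ADR (roughly +/-1 MiB of reach).

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Model of the two checks: same 4 KiB page (AdrpAdrp) and ADR reachability
// (AdrpAdd). Real LOHs are recorded per instruction pair, not per address.
constexpr uint64_t page_of(uint64_t addr) { return addr & ~uint64_t(0xFFF); }

int main() {
  uint64_t pc = 0x100000F00;                         // assumed ADRP address
  uint64_t sym_a = 0x100004010, sym_b = 0x100004F80; // assumed symbol addresses

  bool adrp_adrp = page_of(sym_a) == page_of(sym_b);              // 2nd ADRP redundant
  bool adrp_add  = std::llabs((int64_t)(sym_a - pc)) < (1 << 20); // within ADR range

  std::printf("AdrpAdrp candidate: %d, AdrpAdd candidate: %d\n",
              adrp_adrp, adrp_add);
  return 0;
}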
for (const MachineInstr *Candidate : PotentialADROpportunities) - registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, ARM64FI, + registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, InvolvedInLOHs, RegToId); } @@ -1041,15 +1041,15 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, } } -bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { +bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { const TargetMachine &TM = MF.getTarget(); const TargetRegisterInfo *TRI = TM.getRegisterInfo(); const MachineDominatorTree *MDT = &getAnalysis(); MapRegToId RegToId; MapIdToReg IdToReg; - ARM64FunctionInfo *ARM64FI = MF.getInfo(); - assert(ARM64FI && "No MachineFunctionInfo for this function!"); + AArch64FunctionInfo *AArch64FI = MF.getInfo(); + assert(AArch64FI && "No MachineFunctionInfo for this function!"); DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); @@ -1059,11 +1059,11 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { MachineInstr *DummyOp = nullptr; if (BasicBlockScopeOnly) { - const ARM64InstrInfo *TII = - static_cast(TM.getInstrInfo()); + const AArch64InstrInfo *TII = + static_cast(TM.getInstrInfo()); // For local analysis, create a dummy operation to record uses that are not // local. - DummyOp = MF.CreateMachineInstr(TII->get(ARM64::COPY), DebugLoc()); + DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); } unsigned NbReg = RegToId.size(); @@ -1084,7 +1084,7 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *ARM64FI, MDT); + computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); delete[] ColorOpToReachedUses; // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. @@ -1100,7 +1100,7 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); // Compute other than AdrpAdrp LOH. - computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *ARM64FI, RegToId, + computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, MDT); delete[] ColorOpToReachedUses; @@ -1110,8 +1110,8 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { return Modified; } -/// createARM64CollectLOHPass - returns an instance of the Statistic for +/// createAArch64CollectLOHPass - returns an instance of the Statistic for /// linker optimization pass. 
-FunctionPass *llvm::createARM64CollectLOHPass() { - return new ARM64CollectLOH(); +FunctionPass *llvm::createAArch64CollectLOHPass() { + return new AArch64CollectLOH(); } diff --git a/lib/Target/ARM64/ARM64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp similarity index 85% rename from lib/Target/ARM64/ARM64ConditionalCompares.cpp rename to lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2243cce51a1f..452cdecf8a0c 100644 --- a/lib/Target/ARM64/ARM64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -1,4 +1,4 @@ -//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===// +//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements the ARM64ConditionalCompares pass which reduces +// This file implements the AArch64ConditionalCompares pass which reduces // branching and code size by using the conditional compare instructions CCMP, // CCMN, and FCMP. // @@ -17,7 +17,7 @@ // //===----------------------------------------------------------------------===// -#include "ARM64.h" +#include "AArch64.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" @@ -42,16 +42,16 @@ using namespace llvm; -#define DEBUG_TYPE "arm64-ccmp" +#define DEBUG_TYPE "aarch64-ccmp" // Absolute maximum number of instructions allowed per speculated block. // This bypasses all other heuristics, so it should be set fairly high. static cl::opt BlockInstrLimit( - "arm64-ccmp-limit", cl::init(30), cl::Hidden, + "aarch64-ccmp-limit", cl::init(30), cl::Hidden, cl::desc("Maximum number of instructions per speculated block.")); // Stress testing mode - disable heuristics. -static cl::opt Stress("arm64-stress-ccmp", cl::Hidden, +static cl::opt Stress("aarch64-stress-ccmp", cl::Hidden, cl::desc("Turn all knobs to 11")); STATISTIC(NumConsidered, "Number of ccmps considered"); @@ -98,7 +98,7 @@ STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); // // The cmp-conversion turns the compare instruction in CmpBB into a conditional // compare, and merges CmpBB into Head, speculatively executing its -// instructions. The ARM64 conditional compare instructions have an immediate +// instructions. The AArch64 conditional compare instructions have an immediate // operand that specifies the NZCV flag values when the condition is false and // the compare isn't executed. This makes it possible to chain compares with // different condition codes. @@ -162,13 +162,13 @@ class SSACCmpConv { SmallVector HeadCond; /// The condition code that makes Head branch to CmpBB. - ARM64CC::CondCode HeadCmpBBCC; + AArch64CC::CondCode HeadCmpBBCC; /// The branch condition in CmpBB. SmallVector CmpBBCond; /// The condition code that makes CmpBB branch to Tail. - ARM64CC::CondCode CmpBBTailCC; + AArch64CC::CondCode CmpBBTailCC; /// Check if the Tail PHIs are trivially convertible. bool trivialTailPHIs(); @@ -253,11 +253,11 @@ void SSACCmpConv::updateTailPHIs() { } } -// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are -// still writing virtual registers without any uses. +// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares +// are still writing virtual registers without any uses. bool SSACCmpConv::isDeadDef(unsigned DstReg) { // Writes to the zero register are dead. 
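A concrete source-level shape of the Head/CmpBB/Tail structure described above (a hypothetical example; the assembly in the comments is approximate):

#include <cstdio>

// The second comparison only executes when the first one succeeds, so it can
// be speculated and turned into a CCMP predicated on the first compare:
//   before:  cmp w0, #5 ; b.ne tail ; cmp x1, #17 ; b.ne tail
//   after:   cmp w0, #5 ; ccmp x1, #17, #0, eq ; cset w0, eq   (roughly)
__attribute__((noinline)) int both_match(int a, long b) {
  return a == 5 && b == 17;
}

int main() {
  std::printf("%d %d\n", both_match(5, 17), both_match(5, 3));
}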
- if (DstReg == ARM64::WZR || DstReg == ARM64::XZR) + if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) return true; if (!TargetRegisterInfo::isVirtualRegister(DstReg)) return false; @@ -269,11 +269,11 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) { // Parse a condition code returned by AnalyzeBranch, and compute the CondCode // corresponding to TBB. // Return -static bool parseCond(ArrayRef Cond, ARM64CC::CondCode &CC) { +static bool parseCond(ArrayRef Cond, AArch64CC::CondCode &CC) { // A normal br.cond simply has the condition code. if (Cond[0].getImm() != -1) { assert(Cond.size() == 1 && "Unknown Cond array format"); - CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); + CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); return true; } // For tbz and cbz instruction, the opcode is next. @@ -282,15 +282,15 @@ static bool parseCond(ArrayRef Cond, ARM64CC::CondCode &CC) { // This includes tbz / tbnz branches which can't be converted to // ccmp + br.cond. return false; - case ARM64::CBZW: - case ARM64::CBZX: + case AArch64::CBZW: + case AArch64::CBZX: assert(Cond.size() == 3 && "Unknown Cond array format"); - CC = ARM64CC::EQ; + CC = AArch64CC::EQ; return true; - case ARM64::CBNZW: - case ARM64::CBNZX: + case AArch64::CBNZW: + case AArch64::CBNZX: assert(Cond.size() == 3 && "Unknown Cond array format"); - CC = ARM64CC::NE; + CC = AArch64CC::NE; return true; } } @@ -300,12 +300,12 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { if (I == MBB->end()) return nullptr; // The terminator must be controlled by the flags. - if (!I->readsRegister(ARM64::NZCV)) { + if (!I->readsRegister(AArch64::NZCV)) { switch (I->getOpcode()) { - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: // These can be converted into a ccmp against #0. return I; } @@ -320,11 +320,11 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { assert(!I->isTerminator() && "Spurious terminator"); switch (I->getOpcode()) { // cmp is an alias for subs with a dead destination register. - case ARM64::SUBSWri: - case ARM64::SUBSXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. - case ARM64::ADDSWri: - case ARM64::ADDSXri: + case AArch64::ADDSWri: + case AArch64::ADDSXri: // Check that the immediate operand is within range, ccmp wants a uimm5. // Rd = SUBSri Rn, imm, shift if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) { @@ -333,25 +333,25 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { return nullptr; } // Fall through. - case ARM64::SUBSWrr: - case ARM64::SUBSXrr: - case ARM64::ADDSWrr: - case ARM64::ADDSXrr: + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: if (isDeadDef(I->getOperand(0).getReg())) return I; DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); ++NumLiveDstRejs; return nullptr; - case ARM64::FCMPSrr: - case ARM64::FCMPDrr: - case ARM64::FCMPESrr: - case ARM64::FCMPEDrr: + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: return I; } // Check for flag reads and clobbers. 
MIOperands::PhysRegInfo PRI = - MIOperands(I).analyzePhysReg(ARM64::NZCV, TRI); + MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); if (PRI.Reads) { // The ccmp doesn't produce exactly the same flags as the original @@ -422,7 +422,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, } // Only CmpMI is allowed to clobber the flags. - if (&I != CmpMI && I.modifiesRegister(ARM64::NZCV, TRI)) { + if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) { DEBUG(dbgs() << "Clobbers flags: " << I); return false; } @@ -519,7 +519,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { // Make sure the branch direction is right. if (TBB != CmpBB) { assert(TBB == Tail && "Unexpected TBB"); - HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC); + HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC); } CmpBBCond.clear(); @@ -543,10 +543,10 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { } if (TBB != Tail) - CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC); + CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC); - DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC) - << ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC) + DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC) + << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC) << '\n'); CmpMI = findConvertibleCompare(CmpBB); @@ -579,13 +579,13 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { ++NumCompBranches; unsigned Opc = 0; switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::SUBSWri; + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::SUBSWri; break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::SUBSXri; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::SUBSXri; break; default: llvm_unreachable("Cannot convert Head branch"); @@ -615,27 +615,27 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { switch (CmpMI->getOpcode()) { default: llvm_unreachable("Unknown compare opcode"); - case ARM64::SUBSWri: Opc = ARM64::CCMPWi; break; - case ARM64::SUBSWrr: Opc = ARM64::CCMPWr; break; - case ARM64::SUBSXri: Opc = ARM64::CCMPXi; break; - case ARM64::SUBSXrr: Opc = ARM64::CCMPXr; break; - case ARM64::ADDSWri: Opc = ARM64::CCMNWi; break; - case ARM64::ADDSWrr: Opc = ARM64::CCMNWr; break; - case ARM64::ADDSXri: Opc = ARM64::CCMNXi; break; - case ARM64::ADDSXrr: Opc = ARM64::CCMNXr; break; - case ARM64::FCMPSrr: Opc = ARM64::FCCMPSrr; FirstOp = 0; break; - case ARM64::FCMPDrr: Opc = ARM64::FCCMPDrr; FirstOp = 0; break; - case ARM64::FCMPESrr: Opc = ARM64::FCCMPESrr; FirstOp = 0; break; - case ARM64::FCMPEDrr: Opc = ARM64::FCCMPEDrr; FirstOp = 0; break; - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::CCMPWi; + case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break; + case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break; + case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break; + case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break; + case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break; + case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break; + case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break; + case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break; + case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break; + case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break; + case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break; + case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break; + case AArch64::CBZW: + 
case AArch64::CBNZW: + Opc = AArch64::CCMPWi; FirstOp = 0; isZBranch = true; break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::CCMPXi; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::CCMPXi; FirstOp = 0; isZBranch = true; break; @@ -646,7 +646,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { // The NZCV immediate operand should provide flags for the case where Head // would have branched to Tail. These flags should cause the new Head // terminator to branch to tail. - unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); const MCInstrDesc &MCID = TII->get(Opc); MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), TII->getRegClass(MCID, 0, TRI, *MF)); @@ -665,10 +665,10 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { // If CmpMI was a terminator, we need a new conditional branch to replace it. // This now becomes a Head terminator. if (isZBranch) { - bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW || - CmpMI->getOpcode() == ARM64::CBNZX; - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc)) - .addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ) + bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW || + CmpMI->getOpcode() == AArch64::CBNZX; + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ) .addOperand(CmpMI->getOperand(1)); // Branch target. } CmpMI->eraseFromParent(); @@ -687,10 +687,10 @@ int SSACCmpConv::expectedCodeSizeDelta() const { // plus a branch instruction. if (HeadCond[0].getImm() == -1) { switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: // Therefore delta += 1 delta = 1; break; @@ -706,21 +706,21 @@ int SSACCmpConv::expectedCodeSizeDelta() const { default: --delta; break; - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: break; } return delta; } //===----------------------------------------------------------------------===// -// ARM64ConditionalCompares Pass +// AArch64ConditionalCompares Pass //===----------------------------------------------------------------------===// namespace { -class ARM64ConditionalCompares : public MachineFunctionPass { +class AArch64ConditionalCompares : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const MCSchedModel *SchedModel; @@ -735,11 +735,11 @@ class ARM64ConditionalCompares : public MachineFunctionPass { public: static char ID; - ARM64ConditionalCompares() : MachineFunctionPass(ID) {} + AArch64ConditionalCompares() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "ARM64 Conditional Compares"; + return "AArch64 Conditional Compares"; } private: @@ -751,25 +751,25 @@ class ARM64ConditionalCompares : public MachineFunctionPass { }; } // end anonymous namespace -char ARM64ConditionalCompares::ID = 0; +char AArch64ConditionalCompares::ID = 0; namespace llvm { -void initializeARM64ConditionalComparesPass(PassRegistry &); +void initializeAArch64ConditionalComparesPass(PassRegistry &); } -INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass", - false, false) 
+INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass", - false, false) +INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) -FunctionPass *llvm::createARM64ConditionalCompares() { - return new ARM64ConditionalCompares(); +FunctionPass *llvm::createAArch64ConditionalCompares() { + return new AArch64ConditionalCompares(); } -void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { +void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -781,8 +781,8 @@ void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { } /// Update the dominator tree after if-conversion erased some blocks. -void -ARM64ConditionalCompares::updateDomTree(ArrayRef Removed) { +void AArch64ConditionalCompares::updateDomTree( + ArrayRef Removed) { // convert() removes CmpBB which was previously dominated by Head. // CmpBB children should be transferred to Head. MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); @@ -798,7 +798,7 @@ ARM64ConditionalCompares::updateDomTree(ArrayRef Removed) { /// Update LoopInfo after if-conversion. void -ARM64ConditionalCompares::updateLoops(ArrayRef Removed) { +AArch64ConditionalCompares::updateLoops(ArrayRef Removed) { if (!Loops) return; for (unsigned i = 0, e = Removed.size(); i != e; ++i) @@ -806,7 +806,7 @@ ARM64ConditionalCompares::updateLoops(ArrayRef Removed) { } /// Invalidate MachineTraceMetrics before if-conversion. -void ARM64ConditionalCompares::invalidateTraces() { +void AArch64ConditionalCompares::invalidateTraces() { Traces->invalidate(CmpConv.Head); Traces->invalidate(CmpConv.CmpBB); } @@ -814,7 +814,7 @@ void ARM64ConditionalCompares::invalidateTraces() { /// Apply cost model and heuristics to the if-conversion in IfConv. /// Return true if the conversion is a good idea. /// -bool ARM64ConditionalCompares::shouldConvert() { +bool AArch64ConditionalCompares::shouldConvert() { // Stress testing mode disables all cost considerations. 
if (Stress) return true; @@ -875,7 +875,7 @@ bool ARM64ConditionalCompares::shouldConvert() { return true; } -bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { +bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { bool Changed = false; while (CmpConv.canConvert(MBB) && shouldConvert()) { invalidateTraces(); @@ -888,8 +888,8 @@ bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { return Changed; } -bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { - DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n" +bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); TII = MF.getTarget().getInstrInfo(); TRI = MF.getTarget().getRegisterInfo(); diff --git a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp similarity index 79% rename from lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp rename to lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index e8f03ec833f7..a2d853c85fef 100644 --- a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -1,4 +1,4 @@ -//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===// +//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==// // // The LLVM Compiler Infrastructure // @@ -11,8 +11,8 @@ // hardware's register renamer. //===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64RegisterInfo.h" +#include "AArch64.h" +#include "AArch64RegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" @@ -21,12 +21,12 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define DEBUG_TYPE "arm64-dead-defs" +#define DEBUG_TYPE "aarch64-dead-defs" STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); namespace { -class ARM64DeadRegisterDefinitions : public MachineFunctionPass { +class AArch64DeadRegisterDefinitions : public MachineFunctionPass { private: const TargetRegisterInfo *TRI; bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); @@ -34,7 +34,7 @@ class ARM64DeadRegisterDefinitions : public MachineFunctionPass { bool usesFrameIndex(const MachineInstr &MI); public: static char ID; // Pass identification, replacement for typeid. 
- explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} virtual bool runOnMachineFunction(MachineFunction &F) override; @@ -45,10 +45,10 @@ class ARM64DeadRegisterDefinitions : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } }; -char ARM64DeadRegisterDefinitions::ID = 0; +char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace -bool ARM64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( +bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( unsigned Reg, const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) if (MO.isReg() && MO.isDef()) @@ -57,15 +57,15 @@ bool ARM64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( return false; } -bool ARM64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { +bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { for (const MachineOperand &Op : MI.uses()) if (Op.isFI()) return true; return false; } -bool -ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock &MBB) { +bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( + MachineBasicBlock &MBB) { bool Changed = false; for (MachineInstr &MI : MBB) { if (usesFrameIndex(MI)) { @@ -99,11 +99,11 @@ ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock &MBB) { default: DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); continue; - case ARM64::GPR32RegClassID: - NewReg = ARM64::WZR; + case AArch64::GPR32RegClassID: + NewReg = AArch64::WZR; break; - case ARM64::GPR64RegClassID: - NewReg = ARM64::XZR; + case AArch64::GPR64RegClassID: + NewReg = AArch64::XZR; break; } DEBUG(dbgs() << " Replacing with zero register. New:\n "); @@ -118,10 +118,10 @@ ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock &MBB) { // Scan the function for instructions that have a dead definition of a // register. Replace that register with the zero register when possible. 
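A minimal illustration of such a dead definition (invented example; the assembly comment is approximate):

#include <cstdio>

// The subtraction below is needed only for its flags: the compare-and-select
// keeps the operands, not the difference, so the backend can retarget the
// dead result to the zero register ("subs xzr, x0, x1", i.e. a plain cmp)
// instead of tying up a renamed physical register.
__attribute__((noinline)) long smaller(long a, long b) {
  return a < b ? a : b;
}

int main() { std::printf("%ld\n", smaller(3, 9)); }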
-bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { +bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getTarget().getRegisterInfo(); bool Changed = false; - DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n"); + DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); for (auto &MBB : MF) if (processMachineBasicBlock(MBB)) @@ -129,6 +129,6 @@ bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { return Changed; } -FunctionPass *llvm::createARM64DeadRegisterDefinitions() { - return new ARM64DeadRegisterDefinitions(); +FunctionPass *llvm::createAArch64DeadRegisterDefinitions() { + return new AArch64DeadRegisterDefinitions(); } diff --git a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp similarity index 76% rename from lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp rename to lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index a4b5d31314ef..a76fd76e5ed4 100644 --- a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1,4 +1,4 @@ -//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=// +//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -14,25 +14,25 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "ARM64InstrInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64InstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/MathExtras.h" using namespace llvm; namespace { -class ARM64ExpandPseudo : public MachineFunctionPass { +class AArch64ExpandPseudo : public MachineFunctionPass { public: static char ID; - ARM64ExpandPseudo() : MachineFunctionPass(ID) {} + AArch64ExpandPseudo() : MachineFunctionPass(ID) {} - const ARM64InstrInfo *TII; + const AArch64InstrInfo *TII; bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM64 pseudo instruction expansion pass"; + return "AArch64 pseudo instruction expansion pass"; } private: @@ -41,7 +41,7 @@ class ARM64ExpandPseudo : public MachineFunctionPass { bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); }; -char ARM64ExpandPseudo::ID = 0; +char AArch64ExpandPseudo::ID = 0; } /// \brief Transfer implicit operands on the pseudo instruction to the @@ -87,17 +87,17 @@ static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII, unsigned ChunkIdx) { + const AArch64InstrInfo *TII, unsigned ChunkIdx) { assert(ChunkIdx < 4 && "Out of range chunk index specified!"); const unsigned ShiftAmt = ChunkIdx * 16; uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { + if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { // Create the ORR-immediate instruction. MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) + .addReg(AArch64::XZR) .addImm(Encoding); // Create the MOVK instruction. 
@@ -105,11 +105,11 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, const unsigned DstReg = MI.getOperand(0).getReg(); const bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg) .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); transferImpOps(MI, MIB, MIB1); MI.eraseFromParent(); @@ -124,7 +124,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; - return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding); + return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); } /// \brief Check for identical 16-bit chunks within the constant and if so @@ -138,7 +138,7 @@ static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { + const AArch64InstrInfo *TII) { typedef DenseMap CountMap; CountMap Counts; @@ -162,9 +162,9 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, const bool CountThree = Count == 3; // Create the ORR-immediate instruction. MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) + .addReg(AArch64::XZR) .addImm(Encoding); const unsigned DstReg = MI.getOperand(0).getReg(); @@ -182,12 +182,12 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, // Create the first MOVK instruction. MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead && CountThree)) .addReg(DstReg) .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); // In case we have three instances the whole constant is now materialized // and we can exit. @@ -207,11 +207,11 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, // Create the second MOVK instruction. MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg) .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); transferImpOps(MI, MIB, MIB2); MI.eraseFromParent(); @@ -272,7 +272,7 @@ static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { + const AArch64InstrInfo *TII) { const int NotSet = -1; const uint64_t Mask = 0xFFFF; @@ -343,11 +343,11 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, // Create the ORR-immediate instruction. 
uint64_t Encoding = 0; - ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding); + AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) + .addReg(AArch64::XZR) .addImm(Encoding); const unsigned DstReg = MI.getOperand(0).getReg(); @@ -356,12 +356,13 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, const bool SingleMovk = SecondMovkIdx == NotSet; // Create the first MOVK instruction. MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) .addReg(DstReg) .addImm(getChunk(UImm, FirstMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16)); + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16)); // Early exit in case we only need to emit a single MOVK instruction. if (SingleMovk) { @@ -372,11 +373,12 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, // Create the second MOVK instruction. MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg) .addImm(getChunk(UImm, SecondMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16)); + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16)); transferImpOps(MI, MIB, MIB2); MI.eraseFromParent(); @@ -385,9 +387,9 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more /// real move-immediate instructions to synthesize the immediate. -bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned BitSize) { +bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned BitSize) { MachineInstr &MI = *MBBI; uint64_t Imm = MI.getOperand(1).getImm(); const unsigned Mask = 0xFFFF; @@ -395,12 +397,12 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, // Try a MOVI instruction (aka ORR-immediate with the zero register). uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { - unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri); + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) .addOperand(MI.getOperand(0)) - .addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) .addImm(Encoding); transferImpOps(MI, MIB, MIB); MI.eraseFromParent(); @@ -504,9 +506,9 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, unsigned FirstOpc; if (BitSize == 32) { Imm &= (1LL << 32) - 1; - FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi); + FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); } else { - FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi); + FirstOpc = (isNeg ? 
AArch64::MOVNXi : AArch64::MOVZXi); } unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN unsigned LastShift = 0; // LSL amount for last MOVK @@ -524,7 +526,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead && Shift == LastShift)) .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); // If a MOVN was used for the high bits of a negative value, flip the rest // of the bits back for use with MOVK. @@ -538,7 +540,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, } MachineInstrBuilder MIB2; - unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi); + unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); while (Shift != LastShift) { Shift -= 16; Imm16 = (Imm >> Shift) & Mask; @@ -550,7 +552,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, getDeadRegState(DstIsDead && Shift == LastShift)) .addReg(DstReg) .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); } transferImpOps(MI, MIB1, MIB2); @@ -560,7 +562,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, /// \brief If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. -bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, +bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); @@ -568,75 +570,76 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; - case ARM64::ADDWrr: - case ARM64::SUBWrr: - case ARM64::ADDXrr: - case ARM64::SUBXrr: - case ARM64::ADDSWrr: - case ARM64::SUBSWrr: - case ARM64::ADDSXrr: - case ARM64::SUBSXrr: - case ARM64::ANDWrr: - case ARM64::ANDXrr: - case ARM64::BICWrr: - case ARM64::BICXrr: - case ARM64::ANDSWrr: - case ARM64::ANDSXrr: - case ARM64::BICSWrr: - case ARM64::BICSXrr: - case ARM64::EONWrr: - case ARM64::EONXrr: - case ARM64::EORWrr: - case ARM64::EORXrr: - case ARM64::ORNWrr: - case ARM64::ORNXrr: - case ARM64::ORRWrr: - case ARM64::ORRXrr: { + case AArch64::ADDWrr: + case AArch64::SUBWrr: + case AArch64::ADDXrr: + case AArch64::SUBXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSXrr: + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::ANDSWrr: + case AArch64::ANDSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: { unsigned Opcode; switch (MI.getOpcode()) { default: return false; - case ARM64::ADDWrr: Opcode = ARM64::ADDWrs; break; - case ARM64::SUBWrr: Opcode = ARM64::SUBWrs; break; - case ARM64::ADDXrr: Opcode = ARM64::ADDXrs; break; - case ARM64::SUBXrr: Opcode = ARM64::SUBXrs; break; - case ARM64::ADDSWrr: Opcode = ARM64::ADDSWrs; break; - case ARM64::SUBSWrr: Opcode = ARM64::SUBSWrs; break; - case ARM64::ADDSXrr: Opcode = ARM64::ADDSXrs; break; - case ARM64::SUBSXrr: Opcode = ARM64::SUBSXrs; break; - case ARM64::ANDWrr: Opcode = ARM64::ANDWrs; break; - case ARM64::ANDXrr: Opcode = ARM64::ANDXrs; break; - case ARM64::BICWrr: Opcode = ARM64::BICWrs; break; - case ARM64::BICXrr: Opcode = 
ARM64::BICXrs; break; - case ARM64::ANDSWrr: Opcode = ARM64::ANDSWrs; break; - case ARM64::ANDSXrr: Opcode = ARM64::ANDSXrs; break; - case ARM64::BICSWrr: Opcode = ARM64::BICSWrs; break; - case ARM64::BICSXrr: Opcode = ARM64::BICSXrs; break; - case ARM64::EONWrr: Opcode = ARM64::EONWrs; break; - case ARM64::EONXrr: Opcode = ARM64::EONXrs; break; - case ARM64::EORWrr: Opcode = ARM64::EORWrs; break; - case ARM64::EORXrr: Opcode = ARM64::EORXrs; break; - case ARM64::ORNWrr: Opcode = ARM64::ORNWrs; break; - case ARM64::ORNXrr: Opcode = ARM64::ORNXrs; break; - case ARM64::ORRWrr: Opcode = ARM64::ORRWrs; break; - case ARM64::ORRXrr: Opcode = ARM64::ORRXrs; break; + case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break; + case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break; + case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break; + case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break; + case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break; + case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break; + case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break; + case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break; + case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break; + case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break; + case AArch64::BICWrr: Opcode = AArch64::BICWrs; break; + case AArch64::BICXrr: Opcode = AArch64::BICXrs; break; + case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break; + case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break; + case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break; + case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break; + case AArch64::EONWrr: Opcode = AArch64::EONWrs; break; + case AArch64::EONXrr: Opcode = AArch64::EONXrs; break; + case AArch64::EORWrr: Opcode = AArch64::EORWrs; break; + case AArch64::EORXrr: Opcode = AArch64::EORXrs; break; + case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break; + case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break; + case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break; + case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break; } MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), MI.getOperand(0).getReg()) .addOperand(MI.getOperand(1)) .addOperand(MI.getOperand(2)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); transferImpOps(MI, MIB1, MIB1); MI.eraseFromParent(); return true; } - case ARM64::FCVTSHpseudo: { + case AArch64::FCVTSHpseudo: { MachineOperand Src = MI.getOperand(1); Src.setImplicit(); - unsigned SrcH = TII->getRegisterInfo().getSubReg(Src.getReg(), ARM64::hsub); - auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::FCVTSHr)) + unsigned SrcH = + TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub); + auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr)) .addOperand(MI.getOperand(0)) .addReg(SrcH, RegState::Undef) .addOperand(Src); @@ -644,33 +647,34 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } - case ARM64::LOADgot: { + case AArch64::LOADgot: { // Expand into ADRP + LDR. 
unsigned DstReg = MI.getOperand(0).getReg(); const MachineOperand &MO1 = MI.getOperand(1); unsigned Flags = MO1.getTargetFlags(); MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) .addOperand(MI.getOperand(0)) .addReg(DstReg); if (MO1.isGlobal()) { - MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE); + MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); MIB2.addGlobalAddress(MO1.getGlobal(), 0, - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); } else if (MO1.isSymbol()) { - MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE); + MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE); MIB2.addExternalSymbol(MO1.getSymbolName(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); } else { assert(MO1.isCPI() && "Only expect globals, externalsymbols, or constant pools"); MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGE); + Flags | AArch64II::MO_PAGE); MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); + Flags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); } transferImpOps(MI, MIB1, MIB2); @@ -678,20 +682,20 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; } - case ARM64::MOVaddr: - case ARM64::MOVaddrJT: - case ARM64::MOVaddrCP: - case ARM64::MOVaddrBA: - case ARM64::MOVaddrTLS: - case ARM64::MOVaddrEXT: { + case AArch64::MOVaddr: + case AArch64::MOVaddrJT: + case AArch64::MOVaddrCP: + case AArch64::MOVaddrBA: + case AArch64::MOVaddrTLS: + case AArch64::MOVaddrEXT: { // Expand into ADRP + ADD. unsigned DstReg = MI.getOperand(0).getReg(); MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) .addOperand(MI.getOperand(1)); MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) .addOperand(MI.getOperand(0)) .addReg(DstReg) .addOperand(MI.getOperand(2)) @@ -702,13 +706,13 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; } - case ARM64::MOVi32imm: + case AArch64::MOVi32imm: return expandMOVImm(MBB, MBBI, 32); - case ARM64::MOVi64imm: + case AArch64::MOVi64imm: return expandMOVImm(MBB, MBBI, 64); - case ARM64::RET_ReallyLR: - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET)) - .addReg(ARM64::LR); + case AArch64::RET_ReallyLR: + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET)) + .addReg(AArch64::LR); MI.eraseFromParent(); return true; } @@ -717,7 +721,7 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, /// \brief Iterate over the instructions in basic block MBB and expand any /// pseudo instructions. Return true if anything was modified. 
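The MOVZ/MOVK fallback implemented by expandMOVImm above can be sketched as standalone C++ (an illustration only, not the LLVM code; it covers the simple positive case and prints informal assembly, while the real pass also uses MOVN for mostly-ones values and ORR for logical-immediate patterns):

#include <cstdint>
#include <cstdio>

// Emit one MOVZ for the first non-zero 16-bit chunk of a 64-bit immediate,
// then MOVK for every other non-zero chunk; zero chunks need no instruction.
static void emit_mov_sequence(uint64_t imm) {
  bool first = true;
  for (unsigned shift = 0; shift < 64; shift += 16) {
    unsigned chunk = (imm >> shift) & 0xFFFF;
    if (chunk == 0)
      continue;
    std::printf("  %s x0, #0x%x, lsl #%u\n",
                first ? "movz" : "movk", chunk, shift);
    first = false;
  }
  if (first)
    std::printf("  movz x0, #0\n");   // the all-zero immediate still needs a mov
}

int main() { emit_mov_sequence(0x0001234500008000ULL); }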
-bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { +bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { bool Modified = false; MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); @@ -730,8 +734,8 @@ bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { return Modified; } -bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getTarget().getInstrInfo()); +bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getTarget().getInstrInfo()); bool Modified = false; for (auto &MBB : MF) @@ -740,6 +744,6 @@ bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { } /// \brief Returns an instance of the pseudo instruction expansion pass. -FunctionPass *llvm::createARM64ExpandPseudoPass() { - return new ARM64ExpandPseudo(); +FunctionPass *llvm::createAArch64ExpandPseudoPass() { + return new AArch64ExpandPseudo(); } diff --git a/lib/Target/ARM64/ARM64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp similarity index 77% rename from lib/Target/ARM64/ARM64FastISel.cpp rename to lib/Target/AArch64/AArch64FastISel.cpp index f4bf616559a8..cc2a70ee8a2e 100644 --- a/lib/Target/ARM64/ARM64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -1,4 +1,4 @@ -//===-- ARM6464FastISel.cpp - ARM64 FastISel implementation ---------------===// +//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +7,16 @@ // //===----------------------------------------------------------------------===// // -// This file defines the ARM64-specific support for the FastISel class. Some +// This file defines the AArch64-specific support for the FastISel class. Some // of the target-specific code is generated by tablegen in the file -// ARM64GenFastISel.inc, which is #included here. +// AArch64GenFastISel.inc, which is #included here. // //===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "MCTargetDesc/ARM64AddressingModes.h" +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -40,7 +39,7 @@ using namespace llvm; namespace { -class ARM64FastISel : public FastISel { +class AArch64FastISel : public FastISel { class Address { public: @@ -85,9 +84,9 @@ class ARM64FastISel : public FastISel { bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } }; - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; + const AArch64Subtarget *Subtarget; LLVMContext *Context; private: @@ -130,8 +129,8 @@ class ARM64FastISel : public FastISel { unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); - unsigned ARM64MaterializeFP(const ConstantFP *CFP, MVT VT); - unsigned ARM64MaterializeGV(const GlobalValue *GV); + unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned AArch64MaterializeGV(const GlobalValue *GV); // Call handling routines. 
private: @@ -150,29 +149,29 @@ class ARM64FastISel : public FastISel { unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; unsigned TargetMaterializeConstant(const Constant *C) override; - explicit ARM64FastISel(FunctionLoweringInfo &funcInfo, + explicit AArch64FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo) { - Subtarget = &TM.getSubtarget(); + Subtarget = &TM.getSubtarget(); Context = &funcInfo.Fn->getContext(); } bool TargetSelectInstruction(const Instruction *I) override; -#include "ARM64GenFastISel.inc" +#include "AArch64GenFastISel.inc" }; } // end anonymous namespace -#include "ARM64GenCallingConv.inc" +#include "AArch64GenCallingConv.inc" -CCAssignFn *ARM64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { +CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { if (CC == CallingConv::WebKit_JS) - return CC_ARM64_WebKit_JS; - return Subtarget->isTargetDarwin() ? CC_ARM64_DarwinPCS : CC_ARM64_AAPCS; + return CC_AArch64_WebKit_JS; + return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; } -unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { +unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && "Alloca should always return a pointer."); @@ -184,8 +183,8 @@ unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addFrameIndex(SI->second) .addImm(0) @@ -196,7 +195,7 @@ unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { return 0; } -unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) { +unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { if (VT != MVT::f32 && VT != MVT::f64) return 0; @@ -209,11 +208,11 @@ unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) { int Imm; unsigned Opc; if (is64bit) { - Imm = ARM64_AM::getFP64Imm(Val); - Opc = ARM64::FMOVDi; + Imm = AArch64_AM::getFP64Imm(Val); + Opc = AArch64::FMOVDi; } else { - Imm = ARM64_AM::getFP32Imm(Val); - Opc = ARM64::FMOVSi; + Imm = AArch64_AM::getFP32Imm(Val); + Opc = AArch64::FMOVSi; } unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) @@ -228,28 +227,27 @@ unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) { Align = DL.getTypeAllocSize(CFP->getType()); unsigned Idx = MCP.getConstantPoolIndex(cast(CFP), Align); - unsigned ADRPReg = createResultReg(&ARM64::GPR64commonRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGE); + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE); - unsigned Opc = is64bit ? ARM64::LDRDui : ARM64::LDRSui; + unsigned Opc = is64bit ? 
AArch64::LDRDui : AArch64::LDRSui; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(ADRPReg) - .addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC); + .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); return ResultReg; } -unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) { - // We can't handle thread-local variables quickly yet. Unfortunately we have - // to peer through any aliases to find out if that rule applies. - const GlobalValue *TLSGV = GV; - if (const GlobalAlias *GA = dyn_cast(GV)) - TLSGV = GA->getAliasee(); +unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { + // We can't handle thread-local variables quickly yet. + if (GV->isThreadLocal()) + return 0; - if (const GlobalVariable *GVar = dyn_cast(TLSGV)) - if (GVar->isThreadLocal()) - return 0; + // MachO still uses GOT for large code-model accesses, but ELF requires + // movz/movk sequences, which FastISel doesn't handle yet. + if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO()) + return 0; unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); @@ -257,37 +255,37 @@ unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) { if (!DestEVT.isSimple()) return 0; - unsigned ADRPReg = createResultReg(&ARM64::GPR64commonRegClass); + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); unsigned ResultReg; - if (OpFlags & ARM64II::MO_GOT) { + if (OpFlags & AArch64II::MO_GOT) { // ADRP + LDRX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGE); + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); - ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::LDRXui), + ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), ResultReg) .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); } else { // ADRP + ADDX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addGlobalAddress(GV, 0, ARM64II::MO_PAGE); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE); - ResultReg = createResultReg(&ARM64::GPR64spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), + ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) .addImm(0); } return ResultReg; } -unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) { +unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) { EVT CEVT = TLI.getValueType(C->getType(), true); // Only handle simple types. @@ -297,15 +295,15 @@ unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) { // FIXME: Handle ConstantInt. 
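A side note on the AArch64MaterializeFP path above: FMOV (scalar, immediate) only encodes constants of roughly the form ±(n/16)·2^e with n in [16, 31] and e in [-3, 4]; anything it cannot encode falls back to the constant-pool ADRP + LDR route. A rough stand-alone check (an approximation, not the AArch64_AM::getFP64Imm implementation):

    #include <cmath>

    // Approximate predicate for the 8-bit FMOV immediate encoding.
    inline bool fitsFMOVImmediate(double V) {
      if (!std::isfinite(V) || V == 0.0)
        return false;
      double Mag = std::fabs(V);
      for (int Exp = -3; Exp <= 4; ++Exp)
        for (int N = 16; N <= 31; ++N)
          if (Mag == std::ldexp(static_cast<double>(N) / 16.0, Exp))
            return true;
      return false;
    }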
if (const ConstantFP *CFP = dyn_cast(C)) - return ARM64MaterializeFP(CFP, VT); + return AArch64MaterializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast(C)) - return ARM64MaterializeGV(GV); + return AArch64MaterializeGV(GV); return 0; } // Computes the address to get to an object. -bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { +bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast(Obj)) { @@ -413,7 +411,7 @@ bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { return Addr.isValid(); } -bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) { +bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(Ty, true); // Only handle simple types. @@ -430,7 +428,7 @@ bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) { return TLI.isTypeLegal(VT); } -bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { +bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { if (isTypeLegal(Ty, VT)) return true; @@ -442,8 +440,8 @@ bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { return false; } -bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled) { +bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, + int64_t ScaleFactor, bool UseUnscaled) { bool needsLowering = false; int64_t Offset = Addr.getOffset(); switch (VT.SimpleTy) { @@ -465,11 +463,18 @@ bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, break; } - // FIXME: If this is a stack pointer and the offset needs to be simplified - // then put the alloca address into a register, set the base type back to - // register and continue. This should almost never happen. + //If this is a stack pointer and the offset needs to be simplified then put + // the alloca address into a register, set the base type back to register and + // continue. This should almost never happen. if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { - return false; + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(Addr.getFI()) + .addImm(0) + .addImm(0); + Addr.setKind(Address::RegBase); + Addr.setReg(ResultReg); } // Since the offset is too large for the load/store instruction get the @@ -486,9 +491,9 @@ bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, return true; } -void ARM64FastISel::AddLoadStoreOperands(Address &Addr, - const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled) { +void AArch64FastISel::AddLoadStoreOperands(Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled) { int64_t Offset = Addr.getOffset(); // Frame base works a bit differently. Handle it separately. if (Addr.getKind() == Address::FrameIndexBase) { @@ -507,8 +512,8 @@ void ARM64FastISel::AddLoadStoreOperands(Address &Addr, } } -bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled) { +bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled) { // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. 
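The two offset forms mentioned in the comment above have different reach; a small sketch of the legality checks (illustrative helpers, not the in-tree ones):

    #include <cstdint>

    // LDR/STR (unsigned offset): 12-bit immediate scaled by the access size.
    inline bool fitsScaledUImm12(int64_t ByteOff, int64_t AccessBytes) {
      return ByteOff >= 0 && ByteOff % AccessBytes == 0 &&
             ByteOff / AccessBytes < 4096;
    }

    // LDUR/STUR (unscaled): 9-bit signed byte offset.
    inline bool fitsUnscaledSImm9(int64_t ByteOff) {
      return ByteOff >= -256 && ByteOff <= 255;
    }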
if (!UseUnscaled && Addr.getOffset() < 0) @@ -525,32 +530,32 @@ bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, VTIsi1 = true; // Intentional fall-through. case MVT::i8: - Opc = UseUnscaled ? ARM64::LDURBBi : ARM64::LDRBBui; - RC = &ARM64::GPR32RegClass; + Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui; + RC = &AArch64::GPR32RegClass; ScaleFactor = 1; break; case MVT::i16: - Opc = UseUnscaled ? ARM64::LDURHHi : ARM64::LDRHHui; - RC = &ARM64::GPR32RegClass; + Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui; + RC = &AArch64::GPR32RegClass; ScaleFactor = 2; break; case MVT::i32: - Opc = UseUnscaled ? ARM64::LDURWi : ARM64::LDRWui; - RC = &ARM64::GPR32RegClass; + Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui; + RC = &AArch64::GPR32RegClass; ScaleFactor = 4; break; case MVT::i64: - Opc = UseUnscaled ? ARM64::LDURXi : ARM64::LDRXui; - RC = &ARM64::GPR64RegClass; + Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui; + RC = &AArch64::GPR64RegClass; ScaleFactor = 8; break; case MVT::f32: - Opc = UseUnscaled ? ARM64::LDURSi : ARM64::LDRSui; + Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui; RC = TLI.getRegClassFor(VT); ScaleFactor = 4; break; case MVT::f64: - Opc = UseUnscaled ? ARM64::LDURDi : ARM64::LDRDui; + Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui; RC = TLI.getRegClassFor(VT); ScaleFactor = 8; break; @@ -577,18 +582,18 @@ bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, // Loading an i1 requires special handling. if (VTIsi1) { - MRI.constrainRegClass(ResultReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), + MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), ANDReg) .addReg(ResultReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); ResultReg = ANDReg; } return true; } -bool ARM64FastISel::SelectLoad(const Instruction *I) { +bool AArch64FastISel::SelectLoad(const Instruction *I) { MVT VT; // Verify we have a legal type before going any further. Currently, we handle // simple types that will directly fit in a register (i32/f32/i64/f64) or @@ -609,8 +614,8 @@ bool ARM64FastISel::SelectLoad(const Instruction *I) { return true; } -bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled) { +bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled) { // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. if (!UseUnscaled && Addr.getOffset() < 0) @@ -626,27 +631,27 @@ bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, case MVT::i1: VTIsi1 = true; case MVT::i8: - StrOpc = UseUnscaled ? ARM64::STURBBi : ARM64::STRBBui; + StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui; ScaleFactor = 1; break; case MVT::i16: - StrOpc = UseUnscaled ? ARM64::STURHHi : ARM64::STRHHui; + StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui; ScaleFactor = 2; break; case MVT::i32: - StrOpc = UseUnscaled ? ARM64::STURWi : ARM64::STRWui; + StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui; ScaleFactor = 4; break; case MVT::i64: - StrOpc = UseUnscaled ? ARM64::STURXi : ARM64::STRXui; + StrOpc = UseUnscaled ? 
AArch64::STURXi : AArch64::STRXui; ScaleFactor = 8; break; case MVT::f32: - StrOpc = UseUnscaled ? ARM64::STURSi : ARM64::STRSui; + StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui; ScaleFactor = 4; break; case MVT::f64: - StrOpc = UseUnscaled ? ARM64::STURDi : ARM64::STRDui; + StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui; ScaleFactor = 8; break; } @@ -666,12 +671,12 @@ bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, // Storing an i1 requires special handling. if (VTIsi1) { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), ANDReg) .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); SrcReg = ANDReg; } // Create the base instruction, then add the operands. @@ -681,7 +686,7 @@ bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, return true; } -bool ARM64FastISel::SelectStore(const Instruction *I) { +bool AArch64FastISel::SelectStore(const Instruction *I) { MVT VT; Value *Op0 = I->getOperand(0); // Verify we have a legal type before going any further. Currently, we handle @@ -706,53 +711,53 @@ bool ARM64FastISel::SelectStore(const Instruction *I) { return true; } -static ARM64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { +static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { switch (Pred) { case CmpInst::FCMP_ONE: case CmpInst::FCMP_UEQ: default: // AL is our "false" for now. The other two need more compares. - return ARM64CC::AL; + return AArch64CC::AL; case CmpInst::ICMP_EQ: case CmpInst::FCMP_OEQ: - return ARM64CC::EQ; + return AArch64CC::EQ; case CmpInst::ICMP_SGT: case CmpInst::FCMP_OGT: - return ARM64CC::GT; + return AArch64CC::GT; case CmpInst::ICMP_SGE: case CmpInst::FCMP_OGE: - return ARM64CC::GE; + return AArch64CC::GE; case CmpInst::ICMP_UGT: case CmpInst::FCMP_UGT: - return ARM64CC::HI; + return AArch64CC::HI; case CmpInst::FCMP_OLT: - return ARM64CC::MI; + return AArch64CC::MI; case CmpInst::ICMP_ULE: case CmpInst::FCMP_OLE: - return ARM64CC::LS; + return AArch64CC::LS; case CmpInst::FCMP_ORD: - return ARM64CC::VC; + return AArch64CC::VC; case CmpInst::FCMP_UNO: - return ARM64CC::VS; + return AArch64CC::VS; case CmpInst::FCMP_UGE: - return ARM64CC::PL; + return AArch64CC::PL; case CmpInst::ICMP_SLT: case CmpInst::FCMP_ULT: - return ARM64CC::LT; + return AArch64CC::LT; case CmpInst::ICMP_SLE: case CmpInst::FCMP_ULE: - return ARM64CC::LE; + return AArch64CC::LE; case CmpInst::FCMP_UNE: case CmpInst::ICMP_NE: - return ARM64CC::NE; + return AArch64CC::NE; case CmpInst::ICMP_UGE: - return ARM64CC::HS; + return AArch64CC::HS; case CmpInst::ICMP_ULT: - return ARM64CC::LO; + return AArch64CC::LO; } } -bool ARM64FastISel::SelectBranch(const Instruction *I) { +bool AArch64FastISel::SelectBranch(const Instruction *I) { const BranchInst *BI = cast(I); MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; @@ -760,8 +765,8 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) { if (const CmpInst *CI = dyn_cast(BI->getCondition())) { if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { // We may not handle every CC for now. 
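On the "may not handle every CC" remark: FCMP_ONE and FCMP_UEQ are the two predicates getCompareCC above maps to AL (its "unsupported" sentinel), because neither corresponds to a single NZCV condition. A scalar restatement (illustrative only) makes the two-test structure visible:

    // Each of these needs two ordered comparisons, hence two branches.
    inline bool fcmpONE(double A, double B) { return A < B || A > B; }
    inline bool fcmpUEQ(double A, double B) { return !(A < B) && !(A > B); }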
- ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) return false; // Emit the cmp. @@ -769,7 +774,7 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) { return false; // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); FuncInfo.MBB->addSuccessor(TBB); @@ -788,26 +793,27 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) { // Issue an extract_subreg to get the lower 32-bits. if (SrcVT == MVT::i64) CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true, - ARM64::sub_32); + AArch64::sub_32); - MRI.constrainRegClass(CondReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::ANDWri), ANDReg) .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBSWri)) .addReg(ANDReg) .addReg(ANDReg) .addImm(0) .addImm(0); - unsigned CC = ARM64CC::NE; + unsigned CC = AArch64CC::NE; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = ARM64CC::EQ; + CC = AArch64CC::EQ; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); FuncInfo.MBB->addSuccessor(TBB); @@ -818,7 +824,7 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) { dyn_cast(BI->getCondition())) { uint64_t Imm = CI->getZExtValue(); MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::B)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); FuncInfo.MBB->addSuccessor(Target); return true; @@ -835,19 +841,19 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) { // Regardless, the compare has been done in the predecessor block, // and it left a value for us in a virtual register. Ergo, we test // the one-bit value left in the virtual register. 
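A minimal model of the test-and-branch shape used in this function (illustrative, with invented names): the condition register is compared against zero by a SUBS that discards its result into WZR, and the branch sense is flipped to EQ when the true block happens to be the fall-through successor, so a single conditional branch suffices.

    struct I1BranchPlan {
      bool UseEQ;          // emit B.EQ instead of B.NE
      bool TargetIsFalse;  // the branch target became the false block
    };

    inline I1BranchPlan planI1Branch(bool TrueBlockFallsThrough) {
      return TrueBlockFallsThrough ? I1BranchPlan{true, true}
                                   : I1BranchPlan{false, false};
    }

    inline bool branchTaken(unsigned CondValue, bool UseEQ) {
      bool Zero = (CondValue == 0); // flags from: subs wzr, wCond, #0
      return UseEQ ? Zero : !Zero;  // b.eq vs. b.ne
    }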
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri), - ARM64::WZR) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri), + AArch64::WZR) .addReg(CondReg) .addImm(0) .addImm(0); - unsigned CC = ARM64CC::NE; + unsigned CC = AArch64CC::NE; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = ARM64CC::EQ; + CC = AArch64CC::EQ; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); FuncInfo.MBB->addSuccessor(TBB); @@ -855,14 +861,14 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) { return true; } -bool ARM64FastISel::SelectIndirectBr(const Instruction *I) { +bool AArch64FastISel::SelectIndirectBr(const Instruction *I) { const IndirectBrInst *BI = cast(I); unsigned AddrReg = getRegForValue(BI->getOperand(0)); if (AddrReg == 0) return false; // Emit the indirect branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BR)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR)) .addReg(AddrReg); // Make sure the CFG is up-to-date. @@ -872,7 +878,7 @@ bool ARM64FastISel::SelectIndirectBr(const Instruction *I) { return true; } -bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { +bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { Type *Ty = Src1Value->getType(); EVT SrcEVT = TLI.getValueType(Ty, true); if (!SrcEVT.isSimple()) @@ -916,26 +922,26 @@ bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { needsExt = true; // Intentional fall-through. case MVT::i32: - ZReg = ARM64::WZR; + ZReg = AArch64::WZR; if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSWri : ARM64::SUBSWri; + CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri; else - CmpOpc = ARM64::SUBSWrr; + CmpOpc = AArch64::SUBSWrr; break; case MVT::i64: - ZReg = ARM64::XZR; + ZReg = AArch64::XZR; if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSXri : ARM64::SUBSXri; + CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri; else - CmpOpc = ARM64::SUBSXrr; + CmpOpc = AArch64::SUBSXrr; break; case MVT::f32: isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPSri : ARM64::FCMPSrr; + CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr; break; case MVT::f64: isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPDri : ARM64::FCMPDrr; + CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr; break; } @@ -986,12 +992,12 @@ bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { return true; } -bool ARM64FastISel::SelectCmp(const Instruction *I) { +bool AArch64FastISel::SelectCmp(const Instruction *I) { const CmpInst *CI = cast(I); // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) return false; // Emit the cmp. @@ -999,19 +1005,19 @@ bool ARM64FastISel::SelectCmp(const Instruction *I) { return false; // Now set a register based on the comparison. 
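The CSINC sequence emitted just below is the standard "cset" idiom. As a sketch of its semantics, CSINC Wd, Wn, Wm, cond computes cond ? Wn : Wm + 1, so with both sources WZR and the inverted condition the result is 1 exactly when the original condition holds:

    #include <cstdint>

    inline uint32_t csinc(bool Cond, uint32_t Wn, uint32_t Wm) {
      return Cond ? Wn : Wm + 1;
    }

    // cset wD, cc  ==  csinc wD, wzr, wzr, inv(cc)
    inline uint32_t materializeBool(bool OriginalCondHolds) {
      bool InvertedCondHolds = !OriginalCondHolds;
      return csinc(InvertedCondHolds, /*Wn=*/0, /*Wm=*/0);
    }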
- ARM64CC::CondCode invertedCC = getInvertedCondCode(CC); - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::CSINCWr), + AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), ResultReg) - .addReg(ARM64::WZR) - .addReg(ARM64::WZR) + .addReg(AArch64::WZR) + .addReg(AArch64::WZR) .addImm(invertedCC); UpdateValueMap(I, ResultReg); return true; } -bool ARM64FastISel::SelectSelect(const Instruction *I) { +bool AArch64FastISel::SelectSelect(const Instruction *I) { const SelectInst *SI = cast(I); EVT DestEVT = TLI.getValueType(SI->getType(), true); @@ -1034,14 +1040,14 @@ bool ARM64FastISel::SelectSelect(const Instruction *I) { return false; - MRI.constrainRegClass(CondReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), ANDReg) .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri)) .addReg(ANDReg) .addReg(ANDReg) .addImm(0) @@ -1052,16 +1058,16 @@ bool ARM64FastISel::SelectSelect(const Instruction *I) { default: return false; case MVT::i32: - SelectOpc = ARM64::CSELWr; + SelectOpc = AArch64::CSELWr; break; case MVT::i64: - SelectOpc = ARM64::CSELXr; + SelectOpc = AArch64::CSELXr; break; case MVT::f32: - SelectOpc = ARM64::FCSELSrrr; + SelectOpc = AArch64::FCSELSrrr; break; case MVT::f64: - SelectOpc = ARM64::FCSELDrrr; + SelectOpc = AArch64::FCSELDrrr; break; } @@ -1070,13 +1076,13 @@ bool ARM64FastISel::SelectSelect(const Instruction *I) { ResultReg) .addReg(TrueReg) .addReg(FalseReg) - .addImm(ARM64CC::NE); + .addImm(AArch64CC::NE); UpdateValueMap(I, ResultReg); return true; } -bool ARM64FastISel::SelectFPExt(const Instruction *I) { +bool AArch64FastISel::SelectFPExt(const Instruction *I) { Value *V = I->getOperand(0); if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) return false; @@ -1085,14 +1091,14 @@ bool ARM64FastISel::SelectFPExt(const Instruction *I) { if (Op == 0) return false; - unsigned ResultReg = createResultReg(&ARM64::FPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTDSr), + unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), ResultReg).addReg(Op); UpdateValueMap(I, ResultReg); return true; } -bool ARM64FastISel::SelectFPTrunc(const Instruction *I) { +bool AArch64FastISel::SelectFPTrunc(const Instruction *I) { Value *V = I->getOperand(0); if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) return false; @@ -1101,15 +1107,15 @@ bool ARM64FastISel::SelectFPTrunc(const Instruction *I) { if (Op == 0) return false; - unsigned ResultReg = createResultReg(&ARM64::FPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTSDr), + unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, 
TII.get(AArch64::FCVTSDr), ResultReg).addReg(Op); UpdateValueMap(I, ResultReg); return true; } // FPToUI and FPToSI -bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { +bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; @@ -1125,24 +1131,24 @@ bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { unsigned Opc; if (SrcVT == MVT::f64) { if (Signed) - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWDr : ARM64::FCVTZSUXDr; + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr; else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWDr : ARM64::FCVTZUUXDr; + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr; } else { if (Signed) - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWSr : ARM64::FCVTZSUXSr; + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr; else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWSr : ARM64::FCVTZUUXSr; + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; } unsigned ResultReg = createResultReg( - DestVT == MVT::i32 ? &ARM64::GPR32RegClass : &ARM64::GPR64RegClass); + DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(SrcReg); UpdateValueMap(I, ResultReg); return true; } -bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { +bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; @@ -1163,20 +1169,20 @@ bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { return false; } - MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &ARM64::GPR64RegClass - : &ARM64::GPR32RegClass); + MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); unsigned Opc; if (SrcVT == MVT::i64) { if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUXSri : ARM64::SCVTFUXDri; + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri; else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUXSri : ARM64::UCVTFUXDri; + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri; } else { if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUWSri : ARM64::SCVTFUWDri; + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri; else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUWSri : ARM64::UCVTFUWDri; + Opc = (DestVT == MVT::f32) ? 
AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; } unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); @@ -1186,12 +1192,11 @@ bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { return true; } -bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, - CallingConv::ID CC, unsigned &NumBytes) { +bool AArch64FastISel::ProcessCallArgs( + SmallVectorImpl &Args, SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgVTs, SmallVectorImpl &ArgFlags, + SmallVectorImpl &RegArgs, CallingConv::ID CC, + unsigned &NumBytes) { SmallVector ArgLocs; CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); @@ -1220,7 +1225,6 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); if (Arg == 0) return false; - ArgVT = DestVT; break; } case CCValAssign::AExt: @@ -1231,7 +1235,6 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); if (Arg == 0) return false; - ArgVT = DestVT; break; } default: @@ -1250,7 +1253,7 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, assert(VA.isMemLoc() && "Assuming store on stack."); // Need to store on the stack. - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; + unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8; unsigned BEAlign = 0; if (ArgSize < 8 && !Subtarget->isLittleEndian()) @@ -1258,7 +1261,7 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, Address Addr; Addr.setKind(Address::RegBase); - Addr.setReg(ARM64::SP); + Addr.setReg(AArch64::SP); Addr.setOffset(VA.getLocMemOffset() + BEAlign); if (!EmitStore(ArgVT, Arg, Addr)) @@ -1268,9 +1271,9 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, return true; } -bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { +bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes) { // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) @@ -1302,8 +1305,8 @@ bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, return true; } -bool ARM64FastISel::SelectCall(const Instruction *I, - const char *IntrMemName = nullptr) { +bool AArch64FastISel::SelectCall(const Instruction *I, + const char *IntrMemName = nullptr) { const CallInst *CI = cast(I); const Value *Callee = CI->getCalledValue(); @@ -1396,7 +1399,7 @@ bool ARM64FastISel::SelectCall(const Instruction *I, // Issue the call. 
MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BL)); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL)); if (!IntrMemName) MIB.addGlobalAddress(GV, 0, 0); else @@ -1421,15 +1424,15 @@ bool ARM64FastISel::SelectCall(const Instruction *I, return true; } -bool ARM64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { +bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { if (Alignment) return Len / Alignment <= 4; else return Len < 32; } -bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment) { +bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, + uint64_t Len, unsigned Alignment) { // Make sure we don't bloat code by inlining very large memcpy's. if (!IsMemCpySmall(Len, Alignment)) return false; @@ -1464,10 +1467,12 @@ bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, bool RV; unsigned ResultReg; RV = EmitLoad(VT, ResultReg, Src); - assert(RV == true && "Should be able to handle this load."); + if (!RV) + return false; + RV = EmitStore(VT, ResultReg, Dest); - assert(RV == true && "Should be able to handle this store."); - (void)RV; + if (!RV) + return false; int64_t Size = VT.getSizeInBits() / 8; Len -= Size; @@ -1481,7 +1486,7 @@ bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, return true; } -bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { +bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: @@ -1539,7 +1544,7 @@ bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return SelectCall(&I, "memset"); } case Intrinsic::trap: { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BRK)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); return true; } @@ -1547,7 +1552,7 @@ bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return false; } -bool ARM64FastISel::SelectRet(const Instruction *I) { +bool AArch64FastISel::SelectRet(const Instruction *I) { const ReturnInst *Ret = cast(I); const Function &F = *I->getParent()->getParent(); @@ -1569,8 +1574,8 @@ bool ARM64FastISel::SelectRet(const Instruction *I) { SmallVector ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, I->getContext()); - CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; + CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; CCInfo.AnalyzeReturn(Outs, RetCC); // Only handle a single return value for now. @@ -1631,13 +1636,13 @@ bool ARM64FastISel::SelectRet(const Instruction *I) { } MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::RET_ReallyLR)); + TII.get(AArch64::RET_ReallyLR)); for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) MIB.addReg(RetRegs[i], RegState::Implicit); return true; } -bool ARM64FastISel::SelectTrunc(const Instruction *I) { +bool AArch64FastISel::SelectTrunc(const Instruction *I) { Type *DestTy = I->getType(); Value *Op = I->getOperand(0); Type *SrcTy = Op->getType(); @@ -1684,14 +1689,14 @@ bool ARM64FastISel::SelectTrunc(const Instruction *I) { } // Issue an extract_subreg to get the lower 32-bits. 
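The truncation handled below boils down to taking the W sub-register and masking to the destination width (presumably 0x1/0xff/0xffff for i1/i8/i16). An equivalent scalar sketch, not the in-tree code:

    #include <cstdint>

    inline uint32_t truncToWidth(uint64_t Src, unsigned DestBits) {
      uint32_t Low32 = static_cast<uint32_t>(Src);      // extract sub_32
      uint32_t Mask =
          DestBits >= 32 ? ~0u : ((1u << DestBits) - 1); // 0x1 / 0xff / 0xffff
      return Low32 & Mask;                               // and wN, wM, #Mask
    }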
unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, - ARM64::sub_32); - MRI.constrainRegClass(Reg32, &ARM64::GPR32RegClass); + AArch64::sub_32); + MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass); // Create the AND instruction which performs the actual truncation. - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), ANDReg) .addReg(Reg32) - .addImm(ARM64_AM::encodeLogicalImmediate(Mask, 32)); + .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32)); SrcReg = ANDReg; } @@ -1699,7 +1704,7 @@ bool ARM64FastISel::SelectTrunc(const Instruction *I) { return true; } -unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { +unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || DestVT == MVT::i64) && "Unexpected value type."); @@ -1708,22 +1713,22 @@ unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { DestVT = MVT::i32; if (isZExt) { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - unsigned ResultReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), ResultReg) .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); if (DestVT == MVT::i64) { // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. - unsigned Reg64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); + unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Reg64) + TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) .addReg(ResultReg) - .addImm(ARM64::sub_32); + .addImm(AArch64::sub_32); ResultReg = Reg64; } return ResultReg; @@ -1732,8 +1737,8 @@ unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { // FIXME: We're SExt i1 to i64. return 0; } - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SBFMWri), + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri), ResultReg) .addReg(SrcReg) .addImm(0) @@ -1742,8 +1747,8 @@ unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { } } -unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, - bool isZExt) { +unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool isZExt) { assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); unsigned Opc; unsigned Imm = 0; @@ -1755,21 +1760,21 @@ unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, return Emiti1Ext(SrcReg, DestVT, isZExt); case MVT::i8: if (DestVT == MVT::i64) - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; else - Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri; + Opc = isZExt ? 
AArch64::UBFMWri : AArch64::SBFMWri; Imm = 7; break; case MVT::i16: if (DestVT == MVT::i64) - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; else - Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri; + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; Imm = 15; break; case MVT::i32: assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; Imm = 31; break; } @@ -1778,12 +1783,12 @@ unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, if (DestVT == MVT::i8 || DestVT == MVT::i16) DestVT = MVT::i32; else if (DestVT == MVT::i64) { - unsigned Src64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); + unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Src64) + TII.get(AArch64::SUBREG_TO_REG), Src64) .addImm(0) .addReg(SrcReg) - .addImm(ARM64::sub_32); + .addImm(AArch64::sub_32); SrcReg = Src64; } @@ -1796,7 +1801,7 @@ unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, return ResultReg; } -bool ARM64FastISel::SelectIntExt(const Instruction *I) { +bool AArch64FastISel::SelectIntExt(const Instruction *I) { // On ARM, in general, integer casts don't involve legal types; this code // handles promotable integers. The high bits for a type smaller than // the register size are assumed to be undefined. @@ -1825,7 +1830,7 @@ bool ARM64FastISel::SelectIntExt(const Instruction *I) { return true; } -bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { +bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { EVT DestEVT = TLI.getValueType(I->getType(), true); if (!DestEVT.isSimple()) return false; @@ -1840,13 +1845,13 @@ bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { default: return false; case ISD::SREM: - DivOpc = is64bit ? ARM64::SDIVXr : ARM64::SDIVWr; + DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; break; case ISD::UREM: - DivOpc = is64bit ? ARM64::UDIVXr : ARM64::UDIVWr; + DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; break; } - unsigned MSubOpc = is64bit ? ARM64::MSUBXrrr : ARM64::MSUBWrrr; + unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr; unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; @@ -1870,7 +1875,7 @@ bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { return true; } -bool ARM64FastISel::SelectMul(const Instruction *I) { +bool AArch64FastISel::SelectMul(const Instruction *I) { EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); if (!SrcEVT.isSimple()) return false; @@ -1889,12 +1894,12 @@ bool ARM64FastISel::SelectMul(const Instruction *I) { case MVT::i8: case MVT::i16: case MVT::i32: - ZReg = ARM64::WZR; - Opc = ARM64::MADDWrrr; + ZReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; break; case MVT::i64: - ZReg = ARM64::XZR; - Opc = ARM64::MADDXrrr; + ZReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; break; } @@ -1916,7 +1921,7 @@ bool ARM64FastISel::SelectMul(const Instruction *I) { return true; } -bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) { +bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; @@ -1966,12 +1971,12 @@ bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) { } return false; // Silence warnings. 
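Background for the SelectRem lowering above: AArch64 has no integer remainder instruction, so srem/urem become a divide followed by a multiply-subtract (MSUB Rd, Rn, Rm, Ra computes Ra - Rn*Rm). A scalar equivalent:

    #include <cstdint>

    inline int64_t sremViaMsub(int64_t A, int64_t B) {
      int64_t Q = A / B; // sdiv q, a, b   (C++ division truncates like SDIV)
      return A - Q * B;  // msub r, q, b, a
    }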
- (void)&CC_ARM64_DarwinPCS_VarArg; + (void)&CC_AArch64_DarwinPCS_VarArg; } namespace llvm { -llvm::FastISel *ARM64::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) { - return new ARM64FastISel(funcInfo, libInfo); +llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new AArch64FastISel(funcInfo, libInfo); } } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 972e6f7617b3..9c337172dd06 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1,4 +1,4 @@ -//===- AArch64FrameLowering.cpp - AArch64 Frame Information ---------------===// +//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -11,227 +11,445 @@ // //===----------------------------------------------------------------------===// -#include "AArch64.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; -void AArch64FrameLowering::splitSPAdjustments(uint64_t Total, - uint64_t &Initial, - uint64_t &Residual) const { - // 0x1f0 here is a pessimistic (i.e. realistic) boundary: x-register LDP - // instructions have a 7-bit signed immediate scaled by 8, giving a reach of - // 0x1f8, but stack adjustment should always be a multiple of 16. - if (Total <= 0x1f0) { - Initial = Total; - Residual = 0; - } else { - Initial = 0x1f0; - Residual = Total - Initial; +#define DEBUG_TYPE "frame-info" + +static cl::opt EnableRedZone("aarch64-redzone", + cl::desc("enable use of redzone on AArch64"), + cl::init(false), cl::Hidden); + +STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); + +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) + Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset + Align - 1) / Align * Align; } + // This does not include the 16 bytes used for fp and lr. + return (unsigned)Offset; } -void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { - AArch64MachineFunctionInfo *FuncInfo = - MF.getInfo(); - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - DebugLoc DL = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc();
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+  if (!EnableRedZone)
+    return false;
+  // Don't use the red zone if the function explicitly asks us not to.
+  // This is typically used for kernel code.
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::NoRedZone))
+    return false;
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned NumBytes = AFI->getLocalStackSize();
+
+  // Note: currently hasFP() is always true for hasCalls(), but that's an
+  // implementation detail of the current code, not a strict requirement,
+  // so stay safe here and check both.
+  if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
+    return false;
+  return true;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  assert(!RegInfo->needsStackRealignment(MF) &&
+         "No stack realignment on AArch64!");
+#endif
+
+  return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function. This eliminates the need for
+/// add/sub sp brackets around call sites. Returns true if the call frame is
+/// included as part of the stack frame.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  int Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = RoundUpToAlignment(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 24-bits
+      // because there's no guaranteed temporary register available.
+      //
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
+      // 1) For offset <= 12-bit, we use LSL #0
+      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+      //    LSL #0, and the other uses LSL #12.
+      //
+      // Mostly call frames will be allocated at the start of a function so
+      // this is OK, but it is a limitation that needs dealing with.
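A sketch of the 24-bit limitation described in the comment above (an illustrative helper, not the emitFrameOffset implementation): ADD/SUB (immediate) accepts a 12-bit value, optionally shifted left by 12, so a larger in-place SP adjustment is split into two instructions.

    #include <cassert>
    #include <cstdint>

    struct SPAdjust {
      uint32_t LoImm; // add/sub sp, sp, #LoImm
      uint32_t HiImm; // add/sub sp, sp, #HiImm, lsl #12
    };

    inline SPAdjust splitSPImmediate(uint64_t Bytes) {
      assert(Bytes < (1ULL << 24) && "beyond 24 bits a scratch register is needed");
      return {static_cast<uint32_t>(Bytes & 0xfff),
              static_cast<uint32_t>(Bytes >> 12)};
    }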
+ assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + } + } else if (CalleePopAmount != 0) { + // If the calling convention demands that the callee pops arguments from the + // stack, we want to add it back if we have a reserved call frame. + assert(CalleePopAmount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, + TII); + } + MBB.erase(I); +} + +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - bool NeedsFrameMoves = MMI.hasDebugInfo() - || MF.getFunction()->needsUnwindTableEntry(); - - uint64_t NumInitialBytes, NumResidualBytes; - - // Currently we expect the stack to be laid out by - // sub sp, sp, #initial - // stp x29, x30, [sp, #offset] - // ... - // str xxx, [sp, #offset] - // sub sp, sp, #rest (possibly via extra instructions). - if (MFI->getCalleeSavedInfo().size()) { - // If there are callee-saved registers, we want to store them efficiently as - // a block, and virtual base assignment happens too early to do it for us so - // we adjust the stack in two phases: first just for callee-saved fiddling, - // then to allocate the rest of the frame. - splitSPAdjustments(MFI->getStackSize(), NumInitialBytes, NumResidualBytes); - } else { - // If there aren't any callee-saved registers, two-phase adjustment is - // inefficient. It's more efficient to adjust with NumInitialBytes too - // because when we're in a "callee pops argument space" situation, that pop - // must be tacked onto Initial for correctness. - NumInitialBytes = MFI->getStackSize(); - NumResidualBytes = 0; - } + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Add callee saved registers to move list. + const std::vector &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const DataLayout *TD = MF.getTarget().getDataLayout(); + bool HasFP = hasFP(MF); + + // Calculate amount of bytes used for return address storing. + int stackGrowth = -TD->getPointerSize(0); + + // Calculate offsets. + int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; + unsigned TotalSkipped = 0; + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea() + saveAreaOffset; + + // Don't output a new CFI directive if we're re-saving the frame pointer or + // link register. This happens when the PrologEpilogInserter has inserted an + // extra "STP" of the frame pointer and link register -- the "emitPrologue" + // method automatically generates the directives when frame pointers are + // used. If we generate CFI directives for the extra "STP"s, the linker will + // lose track of the correct values for the frame pointer and link register. + if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { + TotalSkipped += stackGrowth; + continue; + } - // Tell everyone else how much adjustment we're expecting them to use. In - // particular if an adjustment is required for a tail call the epilogue could - // have a different view of things. 
- FuncInfo->setInitialStackAdjust(NumInitialBytes); - - emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumInitialBytes, - MachineInstr::FrameSetup); - - if (NeedsFrameMoves && NumInitialBytes) { - // We emit this update even if the CFA is set from a frame pointer later so - // that the CFA is valid in the interim. - MachineLocation Dst(MachineLocation::VirtualFP); - unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, -NumInitialBytes)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, DwarfReg, Offset - TotalSkipped)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } +} - // Otherwise we need to set the frame pointer and/or add a second stack - // adjustment. - - bool FPNeedsSetting = hasFP(MF); - for (; MBBI != MBB.end(); ++MBBI) { - // Note that this search makes strong assumptions about the operation used - // to store the frame-pointer: it must be "STP x29, x30, ...". This could - // change in future, but until then there's no point in implementing - // untestable more generic cases. - if (FPNeedsSetting && MBBI->getOpcode() == AArch64::LSPair64_STR - && MBBI->getOperand(0).getReg() == AArch64::X29) { - int64_t X29FrameIdx = MBBI->getOperand(2).getIndex(); - FuncInfo->setFramePointerOffset(MFI->getObjectOffset(X29FrameIdx)); - - ++MBBI; - emitRegUpdate(MBB, MBBI, DL, TII, AArch64::X29, AArch64::XSP, - AArch64::X29, - NumInitialBytes + MFI->getObjectOffset(X29FrameIdx), - MachineInstr::FrameSetup); +void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. + MachineBasicBlock::iterator MBBI = MBB.begin(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool HasFP = hasFP(MF); + DebugLoc DL = MBB.findDebugLoc(MBBI); - // The offset adjustment used when emitting debugging locations relative - // to whatever frame base is set. AArch64 uses the default frame base (FP - // or SP) and this adjusts the calculations to be correct. - MFI->setOffsetAdjustment(- MFI->getObjectOffset(X29FrameIdx) - - MFI->getStackSize()); - - if (NeedsFrameMoves) { - unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true); - unsigned Offset = MFI->getObjectOffset(X29FrameIdx); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } + int NumBytes = (int)MFI->getStackSize(); + if (!AFI->hasStackFrame()) { + assert(!HasFP && "unexpected function without stack frame but with FP"); + + // All of the stack allocation is for locals. + AFI->setLocalStackSize(NumBytes); - FPNeedsSetting = false; + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + + // REDZONE: If the stack size is less than 128 bytes, we don't need + // to actually allocate. 
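The decision the REDZONE comment refers to restates canUseRedZone from earlier in this file: a leaf function (no calls, no frame pointer, no noredzone attribute) whose locals fit in 128 bytes can leave SP untouched and address them at negative offsets. A condensed sketch, with invented parameter names:

    inline bool mayUseRedZone(bool RedZoneEnabled, bool HasNoRedZoneAttr,
                              bool HasCalls, bool NeedsFP, unsigned LocalBytes) {
      if (!RedZoneEnabled || HasNoRedZoneAttr)
        return false;
      return !HasCalls && !NeedsFP && LocalBytes <= 128;
    }
    // e.g. a leaf with 24 bytes of locals keeps sp as-is and uses
    //   stur x0, [sp, #-8]
    //   stur x1, [sp, #-16]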
+ if (NumBytes && !canUseRedZone(MF)) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else if (NumBytes) { + ++NumRedZoneFunctions; } - if (!MBBI->getFlag(MachineInstr::FrameSetup)) - break; + return; } - assert(!FPNeedsSetting && "Frame pointer couldn't be set"); + // Only set up FP if we actually need to. + int FPOffset = 0; + if (HasFP) { + // First instruction must a) allocate the stack and b) have an immediate + // that is a multiple of -2. + assert((MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) && + MBBI->getOperand(3).getReg() == AArch64::SP && + MBBI->getOperand(4).getImm() < 0 && + (MBBI->getOperand(4).getImm() & 1) == 0); + + // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space + // required for the callee saved register area we get the frame pointer + // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. + FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; + assert(FPOffset >= 0 && "Bad Framepointer Offset"); + } - emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumResidualBytes, - MachineInstr::FrameSetup); + // Move past the saves of the callee-saved registers. + while (MBBI->getOpcode() == AArch64::STPXi || + MBBI->getOpcode() == AArch64::STPDi || + MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) { + ++MBBI; + NumBytes -= 16; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + if (HasFP) { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, + MachineInstr::FrameSetup); + } - // Now we emit the rest of the frame setup information, if necessary: we've - // already noted the FP and initial SP moves so we're left with the prologue's - // final SP update and callee-saved register locations. - if (!NeedsFrameMoves) - return; + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes); - // The rest of the stack adjustment - if (!hasFP(MF) && NumResidualBytes) { - MachineLocation Dst(MachineLocation::VirtualFP); - unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); - unsigned Offset = NumResidualBytes + NumInitialBytes; - unsigned CFIIndex = - MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + // Allocate space for the rest of the frame. + if (NumBytes) { + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); } - // And any callee-saved registers (it's fine to leave them to the end here, - // because the old values are still valid at this point. 
- const std::vector &CSI = MFI->getCalleeSavedInfo(); - if (CSI.size()) { - for (std::vector::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Offset = MFI->getObjectOffset(I->getFrameIdx()); - unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true); + // If we need a base pointer, set it up here. It's whatever the value of the + // stack pointer is at this point. Any variable size objects will be allocated + // after this, so we can still use the base pointer to reference locals. + // + // FIXME: Clarify FrameSetup flags here. + // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is + // needed. + // + if (RegInfo->hasBasePointer(MF)) + TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false); + + if (needsFrameMoves) { + const DataLayout *TD = MF.getTarget().getDataLayout(); + const int StackGrowth = -TD->getPointerSize(0); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // An example of the prologue: + // + // .globl __foo + // .align 2 + // __foo: + // Ltmp0: + // .cfi_startproc + // .cfi_personality 155, ___gxx_personality_v0 + // Leh_func_begin: + // .cfi_lsda 16, Lexception33 + // + // stp xa,bx, [sp, -#offset]! + // ... + // stp x28, x27, [sp, #offset-32] + // stp fp, lr, [sp, #offset-16] + // add fp, sp, #offset - 16 + // sub sp, sp, #1360 + // + // The Stack: + // +-------------------------------------------+ + // 10000 | ........ | ........ | ........ | ........ | + // 10004 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10008 | ........ | ........ | ........ | ........ | + // 1000c | ........ | ........ | ........ | ........ | + // +===========================================+ + // 10010 | X28 Register | + // 10014 | X28 Register | + // +-------------------------------------------+ + // 10018 | X27 Register | + // 1001c | X27 Register | + // +===========================================+ + // 10020 | Frame Pointer | + // 10024 | Frame Pointer | + // +-------------------------------------------+ + // 10028 | Link Register | + // 1002c | Link Register | + // +===========================================+ + // 10030 | ........ | ........ | ........ | ........ | + // 10034 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10038 | ........ | ........ | ........ | ........ | + // 1003c | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // + // [sp] = 10030 :: >>initial value<< + // sp = 10020 :: stp fp, lr, [sp, #-16]! + // fp = sp == 10020 :: mov fp, sp + // [sp] == 10020 :: stp x28, x27, [sp, #-16]! + // sp == 10010 :: >>final value<< + // + // The frame pointer (w29) points to address 10020. If we use an offset of + // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 + // for w27, and -32 for w28: + // + // Ltmp1: + // .cfi_def_cfa w29, 16 + // Ltmp2: + // .cfi_offset w30, -8 + // Ltmp3: + // .cfi_offset w29, -16 + // Ltmp4: + // .cfi_offset w27, -24 + // Ltmp5: + // .cfi_offset w28, -32 + + if (HasFP) { + // Define the current CFA rule to use the provided FP. 
+ unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Record the location of the stored LR + unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Record the location of the stored FP + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else { + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } + + // Now emit the moves for whatever callee saved regs we have. + emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); } } -void -AArch64FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - AArch64MachineFunctionInfo *FuncInfo = - MF.getInfo(); +static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { + unsigned RtIdx = 0; + if (MI->getOpcode() == AArch64::LDPXpost || + MI->getOpcode() == AArch64::LDPDpost) + RtIdx = 1; + + if (MI->getOpcode() == AArch64::LDPXpost || + MI->getOpcode() == AArch64::LDPDpost || + MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { + if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || + MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) + return false; + return true; + } + return false; +} + +void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64InstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); DebugLoc DL = MBBI->getDebugLoc(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned RetOpcode = MBBI->getOpcode(); + int NumBytes = MFI->getStackSize(); + const AArch64FunctionInfo *AFI = MF.getInfo(); + // Initial and residual are named for consitency with the prologue. Note that // in the epilogue, the residual adjustment is executed first. 
- uint64_t NumInitialBytes = FuncInfo->getInitialStackAdjust(); - uint64_t NumResidualBytes = MFI.getStackSize() - NumInitialBytes; uint64_t ArgumentPopSize = 0; - if (RetOpcode == AArch64::TC_RETURNdi || - RetOpcode == AArch64::TC_RETURNxi) { - MachineOperand &JumpTarget = MBBI->getOperand(0); + if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) { MachineOperand &StackAdjust = MBBI->getOperand(1); - MachineInstrBuilder MIB; - if (RetOpcode == AArch64::TC_RETURNdi) { - MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_Bimm)); - if (JumpTarget.isGlobal()) { - MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - } else { - assert(JumpTarget.isSymbol() && "unexpected tail call destination"); - MIB.addExternalSymbol(JumpTarget.getSymbolName(), - JumpTarget.getTargetFlags()); - } - } else { - assert(RetOpcode == AArch64::TC_RETURNxi && JumpTarget.isReg() - && "Unexpected tail call"); - - MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_BRx)); - MIB.addReg(JumpTarget.getReg(), RegState::Kill); - } - - // Add the extra operands onto the new tail call instruction even though - // they're not used directly (so that liveness is tracked properly etc). - for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) - MIB->addOperand(MBBI->getOperand(i)); - - - // Delete the pseudo instruction TC_RETURN. - MachineInstr *NewMI = std::prev(MBBI); - MBB.erase(MBBI); - MBBI = NewMI; - // For a tail-call in a callee-pops-arguments environment, some or all of // the stack may actually be in use for the call's arguments, this is // calculated during LowerCall and consumed here... @@ -241,386 +459,434 @@ AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // conveniently stored in the MachineFunctionInfo by // LowerFormalArguments. This will, of course, be zero for the C calling // convention. - ArgumentPopSize = FuncInfo->getArgumentStackToRestore(); + ArgumentPopSize = AFI->getArgumentStackToRestore(); } - assert(NumInitialBytes % 16 == 0 && NumResidualBytes % 16 == 0 - && "refusing to adjust stack by misaligned amt"); - - // We may need to address callee-saved registers differently, so find out the - // bound on the frame indices. - const std::vector &CSI = MFI.getCalleeSavedInfo(); - int MinCSFI = 0; - int MaxCSFI = -1; - - if (CSI.size()) { - MinCSFI = CSI[0].getFrameIdx(); - MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + // The stack frame should be like below, + // + // ---------------------- --- + // | | | + // | BytesInStackArgArea| CalleeArgStackSize + // | (NumReusableBytes) | (of tail call) + // | | --- + // | | | + // ---------------------| --- | + // | | | | + // | CalleeSavedReg | | | + // | (NumRestores * 16) | | | + // | | | | + // ---------------------| | NumBytes + // | | StackSize (StackAdjustUp) + // | LocalStackSize | | | + // | (covering callee | | | + // | args) | | | + // | | | | + // ---------------------- --- --- + // + // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize + // = StackSize + ArgumentPopSize + // + // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps + // it as the 2nd argument of AArch64ISD::TC_RETURN. + NumBytes += ArgumentPopSize; + + unsigned NumRestores = 0; + // Move past the restores of the callee-saved registers. 
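// [Editor's note] A tiny standalone check of the accounting described in the
// stack diagram above, with invented numbers purely for illustration.
#include <cassert>

int main() {
  int StackSize = 80;        // MFI->getStackSize()
  int ArgumentPopSize = 16;  // stack arguments popped for the tail call
  int NumRestores = 2;       // callee-saved LDP pairs walked over below

  int NumBytes = StackSize + ArgumentPopSize; // 96, per the diagram
  NumBytes -= NumRestores * 16;               // each LDP pair frees 16 bytes
  assert(NumBytes == 64);                     // remaining SP adjustment
  return 0;
}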
+ MachineBasicBlock::iterator LastPopI = MBBI; + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + if (LastPopI != MBB.begin()) { + do { + ++NumRestores; + --LastPopI; + } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); + if (!isCSRestore(LastPopI, CSRegs)) { + ++LastPopI; + --NumRestores; + } } - - // The "residual" stack update comes first from this direction and guarantees - // that SP is NumInitialBytes below its value on function entry, either by a - // direct update or restoring it from the frame pointer. - if (NumInitialBytes + ArgumentPopSize != 0) { - emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, - NumInitialBytes + ArgumentPopSize); - --MBBI; + NumBytes -= NumRestores * 16; + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + if (!hasFP(MF)) { + // If this was a redzone leaf function, we don't need to restore the + // stack pointer. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, + TII); + return; } + // Restore the original stack pointer. + // FIXME: Rather than doing the math here, we should instead just use + // non-post-indexed loads for the restores if we aren't actually going to + // be able to save any instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, + -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); +} - // MBBI now points to the instruction just past the last callee-saved - // restoration (either RET/B if NumInitialBytes == 0, or the "ADD sp, sp" - // otherwise). +/// getFrameIndexOffset - Returns the displacement from the frame register to +/// the stack frame of the specified index. +int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + unsigned FrameReg; + return getFrameIndexReference(MF, FI, FrameReg); +} - // Now we need to find out where to put the bulk of the stack adjustment - MachineBasicBlock::iterator FirstEpilogue = MBBI; - while (MBBI != MBB.begin()) { - --MBBI; +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. +int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + return resolveFrameIndexReference(MF, FI, FrameReg); +} - unsigned FrameOp; - for (FrameOp = 0; FrameOp < MBBI->getNumOperands(); ++FrameOp) { - if (MBBI->getOperand(FrameOp).isFI()) - break; +int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + bool PreferFP) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + const AArch64FunctionInfo *AFI = MF.getInfo(); + int FPOffset = MFI->getObjectOffset(FI) + 16; + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Use frame pointer to reference fixed objects. Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + // Make sure useFPForScavengingIndex() does the right thing for the emergency + // spill slot. + bool UseFP = false; + if (AFI->hasStackFrame()) { + // Note: Keeping the following as multiple 'if' statements rather than + // merging to a single expression for readability. 
+ // + // Argument access should always use the FP. + if (isFixed) { + UseFP = hasFP(MF); + } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) { + // Use SP or FP, whichever gives us the best chance of the offset + // being in range for direct access. If the FPOffset is positive, + // that'll always be best, as the SP will be even further away. + // If the FPOffset is negative, we have to keep in mind that the + // available offset range for negative offsets is smaller than for + // positive ones. If we have variable sized objects, we're stuck with + // using the FP regardless, though, as the SP offset is unknown + // and we don't have a base pointer available. If an offset is + // available via the FP and the SP, use whichever is closest. + if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || + (FPOffset >= -256 && Offset > -FPOffset)) + UseFP = true; } - - // If this instruction doesn't have a frame index we've reached the end of - // the callee-save restoration. - if (FrameOp == MBBI->getNumOperands()) - break; - - // Likewise if it *is* a local reference, but not to a callee-saved object. - int FrameIdx = MBBI->getOperand(FrameOp).getIndex(); - if (FrameIdx < MinCSFI || FrameIdx > MaxCSFI) - break; - - FirstEpilogue = MBBI; } - if (MF.getFrameInfo()->hasVarSizedObjects()) { - int64_t StaticFrameBase; - StaticFrameBase = -(NumInitialBytes + FuncInfo->getFramePointerOffset()); - emitRegUpdate(MBB, FirstEpilogue, DL, TII, - AArch64::XSP, AArch64::X29, AArch64::NoRegister, - StaticFrameBase); - } else { - emitSPUpdate(MBB, FirstEpilogue, DL,TII, AArch64::X16, NumResidualBytes); + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; } -} -int64_t -AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF, - int FrameIndex, - unsigned &FrameReg, - int SPAdj, - bool IsCalleeSaveOp) const { - AArch64MachineFunctionInfo *FuncInfo = - MF.getInfo(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - int64_t TopOfFrameOffset = MFI->getObjectOffset(FrameIndex); - - assert(!(IsCalleeSaveOp && FuncInfo->getInitialStackAdjust() == 0) - && "callee-saved register in unexpected place"); - - // If the frame for this function is particularly large, we adjust the stack - // in two phases which means the callee-save related operations see a - // different (intermediate) stack size. - int64_t FrameRegPos; - if (IsCalleeSaveOp) { - FrameReg = AArch64::XSP; - FrameRegPos = -static_cast(FuncInfo->getInitialStackAdjust()); - } else if (useFPForAddressing(MF)) { - // Have to use the frame pointer since we have no idea where SP is. - FrameReg = AArch64::X29; - FrameRegPos = FuncInfo->getFramePointerOffset(); - } else { - FrameReg = AArch64::XSP; - FrameRegPos = -static_cast(MFI->getStackSize()) + SPAdj; + // Use the base pointer if we have one. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else { + FrameReg = AArch64::SP; + // If we're using the red zone for this function, the SP won't actually + // be adjusted, so the offsets will be negative. They're also all + // within range of the signed 9-bit immediate instructions. 
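// [Editor's note] The FP-versus-SP choice above is easier to read in
// isolation. This is a simplified standalone restatement of the heuristic,
// not the patch's function; -256 mirrors the comment about the smaller
// negative range available to the unscaled (signed 9-bit) addressing forms.
#include <cassert>

static bool shouldUseFP(bool IsFixedObject, bool HasFP, bool HasBasePointer,
                        bool HasVarSizedObjects, bool PreferFP, int FPOffset,
                        int SPOffset) {
  if (IsFixedObject)            // incoming stack arguments: access via FP
    return HasFP;
  if (!HasFP || HasBasePointer) // locals use SP (or the base pointer) instead
    return false;
  // Otherwise pick whichever base keeps the immediate in range / closer.
  return PreferFP || HasVarSizedObjects || FPOffset >= 0 ||
         (FPOffset >= -256 && SPOffset > -FPOffset);
}

int main() {
  // A local 8 bytes below fp but 72 bytes above sp: FP gives the closer offset.
  assert(shouldUseFP(false, true, false, false, false, -8, 72));
  // A local 512 bytes below fp: out of the small negative range, use SP.
  assert(!shouldUseFP(false, true, false, false, false, -512, 24));
  return 0;
}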
+ if (canUseRedZone(MF)) + Offset -= AFI->getLocalStackSize(); } - return TopOfFrameOffset - FrameRegPos; + return Offset; } -void -AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { - const AArch64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64InstrInfo &TII = - *static_cast(MF.getTarget().getInstrInfo()); - - if (hasFP(MF)) { - MF.getRegInfo().setPhysRegUsed(AArch64::X29); - MF.getRegInfo().setPhysRegUsed(AArch64::X30); - } +static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { + if (Reg != AArch64::LR) + return getKillRegState(true); - // If addressing of local variables is going to be more complicated than - // shoving a base register and an offset into the instruction then we may well - // need to scavenge registers. We should either specifically add an - // callee-save register for this purpose or allocate an extra spill slot. - bool BigStack = - MFI->estimateStackSize(MF) >= TII.estimateRSStackLimit(MF) - || MFI->hasVarSizedObjects() // Access will be from X29: messes things up - || (MFI->adjustsStack() && !hasReservedCallFrame(MF)); - - if (!BigStack) - return; - - // We certainly need some slack space for the scavenger, preferably an extra - // register. - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(); - MCPhysReg ExtraReg = AArch64::NoRegister; - - for (unsigned i = 0; CSRegs[i]; ++i) { - if (AArch64::GPR64RegClass.contains(CSRegs[i]) && - !MF.getRegInfo().isPhysRegUsed(CSRegs[i])) { - ExtraReg = CSRegs[i]; - break; - } - } - - if (ExtraReg != 0) { - MF.getRegInfo().setPhysRegUsed(ExtraReg); - } else { - assert(RS && "Expect register scavenger to be available"); - - // Create a stack slot for scavenging purposes. PrologEpilogInserter - // helpfully places it near either SP or FP for us to avoid - // infinitely-regression during scavenging. - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; - RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); - } + // LR maybe referred to later by an @llvm.returnaddress intrinsic. + bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); + bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); + return getKillRegState(LRKill); } -bool AArch64FrameLowering::determinePrologueDeath(MachineBasicBlock &MBB, - unsigned Reg) const { - // If @llvm.returnaddress is called then it will refer to X30 by some means; - // the prologue store does not kill the register. - if (Reg == AArch64::X30) { - if (MBB.getParent()->getFrameInfo()->isReturnAddressTaken() - && MBB.getParent()->getRegInfo().isLiveIn(Reg)) - return false; +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned idx = Count - i - 2; + unsigned Reg1 = CSI[idx].getReg(); + unsigned Reg2 = CSI[idx + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. 
Assert if we see anything otherwise. + // + // The order of the registers in the list is controlled by + // getCalleeSavedRegs(), so they will always be in-order, as well. + assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + "Out of order callee saved regs!"); + unsigned StrOpc; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + // Issue sequence of non-sp increment and pi sp spills for cs regs. The + // first spill is a pre-increment that allocates the stack. + // For example: + // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x20, x19, [sp, #16] // addImm(+2) + // stp fp, lr, [sp, #32] // addImm(+4) + // Rationale: This sequence saves uop updates compared to a sequence of + // pre-increment spills like stp xi,xj,[sp,#-16]! + // Note: Similar rational and sequence for restores in epilog. + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPXpre; + else + StrOpc = AArch64::STPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPDpre; + else + StrOpc = AArch64::STPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() + << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); + // Compute offset: i = 0 => offset = -Count; + // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. + const int Offset = (i == 0) ? -Count : i; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for STP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); + if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) + .addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::SP) + .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .setMIFlag(MachineInstr::FrameSetup); } - - // In all other cases, physical registers are dead after they've been saved - // but live at the beginning of the prologue block. - MBB.addLiveIn(Reg); return true; } -void -AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const std::vector &CSI, - const TargetRegisterInfo *TRI, - const LoadStoreMethod PossClasses[], - unsigned NumClasses) const { - DebugLoc DL = MBB.findDebugLoc(MBBI); +bool AArch64FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned Reg1 = CSI[i].getReg(); + unsigned Reg2 = CSI[i + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. 
We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && + "Out of order callee saved regs!"); + // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only + // the last load is sp-pi post-increment and de-allocates the stack: + // For example: + // ldp fp, lr, [sp, #32] // addImm(+4) + // ldp x20, x19, [sp, #16] // addImm(+2) + // ldp x22, x21, [sp], #48 // addImm(+6) + // Note: see comment in spillCalleeSavedRegisters() + unsigned LdrOpc; + + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPXpost; + else + LdrOpc = AArch64::LDPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPDpost; + else + LdrOpc = AArch64::LDPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() + << ", " << CSI[i + 1].getFrameIdx() << ")\n"); + + // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; + // etc. + const int Offset = (i == Count - 2) ? Count : Count - i - 2; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for LDP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getDefRegState(true)) + .addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::SP) + .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] + // where the factor * 8 is implicit + } + return true; +} - // A certain amount of implicit contract is present here. The actual stack - // offsets haven't been allocated officially yet, so for strictly correct code - // we rely on the fact that the elements of CSI are allocated in order - // starting at SP, purely as dictated by size and alignment. In practice since - // this function handles the only accesses to those slots it's not quite so - // important. - // - // We have also ordered the Callee-saved register list in AArch64CallingConv - // so that the above scheme puts registers in order: in particular we want - // &X30 to be &X29+8 for an ABI-correct frame record (PCS 5.2.2) - for (unsigned i = 0, e = CSI.size(); i < e; ++i) { - unsigned Reg = CSI[i].getReg(); - - // First we need to find out which register class the register belongs to so - // that we can use the correct load/store instrucitons. - unsigned ClassIdx; - for (ClassIdx = 0; ClassIdx < NumClasses; ++ClassIdx) { - if (PossClasses[ClassIdx].RegClass->contains(Reg)) - break; - } - assert(ClassIdx != NumClasses - && "Asked to store register in unexpected class"); - const TargetRegisterClass &TheClass = *PossClasses[ClassIdx].RegClass; - - // Now we need to decide whether it's possible to emit a paired instruction: - // for this we want the next register to be in the same class. 
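// [Editor's note] A standalone check of the STP/LDP offset arithmetic used by
// the new spillCalleeSavedRegisters()/restoreCalleeSavedRegisters() above.
// Values follow the example in the patch's own comments (three pairs, i.e.
// Count == 6); the immediates are in units of 8 bytes, the scale is implicit.
#include <cassert>

int main() {
  const int Count = 6; // three callee-saved register pairs
  int SpillOff[3], RestoreOff[3];

  // Spill: first pair is a pre-index STP that allocates the block.
  //   stp x22, x21, [sp, #-48]!   // -Count
  //   stp x20, x19, [sp, #16]     // +2
  //   stp fp,  lr,  [sp, #32]     // +4
  for (int i = 0, p = 0; i < Count; i += 2, ++p)
    SpillOff[p] = (i == 0) ? -Count : i;

  // Restore: last pair is a post-index LDP that frees the block.
  //   ldp fp,  lr,  [sp, #32]     // Count - i - 2
  //   ldp x20, x19, [sp, #16]
  //   ldp x22, x21, [sp], #48     // Count
  for (int i = 0, p = 0; i < Count; i += 2, ++p)
    RestoreOff[p] = (i == Count - 2) ? Count : Count - i - 2;

  assert(SpillOff[0] == -6 && SpillOff[1] == 2 && SpillOff[2] == 4);
  assert(RestoreOff[0] == 4 && RestoreOff[1] == 2 && RestoreOff[2] == 6);
  return 0;
}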
- MachineInstrBuilder NewMI; - bool Pair = false; - if (i + 1 < CSI.size() && TheClass.contains(CSI[i+1].getReg())) { - Pair = true; - unsigned StLow = 0, StHigh = 0; - if (isPrologue) { - // Most of these registers will be live-in to the MBB and killed by our - // store, though there are exceptions (see determinePrologueDeath). - StLow = getKillRegState(determinePrologueDeath(MBB, CSI[i+1].getReg())); - StHigh = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg())); - } else { - StLow = RegState::Define; - StHigh = RegState::Define; - } +void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( + MachineFunction &MF, RegScavenger *RS) const { + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + AArch64FunctionInfo *AFI = MF.getInfo(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + SmallVector UnspilledCSGPRs; + SmallVector UnspilledCSFPRs; - NewMI = BuildMI(MBB, MBBI, DL, TII.get(PossClasses[ClassIdx].PairOpcode)) - .addReg(CSI[i+1].getReg(), StLow) - .addReg(CSI[i].getReg(), StHigh); + // The frame record needs to be created by saving the appropriate registers + if (hasFP(MF)) { + MRI->setPhysRegUsed(AArch64::FP); + MRI->setPhysRegUsed(AArch64::LR); + } - // If it's a paired op, we've consumed two registers - ++i; - } else { - unsigned State; - if (isPrologue) { - State = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg())); + // Spill the BasePtr if it's used. Do this first thing so that the + // getCalleeSavedRegs() below will get the right answer. + if (RegInfo->hasBasePointer(MF)) + MRI->setPhysRegUsed(RegInfo->getBaseRegister()); + + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumGPRSpilled = 0; + unsigned NumFPRSpilled = 0; + bool ExtraCSSpill = false; + bool CanEliminateFrame = true; + DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + + // Check pairs of consecutive callee-saved registers. + for (unsigned i = 0; CSRegs[i]; i += 2) { + assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); + + const unsigned OddReg = CSRegs[i]; + const unsigned EvenReg = CSRegs[i + 1]; + assert((AArch64::GPR64RegClass.contains(OddReg) && + AArch64::GPR64RegClass.contains(EvenReg)) ^ + (AArch64::FPR64RegClass.contains(OddReg) && + AArch64::FPR64RegClass.contains(EvenReg)) && + "Register class mismatch!"); + + const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); + const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); + + // Early exit if none of the registers in the register pair is actually + // used. + if (!OddRegUsed && !EvenRegUsed) { + if (AArch64::GPR64RegClass.contains(OddReg)) { + UnspilledCSGPRs.push_back(OddReg); + UnspilledCSGPRs.push_back(EvenReg); } else { - State = RegState::Define; + UnspilledCSFPRs.push_back(OddReg); + UnspilledCSFPRs.push_back(EvenReg); } + continue; + } - NewMI = BuildMI(MBB, MBBI, DL, - TII.get(PossClasses[ClassIdx].SingleOpcode)) - .addReg(CSI[i].getReg(), State); + unsigned Reg = AArch64::NoRegister; + // If only one of the registers of the register pair is used, make sure to + // mark the other one as used as well. + if (OddRegUsed ^ EvenRegUsed) { + // Find out which register is the additional spill. + Reg = OddRegUsed ? EvenReg : OddReg; + MRI->setPhysRegUsed(Reg); } - // Note that the FrameIdx refers to the second register in a pair: it will - // be allocated the smaller numeric address and so is the one an LDP/STP - // address must use. 
- int FrameIdx = CSI[i].getFrameIdx(); - MachineMemOperand::MemOperandFlags Flags; - Flags = isPrologue ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - Flags, - Pair ? TheClass.getSize() * 2 : TheClass.getSize(), - MFI.getObjectAlignment(FrameIdx)); - - NewMI.addFrameIndex(FrameIdx) - .addImm(0) // address-register offset - .addMemOperand(MMO); - - if (isPrologue) - NewMI.setMIFlags(MachineInstr::FrameSetup); - - // For aesthetic reasons, during an epilogue we want to emit complementary - // operations to the prologue, but in the opposite order. So we still - // iterate through the CalleeSavedInfo list in order, but we put the - // instructions successively earlier in the MBB. - if (!isPrologue) - --MBBI; + DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); + DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); + + assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || + (RegInfo->getEncodingValue(OddReg) + 1 == + RegInfo->getEncodingValue(EvenReg))) && + "Register pair of non-adjacent registers!"); + if (AArch64::GPR64RegClass.contains(OddReg)) { + NumGPRSpilled += 2; + // If it's not a reserved register, we can use it in lieu of an + // emergency spill slot for the register scavenger. + // FIXME: It would be better to instead keep looking and choose another + // unspilled register that isn't reserved, if there is one. + if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } else + NumFPRSpilled += 2; + + CanEliminateFrame = false; } -} - -bool -AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; - - static const LoadStoreMethod PossibleClasses[] = { - {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR}, - {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR}, - }; - const unsigned NumClasses = llvm::array_lengthof(PossibleClasses); - - emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI, - PossibleClasses, NumClasses); - - return true; -} - -bool -AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - - if (CSI.empty()) - return false; - - static const LoadStoreMethod PossibleClasses[] = { - {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR}, - {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR}, - }; - const unsigned NumClasses = llvm::array_lengthof(PossibleClasses); - - emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI, - PossibleClasses, NumClasses); - - return true; -} - -bool -AArch64FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo(); - - // This is a decision of ABI compliance. The AArch64 PCS gives various options - // for conformance, and even at the most stringent level more or less permits - // elimination for leaf functions because there's no loss of functionality - // (for debugging etc).. - if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->hasCalls()) - return true; - // The following are hard-limits: incorrect code will be generated if we try - // to omit the frame. 
- return (RI->needsStackRealignment(MF) || - MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - -bool -AArch64FrameLowering::useFPForAddressing(const MachineFunction &MF) const { - return MF.getFrameInfo()->hasVarSizedObjects(); -} - -bool -AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Of the various reasons for having a frame pointer, it's actually only - // variable-sized objects that prevent reservation of a call frame. - return !(hasFP(MF) && MFI->hasVarSizedObjects()); -} - -void -AArch64FrameLowering::eliminateCallFramePseudoInstr( - MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - const AArch64InstrInfo &TII = - *static_cast(MF.getTarget().getInstrInfo()); - DebugLoc dl = MI->getDebugLoc(); - int Opcode = MI->getOpcode(); - bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode(); - uint64_t CalleePopAmount = IsDestroy ? MI->getOperand(1).getImm() : 0; - - if (!hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - - int64_t Amount = MI->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); - if (!IsDestroy) Amount = -Amount; + // FIXME: Set BigStack if any stack slot references may be out of range. + // For now, just conservatively guestimate based on unscaled indexing + // range. We'll end up allocating an unnecessary spill slot a lot, but + // realistically that's not a big deal at this stage of the game. + // The CSR spill slots have not been allocated yet, so estimateStackSize + // won't include them. + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); + bool BigStack = (CFSize >= 256); + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) + AFI->setHasStackFrame(true); + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. If we already spilled an extra callee-saved register + // above to keep the number of spills even, we don't need to do anything else + // here. + if (BigStack && !ExtraCSSpill) { + + // If we're adding a register to spill here, we have to add two of them + // to keep the number of regs to spill even. + assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); + unsigned Count = 0; + while (!UnspilledCSGPRs.empty() && Count < 2) { + unsigned Reg = UnspilledCSGPRs.back(); + UnspilledCSGPRs.pop_back(); + DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) + << " to get a scratch register.\n"); + MRI->setPhysRegUsed(Reg); + ExtraCSSpill = true; + ++Count; + } - // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it - // doesn't have to pop anything), then the first operand will be zero too so - // this adjustment is a no-op. - if (CalleePopAmount == 0) { - // FIXME: in-function stack adjustment for calls is limited to 12-bits - // because there's no guaranteed temporary register available. Mostly call - // frames will be allocated at the start of a function so this is OK, but - // it is a limitation that needs dealing with. 
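// [Editor's note] A worked example, with invented numbers, of the frame-size
// guesstimate in processFunctionBeforeCalleeSavedScan() above that decides
// whether an extra callee-saved register or an emergency spill slot is
// reserved for the register scavenger.
#include <cassert>

int main() {
  unsigned EstimatedLocals = 200;  // estimateStackSize(MF), CSRs not included
  unsigned NumGPRSpilled = 4;      // two GPR pairs
  unsigned NumFPRSpilled = 2;      // one FPR pair

  unsigned CFSize = EstimatedLocals + 8 * (NumGPRSpilled + NumFPRSpilled);
  assert(CFSize == 248);

  bool BigStack = (CFSize >= 256); // false: 248 bytes stays under the limit,
  assert(!BigStack);               // so no scavenging provision is needed
  return 0;
}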
- assert(Amount > -0xfff && Amount < 0xfff && "call frame too large"); - emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, Amount); + // If we didn't find an extra callee-saved register to spill, create + // an emergency spill slot. + if (!ExtraCSSpill) { + const TargetRegisterClass *RC = &AArch64::GPR64RegClass; + int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); + RS->addScavengingFrameIndex(FI); + DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI + << " as the emergency spill slot.\n"); } - } else if (CalleePopAmount != 0) { - // If the calling convention demands that the callee pops arguments from the - // stack, we want to add it back if we have a reserved call frame. - assert(CalleePopAmount < 0xfff && "call frame too large"); - emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, -CalleePopAmount); } - - MBB.erase(MI); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 6ec27e3104f0..7686e6f18432 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -1,4 +1,4 @@ -//==- AArch64FrameLowering.h - Define frame lowering for AArch64 -*- C++ -*--=// +//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -7,100 +7,60 @@ // //===----------------------------------------------------------------------===// // -// This class implements the AArch64-specific parts of the TargetFrameLowering -// class. +// // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64_FRAMEINFO_H -#define LLVM_AARCH64_FRAMEINFO_H +#ifndef AArch64_FRAMELOWERING_H +#define AArch64_FRAMELOWERING_H -#include "AArch64Subtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { -class AArch64Subtarget; class AArch64FrameLowering : public TargetFrameLowering { -private: - // In order to unify the spilling and restoring of callee-saved registers into - // emitFrameMemOps, we need to be able to specify which instructions to use - // for the relevant memory operations on each register class. An array of the - // following struct is populated and passed in to achieve this. - struct LoadStoreMethod { - const TargetRegisterClass *RegClass; // E.g. GPR64RegClass +public: + explicit AArch64FrameLowering() + : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + false /*StackRealignable*/) {} - // The preferred instruction. - unsigned PairOpcode; // E.g. LSPair64_STR + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const; - // Sometimes only a single register can be handled at once. - unsigned SingleOpcode; // E.g. LS64_STR - }; -protected: - const AArch64Subtarget &STI; - -public: - explicit AArch64FrameLowering(const AArch64Subtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0, 16), - STI(sti) { - } + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. void emitPrologue(MachineFunction &MF) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - /// Decides how much stack adjustment to perform in each phase of the prologue - /// and epilogue. 
- void splitSPAdjustments(uint64_t Total, uint64_t &Initial, - uint64_t &Residual) const; - - int64_t resolveFrameIndexReference(MachineFunction &MF, int FrameIndex, - unsigned &FrameReg, int SPAdj, - bool IsCalleeSaveOp) const; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; - + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + int resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, + bool PreferFP = false) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - void - eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const override; + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; - /// If the register is X30 (i.e. LR) and the return address is used in the - /// function then the callee-save store doesn't actually kill the register, - /// otherwise it does. - bool determinePrologueDeath(MachineBasicBlock &MBB, unsigned Reg) const; - - /// This function emits the loads or stores required during prologue and - /// epilogue as efficiently as possible. - /// - /// The operations involved in setting up and tearing down the frame are - /// similar enough to warrant a shared function, particularly as discrepancies - /// between the two would be disastrous. - void emitFrameMemOps(bool isStore, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI, - const LoadStoreMethod PossibleClasses[], - unsigned NumClasses) const; + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; + /// \brief Can this function use the red zone for local allocations. + bool canUseRedZone(const MachineFunction &MF) const; bool hasFP(const MachineFunction &MF) const override; - - bool useFPForAddressing(const MachineFunction &MF) const; - - /// On AA bool hasReservedCallFrame(const MachineFunction &MF) const override; + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const override; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index d1d89af6e04d..98609760a73a 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -11,15 +11,16 @@ // //===----------------------------------------------------------------------===// -#include "AArch64.h" -#include "AArch64InstrInfo.h" -#include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" -#include "Utils/AArch64BaseInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/APSInt.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" // To access function attributes. 
#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -27,103 +28,102 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-isel" //===--------------------------------------------------------------------===// -/// AArch64 specific code to select AArch64 machine instructions for -/// SelectionDAG operations. +/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine +/// instructions for SelectionDAG operations. /// namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { AArch64TargetMachine &TM; - /// Keep a pointer to the AArch64Subtarget around so that we can + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; + bool ForCodeSize; + public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(&TM.getSubtarget()) { - } + : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + ForCodeSize(false) {} const char *getPassName() const override { return "AArch64 Instruction Selection"; } - // Include the pieces autogenerated from the target description. -#include "AArch64GenDAGISel.inc" - - template - bool SelectOffsetUImm12(SDValue N, SDValue &UImm12) { - const ConstantSDNode *CN = dyn_cast(N); - if (!CN || CN->getZExtValue() % MemSize != 0 - || CN->getZExtValue() / MemSize > 0xfff) - return false; - - UImm12 = CurDAG->getTargetConstant(CN->getZExtValue() / MemSize, MVT::i64); - return true; - } - - template - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { - return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); - } - - /// Used for pre-lowered address-reference nodes, so we already know - /// the fields match. This operand's job is simply to add an - /// appropriate shift operand to the MOVZ/MOVK instruction. - template - bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) { - Imm = N; - Shift = CurDAG->getTargetConstant(LogShift, MVT::i32); - return true; + bool runOnMachineFunction(MachineFunction &MF) override { + AttributeSet FnAttrs = MF.getFunction()->getAttributes(); + ForCodeSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + Subtarget = &TM.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); } - bool SelectFPZeroOperand(SDValue N, SDValue &Dummy); - - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth); + SDNode *Select(SDNode *Node) override; + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. 
bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector &OutOps) override; - bool SelectLogicalImm(SDValue N, SDValue &Imm); - - template - bool SelectTSTBOperand(SDValue N, SDValue &FixedPos) { - return SelectTSTBOperand(N, FixedPos, RegWidth); + SDNode *SelectMLAV64LaneV128(SDNode *N); + SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); + bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, false, Reg, Shift); + } + bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, true, Reg, Shift); + } + bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 16, Base, OffImm); + } + bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 1, Base, OffImm); + } + bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 2, Base, OffImm); + } + bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 4, Base, OffImm); + } + bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 8, Base, OffImm); + } + bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 16, Base, OffImm); } - bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth); - - SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, - unsigned Op64); - - /// Put the given constant into a pool and return a DAG which will give its - /// address. - SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV); - - SDNode *TrySelectToMoveImm(SDNode *N); - SDNode *LowerToFPLitPool(SDNode *Node); - SDNode *SelectToLitPool(SDNode *N); - - SDNode* Select(SDNode*) override; -private: - /// Get the opcode for table lookup instruction - unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec); - - /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4. - /// IsExt is to indicate if the result will be extended with an argument. - SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt); + template + bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } - /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4. 
- SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *Opcode); + template + bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } - /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4. - SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes); /// Form sequences of consecutive 64/128-bit registers for use in NEON /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have @@ -137,313 +137,710 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], unsigned SubRegs[]); - /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4. - /// The opcode array specifies the instructions used for load. - SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes); + SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + + SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + + SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4. - /// The opcode arrays specify the instructions used for load/store. - SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, - unsigned NumVecs, const uint16_t *Opcodes); + SDNode *SelectBitfieldExtractOp(SDNode *N); + SDNode *SelectBitfieldInsertOp(SDNode *N); + + SDNode *SelectLIBM(SDNode *N); + +// Include the pieces autogenerated from the target description. +#include "AArch64GenDAGISel.inc" + +private: + bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, + SDValue &Shift); + bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool isWorthFolding(SDValue V) const; + bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, + SDValue &Offset, SDValue &SignExtend); + + template + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { + return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); + } - SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, - SDValue Operand); + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); }; +} // end anonymous namespace + +/// isIntImmediate - This method tests to see if the node is a constant +/// operand. If so Imm will receive the 32-bit value. 
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { + if (const ConstantSDNode *C = dyn_cast(N)) { + Imm = C->getZExtValue(); + return true; + } + return false; } -bool -AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth) { - const ConstantFPSDNode *CN = dyn_cast(N); - if (!CN) return false; +// isIntImmediate - This method tests to see if a constant operand. +// If so Imm will receive the value. +static bool isIntImmediate(SDValue N, uint64_t &Imm) { + return isIntImmediate(N.getNode(), Imm); +} - // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits - // is between 1 and 32 for a destination w-register, or 1 and 64 for an - // x-register. - // - // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we - // want THIS_NODE to be 2^fbits. This is much easier to deal with using - // integers. - bool IsExact; +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, + uint64_t &Imm) { + return N->getOpcode() == Opc && + isIntImmediate(N->getOperand(1).getNode(), Imm); +} - // fbits is between 1 and 64 in the worst-case, which means the fmul - // could have 2^64 as an actual operand. Need 65 bits of precision. - APSInt IntVal(65, true); - CN->getValueAPF().convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); +bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, char ConstraintCode, std::vector &OutOps) { + assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); + // Require the address to be in a register. That is safe for all AArch64 + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; +} - // N.b. isPowerOf2 also checks for > 0. - if (!IsExact || !IntVal.isPowerOf2()) return false; - unsigned FBits = IntVal.logBase2(); +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + if (!isa(N.getNode())) + return false; - // Checks above should have guaranteed that we haven't lost information in - // finding FBits, but it must still be in range. 
-  if (FBits == 0 || FBits > RegWidth) return false;
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+  unsigned ShiftAmt;
+
+  if (Immed >> 12 == 0) {
+    ShiftAmt = 0;
+  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+    ShiftAmt = 12;
+    Immed = Immed >> 12;
+  } else
+    return false;

-  FixedPos = CurDAG->getTargetConstant(64 - FBits, MVT::i32);
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
   return true;
 }

-bool
-AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                                  char ConstraintCode,
-                                                  std::vector<SDValue> &OutOps) {
-  switch (ConstraintCode) {
-  default: llvm_unreachable("Unrecognised AArch64 memory constraint");
-  case 'm':
-    // FIXME: more freedom is actually permitted for 'm'. We can go
-    // hunting for a base and an offset if we want. Of course, since
-    // we don't really know how the operand is going to be used we're
-    // probably restricted to the load/store pair's simm7 as an offset
-    // range anyway.
-  case 'Q':
-    OutOps.push_back(Op);
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+                                              SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcodes it's interested in; however,
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  // The immediate operand must be a 24-bit zero-extended immediate.
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+  // have the opposite effect on the C flag, so this pattern mustn't match under
+  // those circumstances.
+  if (Immed == 0)
+    return false;
+
+  if (N.getValueType() == MVT::i32)
+    Immed = ~((uint32_t)Immed) + 1;
+  else
+    Immed = ~Immed + 1ULL;
+  if (Immed & 0xFFFFFFFFFF000000ULL)
+    return false;
+
+  Immed &= 0xFFFFFFULL;
+  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+}
+
+/// getShiftTypeForNode - Translate a shift node to the corresponding
+/// ShiftType value.
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
+  switch (N.getOpcode()) {
+  default:
+    return AArch64_AM::InvalidShiftExtend;
+  case ISD::SHL:
+    return AArch64_AM::LSL;
+  case ISD::SRL:
+    return AArch64_AM::LSR;
+  case ISD::SRA:
+    return AArch64_AM::ASR;
+  case ISD::ROTR:
+    return AArch64_AM::ROR;
   }
+}

+/// \brief Determine whether it is worth folding V into an extended register.
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+  // It hurts if a value is used at least twice, unless we are optimizing
+  // for code size.
+  if (ForCodeSize || V.hasOneUse())
+    return true;
   return false;
 }

-bool
-AArch64DAGToDAGISel::SelectFPZeroOperand(SDValue N, SDValue &Dummy) {
-  ConstantFPSDNode *Imm = dyn_cast<ConstantFPSDNode>(N);
-  if (!Imm || !Imm->getValueAPF().isPosZero())
+/// SelectShiftedRegister - Select a "shifted register" operand. If the value
+/// is not shifted, set the Shift operand to default of "LSL 0". The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not. The AllowROR parameter specifies whether ROR is
+/// supported.
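// The immediate rules implemented by SelectArithImmed and SelectNegArithImmed
// above reduce to plain integer arithmetic: an ADD/SUB immediate must be a
// 12-bit value optionally shifted left by 12, and the negated form is tried
// for everything except 0 (which would invert the meaning of the C flag for
// cmp/cmn). A minimal standalone sketch of that rule follows; the helper
// names and the EncodedImm struct are illustrative only and are not part of
// the backend.
#include <cstdint>
#include <optional>

struct EncodedImm {
  uint64_t Imm12;    // value placed in the 12-bit immediate field
  unsigned ShiftAmt; // 0 or 12
};

// Mirrors the check in SelectArithImmed: Imm is representable iff it is
// imm12 or imm12 << 12.
static std::optional<EncodedImm> encodeArithImmed(uint64_t Imm) {
  if (Imm >> 12 == 0)
    return EncodedImm{Imm, 0};
  if ((Imm & 0xfff) == 0 && Imm >> 24 == 0)
    return EncodedImm{Imm >> 12, 12};
  return std::nullopt;
}

// Mirrors SelectNegArithImmed: negate modulo the register width and retry,
// refusing 0 and anything whose negation does not fit in 24 bits.
static std::optional<EncodedImm> encodeNegArithImmed(uint64_t Imm,
                                                     bool Is32Bit) {
  if (Imm == 0)
    return std::nullopt;
  uint64_t Neg = Is32Bit ? uint64_t(~uint32_t(Imm) + 1) : ~Imm + 1ULL;
  if (Neg & 0xFFFFFFFFFF000000ULL)
    return std::nullopt;
  return encodeArithImmed(Neg & 0xFFFFFFULL);
}
// For example, 0x123000 encodes as {0x123, 12}, while "cmp w0, #-8" goes
// through the negated path and encodes as {8, 0} for a CMN.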
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) return false; - // Doesn't actually carry any information, but keeps TableGen quiet. - Dummy = CurDAG->getTargetConstant(0, MVT::i32); - return true; + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + unsigned BitSize = N.getValueType().getSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); + return isWorthFolding(N); + } + + return false; +} + +/// getExtendTypeForNode - Translate an extend node to the corresponding +/// ExtendType value. +static AArch64_AM::ShiftExtendType +getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { + if (N.getOpcode() == ISD::SIGN_EXTEND || + N.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT SrcVT; + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) + SrcVT = cast(N.getOperand(1))->getVT(); + else + SrcVT = N.getOperand(0).getValueType(); + + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::SXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::SXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::SXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::ZERO_EXTEND || + N.getOpcode() == ISD::ANY_EXTEND) { + EVT SrcVT = N.getOperand(0).getValueType(); + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::UXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::UXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::UXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::AND) { + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = CSD->getZExtValue(); + + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } + } + + return AArch64_AM::InvalidShiftExtend; } -bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) { - uint32_t Bits; - uint32_t RegWidth = N.getValueType().getSizeInBits(); +// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. 
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { + if (DL->getOpcode() != AArch64ISD::DUPLANE16 && + DL->getOpcode() != AArch64ISD::DUPLANE32) + return false; - ConstantSDNode *CN = dyn_cast(N); - if (!CN) return false; + SDValue SV = DL->getOperand(0); + if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) + return false; - if (!A64Imms::isLogicalImm(RegWidth, CN->getZExtValue(), Bits)) + SDValue EV = SV.getOperand(1); + if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; - Imm = CurDAG->getTargetConstant(Bits, MVT::i32); + ConstantSDNode *DLidx = cast(DL->getOperand(1).getNode()); + ConstantSDNode *EVidx = cast(EV.getOperand(1).getNode()); + LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); + LaneOp = EV.getOperand(0); + return true; } -SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) { - SDNode *ResNode; - SDLoc dl(Node); - EVT DestType = Node->getValueType(0); - unsigned DestWidth = DestType.getSizeInBits(); - - unsigned MOVOpcode; - EVT MOVType; - int UImm16, Shift; - uint32_t LogicalBits; - - uint64_t BitPat = cast(Node)->getZExtValue(); - if (A64Imms::isMOVZImm(DestWidth, BitPat, UImm16, Shift)) { - MOVType = DestType; - MOVOpcode = DestWidth == 64 ? AArch64::MOVZxii : AArch64::MOVZwii; - } else if (A64Imms::isMOVNImm(DestWidth, BitPat, UImm16, Shift)) { - MOVType = DestType; - MOVOpcode = DestWidth == 64 ? AArch64::MOVNxii : AArch64::MOVNwii; - } else if (DestWidth == 64 && A64Imms::isMOVNImm(32, BitPat, UImm16, Shift)) { - // To get something like 0x0000_0000_ffff_1234 into a 64-bit register we can - // use a 32-bit instruction: "movn w0, 0xedbc". - MOVType = MVT::i32; - MOVOpcode = AArch64::MOVNwii; - } else if (A64Imms::isLogicalImm(DestWidth, BitPat, LogicalBits)) { - MOVOpcode = DestWidth == 64 ? AArch64::ORRxxi : AArch64::ORRwwi; - uint16_t ZR = DestWidth == 64 ? AArch64::XZR : AArch64::WZR; - - return CurDAG->getMachineNode(MOVOpcode, dl, DestType, - CurDAG->getRegister(ZR, DestType), - CurDAG->getTargetConstant(LogicalBits, MVT::i32)); - } else { - // Can't handle it in one instruction. There's scope for permitting two (or - // more) instructions, but that'll need more thought. - return nullptr; +// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// high lane extract. +static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, + SDValue &LaneOp, int &LaneIdx) { + + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { + std::swap(Op0, Op1); + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) + return false; + } + StdOp = Op1; + return true; +} + +/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand +/// is a lane in the upper half of a 128-bit vector. Recognize and select this +/// so that we don't emit unnecessary lane extracts. +SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. + SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. + int LaneIdx = -1; // Will hold the lane index. 
+ + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) { + std::swap(Op0, Op1); + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) + return nullptr; } - ResNode = CurDAG->getMachineNode(MOVOpcode, dl, MOVType, - CurDAG->getTargetConstant(UImm16, MVT::i32), - CurDAG->getTargetConstant(Shift, MVT::i32)); + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - if (MOVType != DestType) { - ResNode = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, - MVT::i64, MVT::i32, MVT::Other, - CurDAG->getTargetConstant(0, MVT::i64), - SDValue(ResNode, 0), - CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32)); + SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; + + unsigned MLAOpc = ~0U; + + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized MLA."); + case MVT::v4i16: + MLAOpc = AArch64::MLAv4i16_indexed; + break; + case MVT::v8i16: + MLAOpc = AArch64::MLAv8i16_indexed; + break; + case MVT::v2i32: + MLAOpc = AArch64::MLAv2i32_indexed; + break; + case MVT::v4i32: + MLAOpc = AArch64::MLAv4i32_indexed; + break; } - return ResNode; + return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); } -SDValue -AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL, - const Constant *CV) { - EVT PtrVT = getTargetLowering()->getPointerTy(); - - switch (getTargetLowering()->getTargetMachine().getCodeModel()) { - case CodeModel::Small: { - unsigned Alignment = - getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); - return CurDAG->getNode( - AArch64ISD::WrapperSmall, DL, PtrVT, - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_LO12), - CurDAG->getConstant(Alignment, MVT::i32)); - } - case CodeModel::Large: { - SDNode *LitAddr; - LitAddr = CurDAG->getMachineNode( - AArch64::MOVZxii, DL, PtrVT, - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3), - CurDAG->getTargetConstant(3, MVT::i32)); - LitAddr = CurDAG->getMachineNode( - AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC), - CurDAG->getTargetConstant(2, MVT::i32)); - LitAddr = CurDAG->getMachineNode( - AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC), - CurDAG->getTargetConstant(1, MVT::i32)); - LitAddr = CurDAG->getMachineNode( - AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC), - CurDAG->getTargetConstant(0, MVT::i32)); - return SDValue(LitAddr, 0); +SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { + SDValue SMULLOp0; + SDValue SMULLOp1; + int LaneIdx; + + if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, + LaneIdx)) + return nullptr; + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; + + unsigned SMULLOpc = ~0U; + + if (IntNo == Intrinsic::aarch64_neon_smull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::SMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::SMULLv2i32_indexed; + break; + } + } else if (IntNo == Intrinsic::aarch64_neon_umull) { + switch 
(N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::UMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::UMULLv2i32_indexed;
+      break;
+    }
+  } else
+    llvm_unreachable("Unrecognized intrinsic.");
+
+  return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+/// Instructions that accept extend modifiers like UXTW expect the register
+/// being extended to be a GPR32, but the incoming DAG might be acting on a
+/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
+/// this is the case.
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+  if (N.getValueType() == MVT::i32)
+    return N;
+
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                               SDLoc(N), MVT::i32, N, SubReg);
+  return SDValue(Node, 0);
+}
+
+
+/// SelectArithExtendedRegister - Select an "extended register" operand. This
+/// operand folds in an extend followed by an optional left shift.
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+                                                      SDValue &Shift) {
+  unsigned ShiftVal = 0;
+  AArch64_AM::ShiftExtendType Ext;
+
+  if (N.getOpcode() == ISD::SHL) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return false;
+    ShiftVal = CSD->getZExtValue();
+    if (ShiftVal > 4)
+      return false;
+
+    Ext = getExtendTypeForNode(N.getOperand(0));
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0).getOperand(0);
+  } else {
+    Ext = getExtendTypeForNode(N);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0);
+  }
+
+  // AArch64 mandates that the RHS of the operation must use the smallest
+  // register class that could contain the size being extended from. Thus,
+  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+  // there might not be an actual 32-bit value in the program. We can
+  // (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here.
+  assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
+  Reg = narrowIfNeeded(CurDAG, Reg);
+  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
+  return isWorthFolding(N);
+}
+
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
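// The two addressing-mode selectors that follow implement a simple numeric
// rule for "register + immediate" addresses: the scaled form takes an
// unsigned 12-bit offset measured in units of the access size, and the
// unscaled (LDUR/STUR-style) form takes a signed 9-bit byte offset, used only
// when the scaled form does not apply. A standalone sketch of that rule, with
// illustrative helper names that do not appear in the backend:
#include <cstdint>

// True iff Offset can use the scaled form: a non-negative multiple of the
// (power-of-two) access size whose scaled value fits in 12 bits.
static bool isScaledOffsetLegal(int64_t Offset, unsigned SizeInBytes) {
  return Offset >= 0 && Offset % SizeInBytes == 0 &&
         Offset / SizeInBytes < 4096;
}

// True iff Offset can use the unscaled form: any byte offset in [-256, 256).
static bool isUnscaledOffsetLegal(int64_t Offset) {
  return Offset >= -256 && Offset < 256;
}
// For an 8-byte access, offset 32760 (= 4095 * 8) is scaled-legal, offsets 17
// and -8 only fit the unscaled form, and offset 40000 forces the address to
// be materialized in a register first, as the "Base only" fallback below does.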
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, + SDValue &Base, SDValue &OffImm) { + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; + } + + if (N.getOpcode() == AArch64ISD::ADDlow) { + GlobalAddressSDNode *GAN = + dyn_cast(N.getOperand(1).getNode()); + Base = N.getOperand(0); + OffImm = N.getOperand(1); + if (!GAN) + return true; + + const GlobalValue *GV = GAN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + const DataLayout *DL = TLI->getDataLayout(); + if (Alignment == 0 && !Subtarget->isTargetDarwin()) + Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); + + if (Alignment >= Size) + return true; } + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); + return true; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) + return false; + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // ldr x0, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; } -SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { - SDLoc DL(Node); - uint64_t UnsignedVal = cast(Node)->getZExtValue(); - int64_t SignedVal = cast(Node)->getSExtValue(); - EVT DestType = Node->getValueType(0); - - // Since we may end up loading a 64-bit constant from a 32-bit entry the - // constant in the pool may have a different type to the eventual node. - ISD::LoadExtType Extension; - EVT MemType; - - assert((DestType == MVT::i64 || DestType == MVT::i32) - && "Only expect integer constants at the moment"); - - if (DestType == MVT::i32) { - Extension = ISD::NON_EXTLOAD; - MemType = MVT::i32; - } else if (UnsignedVal <= UINT32_MAX) { - Extension = ISD::ZEXTLOAD; - MemType = MVT::i32; - } else if (SignedVal >= INT32_MIN && SignedVal <= INT32_MAX) { - Extension = ISD::SEXTLOAD; - MemType = MVT::i32; - } else { - Extension = ISD::NON_EXTLOAD; - MemType = MVT::i64; - } - - Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(), - MemType.getSizeInBits()), - UnsignedVal); - SDValue PoolAddr = getConstantPoolItemAddress(DL, CV); - unsigned Alignment = - getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); - - return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(), - PoolAddr, - MachinePointerInfo::getConstantPool(), MemType, - /* isVolatile = */ false, - /* isNonTemporal = */ false, - Alignment).getNode(); +/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit +/// immediate" address. This should only match when there is an offset that +/// is not valid for a scaled immediate addressing mode. 
The "Size" argument +/// is the size in bytes of the memory reference, which is needed here to know +/// what is valid for a scaled immediate. +bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && + RHSC < (0x1000 << Log2_32(Size))) + return false; + if (RHSC >= -256 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + const TargetLowering *TLI = getTargetLowering(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); + return true; + } + } + return false; } -SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) { - SDLoc DL(Node); - const ConstantFP *FV = cast(Node)->getConstantFPValue(); - EVT DestType = Node->getValueType(0); - - unsigned Alignment = - getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType()); - SDValue PoolAddr = getConstantPoolItemAddress(DL, FV); - - return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /* isVolatile = */ false, - /* isNonTemporal = */ false, - /* isInvariant = */ true, - Alignment).getNode(); +static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + SDValue ImpDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), + 0); + MachineSDNode *Node = CurDAG->getMachineNode( + TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); + return SDValue(Node, 0); } -bool -AArch64DAGToDAGISel::SelectTSTBOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth) { - const ConstantSDNode *CN = dyn_cast(N); - if (!CN) return false; +/// \brief Check if the given SHL node (\p N), can be used to form an +/// extended register for an addressing mode. +bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, + bool WantExtend, SDValue &Offset, + SDValue &SignExtend) { + assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) + return false; - uint64_t Val = CN->getZExtValue(); + if (WantExtend) { + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForNode(N.getOperand(0), true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; - if (!isPowerOf2_64(Val)) return false; + Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + } else { + Offset = N.getOperand(0); + SignExtend = CurDAG->getTargetConstant(0, MVT::i32); + } - unsigned TestedBit = Log2_64(Val); - // Checks above should have guaranteed that we haven't lost information in - // finding TestedBit, but it must still be in range. 
- if (TestedBit >= RegWidth) return false; + unsigned LegalShiftVal = Log2_32(Size); + unsigned ShiftVal = CSD->getZExtValue(); - FixedPos = CurDAG->getTargetConstant(TestedBit, MVT::i64); - return true; + if (ShiftVal != 0 && ShiftVal != LegalShiftVal) + return false; + + if (isWorthFolding(N)) + return true; + + return false; } -SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8, - unsigned Op16,unsigned Op32, - unsigned Op64) { - // Mostly direct translation to the given operations, except that we preserve - // the AtomicOrdering for use later on. - AtomicSDNode *AN = cast(Node); - EVT VT = AN->getMemoryVT(); - - unsigned Op; - if (VT == MVT::i8) - Op = Op8; - else if (VT == MVT::i16) - Op = Op16; - else if (VT == MVT::i32) - Op = Op32; - else if (VT == MVT::i64) - Op = Op64; - else - llvm_unreachable("Unexpected atomic operation"); +bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); - SmallVector Ops; - for (unsigned i = 1; i < AN->getNumOperands(); ++i) - Ops.push_back(AN->getOperand(i)); + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // There was no shift, whatever else we find. + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + + AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend; + // Try to match an unshifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(LHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = RHS; + Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(LHS)) + return true; + } + + // Try to match an unshifted extend on the RHS. 
+ if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(RHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = LHS; + Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(RHS)) + return true; + } - Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32)); - Ops.push_back(AN->getOperand(0)); // Chain moves to the end + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } - return CurDAG->SelectNodeTo(Node, Op, AN->getValueType(0), MVT::Other, Ops); + // Match any non-shifted, non-extend, non-immediate add expression. + Base = LHS; + Offset = RHS; + SignExtend = CurDAG->getTargetConstant(false, MVT::i32); + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + // Reg1 + Reg2 is free: no check needed. 
+ return true; } SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { AArch64::DPairRegClassID, - AArch64::DTripleRegClassID, - AArch64::DQuadRegClassID }; - static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1, - AArch64::dsub_2, AArch64::dsub_3 }; + static unsigned RegClassIDs[] = { + AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; + static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; return createTuple(Regs, RegClassIDs, SubRegs); } SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { AArch64::QPairRegClassID, - AArch64::QTripleRegClassID, - AArch64::QQuadRegClassID }; - static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1, - AArch64::qsub_2, AArch64::qsub_3 }; + static unsigned RegClassIDs[] = { + AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; + static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; return createTuple(Regs, RegClassIDs, SubRegs); } @@ -477,1100 +874,2159 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, return SDValue(N, 0); } +SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, + unsigned Opc, bool isExt) { + SDLoc dl(N); + EVT VT = N->getValueType(0); -// Get the register stride update opcode of a VLD/VST instruction that -// is otherwise equivalent to the given fixed stride updating instruction. -static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { - switch (Opc) { - default: break; - case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register; - case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register; - case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register; - case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register; - case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register; - case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register; - case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register; - case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register; - - case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register; - case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register; - case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register; - case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register; - case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register; - case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register; - case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register; - - case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register; - case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register; - case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register; - case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register; - case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register; - case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register; - case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register; - - case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register; - case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register; - case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register; - case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register; - case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register; - case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register; - case AArch64::LD4WB_2D_fixed: return 
AArch64::LD4WB_2D_register; - - case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register; - case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register; - case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register; - case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register; - case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register; - case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register; - case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register; - case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register; - - case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register; - case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register; - case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register; - case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register; - case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register; - case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register; - case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register; - case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register; - - case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register; - case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register; - case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register; - case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register; - case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register; - case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register; - case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register; - case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register; - - case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register; - case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register; - case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register; - case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register; - case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register; - case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register; - case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register; - case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register; - - case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register; - case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register; - case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register; - case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register; - case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register; - case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register; - case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register; - - case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register; - case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register; - case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register; - case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register; - case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register; - case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register; - case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register; - - case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register; - case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register; - case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register; - case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register; - case AArch64::ST4WB_8H_fixed: return 
AArch64::ST4WB_8H_register; - case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register; - case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register; - - case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register; - case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register; - case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register; - case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register; - case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register; - case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register; - case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register; - case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register; - - case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register; - case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register; - case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register; - case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register; - case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register; - case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register; - case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register; - case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register; - - case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register; - case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register; - case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register; - case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register; - case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register; - case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register; - case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register; - case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register; - - // Post-index of duplicate loads - case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register; - case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register; - case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register; - case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register; - case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register; - case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register; - case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register; - case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register; - - case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register; - case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register; - case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register; - case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register; - case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register; - case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register; - case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register; - case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register; - - case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register; - case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register; - case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register; - case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register; - case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register; - case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register; - case AArch64::LD4R_WB_4S_fixed: return 
AArch64::LD4R_WB_4S_register; - case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register; - - // Post-index of lane loads - case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register; - case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register; - case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register; - case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register; - - case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register; - case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register; - case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register; - case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register; - - case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register; - case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register; - case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register; - case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register; - - // Post-index of lane stores - case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register; - case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register; - case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register; - case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register; - - case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register; - case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register; - case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register; - case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register; - - case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register; - case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register; - case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register; - case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register; - } - return Opc; // If not one we handle, return it unchanged. -} - -SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + unsigned ExtOff = isExt; - EVT VT = N->getValueType(0); - unsigned OpcodeIndex; - bool is64BitVector = VT.is64BitVector(); - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; - case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; - case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; - case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; - default: llvm_unreachable("unhandled vector load type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; + // Form a REG_SEQUENCE to force register allocation. + unsigned Vec0Off = ExtOff + 1; + SmallVector Regs(N->op_begin() + Vec0Off, + N->op_begin() + Vec0Off + NumVecs); + SDValue RegSeq = createQTuple(Regs); - SmallVector Ops; - unsigned AddrOpIdx = isUpdating ? 
1 : 2; - Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + SmallVector Ops; + if (isExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} - if (isUpdating) { - SDValue Inc = N->getOperand(AddrOpIdx + 1); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); +SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { + LoadSDNode *LD = cast(N); + if (LD->isUnindexed()) + return nullptr; + EVT VT = LD->getMemoryVT(); + EVT DstVT = N->getValueType(0); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; + + // We're not doing validity checking here. That was done when checking + // if we should mark the load as indexed or not. We're just selecting + // the right instruction. + unsigned Opcode = 0; + + ISD::LoadExtType ExtType = LD->getExtensionType(); + bool InsertTo64 = false; + if (VT == MVT::i64) + Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost; + else if (VT == MVT::i32) { + if (ExtType == ISD::NON_EXTLOAD) + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + else if (ExtType == ISD::SEXTLOAD) + Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; + else { + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + InsertTo64 = true; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i16) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; + else + Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; + } else { + Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i8) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; + else + Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; + } else { + Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::f32) { + Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; + } else if (VT == MVT::f64 || VT.is64BitVector()) { + Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost; + } else if (VT.is128BitVector()) { + Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; + } else + return nullptr; + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + ConstantSDNode *OffsetOp = cast(LD->getOffset()); + int OffsetVal = (int)OffsetOp->getZExtValue(); + SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); + SDValue Ops[] = { Base, Offset, Chain }; + SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT, + MVT::Other, Ops); + // Either way, we're replacing the node, so tell the caller that. 
+ Done = true; + SDValue LoadedVal = SDValue(Res, 1); + if (InsertTo64) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + LoadedVal = + SDValue(CurDAG->getMachineNode( + AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg), + 0); } - Ops.push_back(N->getOperand(0)); // Push back the Chain + ReplaceUses(SDValue(N, 0), LoadedVal); + ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); + ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - SmallVector ResTys; - // Push back the type of return super register - if (NumVecs == 1) - ResTys.push_back(VT); - else if (NumVecs == 3) - ResTys.push_back(MVT::Untyped); - else { - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, - is64BitVector ? NumVecs : NumVecs * 2); - ResTys.push_back(ResTy); - } - - if (isUpdating) - ResTys.push_back(MVT::i64); // Type of the updated register - ResTys.push_back(MVT::Other); // Type of the Chain - SDLoc dl(N); - SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + return nullptr; +} - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(VLd)->setMemRefs(MemOp, MemOp + 1); +SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); - if (NumVecs == 1) - return VLd; + SmallVector Ops; + Ops.push_back(N->getOperand(2)); // Mem operand; + Ops.push_back(Chain); - // If NumVecs > 1, the return result is a super register containing 2-4 - // consecutive vector registers. - SDValue SuperReg = SDValue(VLd, 0); + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); - unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); - // Update users of the Chain - ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); - if (isUpdating) - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); return nullptr; } -SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); +SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Mem operand + Ops.push_back(N->getOperand(2)); // Incremental + Ops.push_back(Chain); - unsigned AddrOpIdx = isUpdating ? 1 : 2; - unsigned Vec0Idx = 3; - EVT VT = N->getOperand(Vec0Idx).getValueType(); - unsigned OpcodeIndex; - bool is64BitVector = VT.is64BitVector(); - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; - case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; - case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; - case 64: OpcodeIndex = is64BitVector ? 
3 : 7; break; - default: llvm_unreachable("unhandled vector store type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) + ReplaceUses(SDValue(N, 0), SuperReg); + else + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + // Update the chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 2)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + return St; +} +SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); SmallVector ResTys; - if (isUpdating) - ResTys.push_back(MVT::i64); + ResTys.push_back(MVT::i64); // Type of the write back register ResTys.push_back(MVT::Other); // Type for the Chain + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + SmallVector Ops; - Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 1)); // base register + Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental + Ops.push_back(N->getOperand(0)); // Chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + return St; +} + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +class WidenVector { + SelectionDAG &DAG; - if (isUpdating) { - SDValue Inc = N->getOperand(AddrOpIdx + 1); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); +public: + WidenVector(SelectionDAG &DAG) : DAG(DAG) {} + + SDValue operator()(SDValue V64Reg) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + SDValue Undef = + SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); + return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg); } +}; + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. 
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + + return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy, + V128Reg); +} + +SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); - SmallVector Regs(N->op_begin() + Vec0Idx, - N->op_begin() + Vec0Idx + NumVecs); - SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs); - Ops.push_back(SrcReg); + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); - // Push back the Chain + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } - // Transfer memoperands. - SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - cast(VSt)->setMemRefs(MemOp, MemOp + 1); + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return VSt; + return Ld; } -SDValue -AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, - SDValue Operand) { - SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL, - VT, VTD, MVT::Other, - CurDAG->getTargetConstant(0, MVT::i64), - Operand, - CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32)); - return SDValue(Reg, 0); +SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
+ SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number + Ops.push_back(N->getOperand(NumVecs + 2)); // Base register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of the write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of the vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) { + ReplaceUses(SDValue(N, 0), + Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); + } else { + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, + SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + } + + // Update the Chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + + return Ld; } -SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range"); +SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); - EVT VT = N->getValueType(0); - unsigned OpcodeIndex; - bool is64BitVector = VT.is64BitVector(); - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; - case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; - case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; - case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; - default: llvm_unreachable("unhandled vector duplicate lane load type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; - - SDValue SuperReg; SmallVector Ops; - Ops.push_back(N->getOperand(1)); // Push back the Memory Address - if (isUpdating) { - SDValue Inc = N->getOperand(2); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); - } - Ops.push_back(N->getOperand(0)); // Push back the Chain - - SmallVector ResTys; - // Push back the type of return super register - if (NumVecs == 3) - ResTys.push_back(MVT::Untyped); - else { - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, - is64BitVector ? 
NumVecs : NumVecs * 2); - ResTys.push_back(ResTy); - } - if (isUpdating) - ResTys.push_back(MVT::i64); // Type of the updated register - ResTys.push_back(MVT::Other); // Type of the Chain - SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); - cast(VLdDup)->setMemRefs(MemOp, MemOp + 1); - - SuperReg = SDValue(VLdDup, 0); - unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; - // Update uses of each registers in super register - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); - // Update uses of the Chain - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); - if (isUpdating) - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); - return nullptr; + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; } -// We only have 128-bit vector type of load/store lane instructions. -// If it is 64-bit vector, we also select it to the 128-bit instructions. -// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and -// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output. -SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, - bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); +SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); - unsigned AddrOpIdx = isUpdating ? 1 : 2; - unsigned Vec0Idx = 3; + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; - SDValue Chain = N->getOperand(0); - unsigned Lane = - cast(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); - EVT VT = N->getOperand(Vec0Idx).getValueType(); - bool is64BitVector = VT.is64BitVector(); - EVT VT64; // 64-bit Vector Type - - if (is64BitVector) { - VT64 = VT; - VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - } - - unsigned OpcodeIndex; - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = 0; break; - case 16: OpcodeIndex = 1; break; - case 32: OpcodeIndex = 2; break; - case 64: OpcodeIndex = 3; break; - default: llvm_unreachable("unhandled vector lane load/store type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; - - SmallVector ResTys; - if (IsLoad) { - // Push back the type of return super register - if (NumVecs == 3) - ResTys.push_back(MVT::Untyped); - else { - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, - is64BitVector ? 
NumVecs : NumVecs * 2); - ResTys.push_back(ResTy); - } - } - if (isUpdating) - ResTys.push_back(MVT::i64); // Type of the updated register - ResTys.push_back(MVT::Other); // Type of Chain - SmallVector Ops; - Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address - if (isUpdating) { - SDValue Inc = N->getOperand(AddrOpIdx + 1); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); - } - - SmallVector Regs(N->op_begin() + Vec0Idx, - N->op_begin() + Vec0Idx + NumVecs); - if (is64BitVector) - for (unsigned i = 0; i < Regs.size(); i++) - Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]); - SDValue SuperReg = createQTuple(Regs); - - Ops.push_back(SuperReg); // Source Reg - SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32); - Ops.push_back(LaneValue); - Ops.push_back(Chain); // Push back the Chain - - SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); - cast(VLdLn)->setMemRefs(MemOp, MemOp + 1); - if (!IsLoad) - return VLdLn; - - // Extract the subregisters. - SuperReg = SDValue(VLdLn, 0); - unsigned Sub0 = AArch64::qsub_0; - // Update uses of each registers in super register - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg); - if (is64BitVector) { - SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0); - } - ReplaceUses(SDValue(N, Vec), SUB0); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, + unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits, + bool BiggerPattern) { + assert(N->getOpcode() == ISD::AND && + "N must be a AND operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // FIXME: simplify-demanded-bits in DAGCombine will probably have + // changed the AND node to a 32-bit mask operation. We'll have to + // undo that as part of the transform here if we want to catch all + // the opportunities. + // Currently the NumberOfIgnoredLowBits argument helps to recover + // form these situations when matching bigger pattern (bitfield insert). 
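
The matching that follows turns (and (srl X, c), mask) into a single UBFM whenever the mask is a run of low ones. As a standalone illustration (separate from the patch, using plain integers in place of the SelectionDAG types and ignoring the ANY_EXTEND/TRUNCATE and ClampMSB refinements), the immediate computation reduces to:

#include <cassert>
#include <cstdint>

// Stand-in for CountTrailingOnes_64: length of the run of low one bits.
static unsigned countTrailingOnes(uint64_t V) {
  unsigned N = 0;
  while (V & 1) { V >>= 1; ++N; }
  return N;
}

// For (and (srl X, ShiftImm), Mask) with Mask a run of low ones, the UBFM
// immediates are LSB = ShiftImm and MSB = ShiftImm + (ones in Mask) - 1.
static bool computeUBFMImms(uint64_t Mask, unsigned ShiftImm, unsigned RegWidth,
                            unsigned &LSB, unsigned &MSB) {
  // Mask is a mask of the low bits iff Mask & (Mask + 1) == 0.
  if (Mask == 0 || (Mask & (Mask + 1)) != 0)
    return false;
  LSB = ShiftImm;
  MSB = ShiftImm + countTrailingOnes(Mask) - 1;
  return MSB < RegWidth;
}

int main() {
  unsigned LSB, MSB;
  // (X >> 4) & 0xff  ==>  UBFM X, #4, #11, i.e. extract bits [11:4].
  bool OK = computeUBFMImms(0xff, 4, /*RegWidth=*/64, LSB, MSB);
  assert(OK && LSB == 4 && MSB == 11);
  return 0;
}
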
+ + // For unsigned extracts, check for a shift right and mask + uint64_t And_imm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + return false; + + const SDNode *Op0 = N->getOperand(0).getNode(); + + // Because of simplify-demanded-bits in DAGCombine, the mask may have been + // simplified. Try to undo that + And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return false; + + bool ClampMSB = false; + uint64_t Srl_imm = 0; + // Handle the SRL + ANY_EXTEND case. + if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + // Extend the incoming operand of the SRL to 64-bit. + Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); + // Make sure to clamp the MSB so that we preserve the semantics of the + // original operations. + ClampMSB = true; + } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + // If the shift result was truncated, we can still combine them. + Opd0 = Op0->getOperand(0).getOperand(0); + + // Use the type of SRL node. + VT = Opd0->getValueType(0); + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + Opd0 = Op0->getOperand(0); + } else if (BiggerPattern) { + // Let's pretend a 0 shift right has been performed. + // The resulting code will be at least as good as the original one + // plus it may expose more opportunities for bitfield insert pattern. + // FIXME: Currently we limit this to the bigger pattern, because + // some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && + "bad amount in shift node!"); + + LSB = Srl_imm; + MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) + : CountTrailingOnes_64(And_imm)) - + 1; + if (ClampMSB) + // Since we're moving the extend before the right shift operation, we need + // to clamp the MSB to make sure we don't shift in undefined bits instead of + // the zeros which would get shifted in with the original right shift + // operation. + MSB = MSB > 31 ? 31 : MSB; + + Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri; + return true; +} + +static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB) { + // We are looking for the following pattern which basically extracts a single + // bit from the source value and places it in the LSB of the destination + // value, all other bits of the destination value or set to zero: + // + // Value2 = AND Value, MaskImm + // SRL Value2, ShiftImm + // + // with MaskImm >> ShiftImm == 1. + // + // This gets selected into a single UBFM: + // + // UBFM Value, ShiftImm, ShiftImm + // + + if (N->getOpcode() != ISD::SRL) + return false; + + uint64_t And_mask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + return false; + + Opd0 = N->getOperand(0).getOperand(0); + + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + // Check whether we really have a one bit extract here. 
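
The one-bit-extract test that follows can be stated in isolation; a minimal sketch on plain integers (names are illustrative only): a shift of a masked value extracts a single bit exactly when the mask collapses to 1 after the shift, which then selects to UBFM X, Shift, Shift.

#include <cassert>
#include <cstdint>

// (srl (and X, Mask), Shift) extracts a single bit exactly when
// Mask >> Shift == 1; it can then be selected as UBFM X, Shift, Shift.
static bool isOneBitExtract(uint64_t Mask, unsigned Shift) {
  return (Mask >> Shift) == 0x1;
}

int main() {
  assert(isOneBitExtract(0x80, 7));  // (X & 0x80) >> 7  ->  UBFM X, #7, #7
  assert(!isOneBitExtract(0xc0, 6)); // two bits survive the shift
  return 0;
}
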
+ if (And_mask >> Srl_imm == 0x1) { + if (N->getValueType(0) == MVT::i32) + Opc = AArch64::UBFMWri; + else + Opc = AArch64::UBFMXri; + + LSB = MSB = Srl_imm; + + return true; } - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); - if (isUpdating) - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); - return nullptr; + + return false; +} + +static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + bool BiggerPattern) { + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "N must be a SHR/SRA operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // Check for AND + SRL doing a one bit extract. + if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + return true; + + // we're looking for a shift of a shift + uint64_t Shl_imm = 0; + uint64_t Trunc_bits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + Opd0 = N->getOperand(0).getOperand(0); + } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && + N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { + // We are looking for a shift of truncate. Truncate from i64 to i32 could + // be considered as setting high 32 bits as zero. Our strategy here is to + // always generate 64bit UBFM. This consistency will help the CSE pass + // later find more redundancy. + Opd0 = N->getOperand(0).getOperand(0); + Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + VT = Opd0->getValueType(0); + assert(VT == MVT::i64 && "the promoted type should be i64"); + } else if (BiggerPattern) { + // Let's pretend a 0 shift left has been performed. + // FIXME: Currently we limit this to the bigger pattern case, + // because some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + "bad amount in shift node!"); + // Note: The width operand is encoded as width-1. + unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; + int sLSB = Srl_imm - Shl_imm; + if (sLSB < 0) + return false; + LSB = sLSB; + MSB = LSB + Width; + // SRA requires a signed extraction + if (VT == MVT::i32) + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; + else + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri; + return true; } -unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit, - unsigned NumOfVec) { - assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range"); +static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits = 0, + bool BiggerPattern = false) { + if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) + return false; - unsigned Opc = 0; - switch (NumOfVec) { + switch (N->getOpcode()) { default: + if (!N->isMachineOpcode()) + return false; break; - case 1: - if (IsExt) - Opc = Is64Bit ? 
AArch64::TBX1_8b : AArch64::TBX1_16b; + case ISD::AND: + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, + NumberOfIgnoredLowBits, BiggerPattern); + case ISD::SRL: + case ISD::SRA: + return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); + } + + unsigned NOpc = N->getMachineOpcode(); + switch (NOpc) { + default: + return false; + case AArch64::SBFMWri: + case AArch64::UBFMWri: + case AArch64::SBFMXri: + case AArch64::UBFMXri: + Opc = NOpc; + Opd0 = N->getOperand(0); + LSB = cast(N->getOperand(1).getNode())->getZExtValue(); + MSB = cast(N->getOperand(2).getNode())->getZExtValue(); + return true; + } + // Unreachable + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { + unsigned Opc, LSB, MSB; + SDValue Opd0; + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) + return nullptr; + + EVT VT = N->getValueType(0); + + // If the bit extract operation is 64bit but the original type is 32bit, we + // need to add one EXTRACT_SUBREG. + if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), + CurDAG->getTargetConstant(MSB, MVT::i64)}; + + SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, + SDValue(BFM, 0), SubReg); + return Node; + } + + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +/// Does DstMask form a complementary pair with the mask provided by +/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, +/// this asks whether DstMask zeroes precisely those bits that will be set by +/// the other half. +static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, + unsigned NumberOfIgnoredHighBits, EVT VT) { + assert((VT == MVT::i32 || VT == MVT::i64) && + "i32 or i64 mask type expected!"); + unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; + + APInt SignificantDstMask = APInt(BitWidth, DstMask); + APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); + + return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && + (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); +} + +// Look for bits that will be useful for later uses. +// A bit is consider useless as soon as it is dropped and never used +// before it as been dropped. +// E.g., looking for useful bit of x +// 1. y = x & 0x7 +// 2. z = y >> 2 +// After #1, x useful bits are 0x7, then the useful bits of x, live through +// y. +// After #2, the useful bits of x are 0x4. +// However, if x is used on an unpredicatable instruction, then all its bits +// are useful. +// E.g. +// 1. y = x & 0x7 +// 2. z = y >> 2 +// 3. 
str x, [@x] +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); + +static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); + UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); + getUsefulBits(Op, UsefulBits, Depth + 1); +} + +static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, + uint64_t Imm, uint64_t MSB, + unsigned Depth) { + // inherit the bitwidth value + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + // The interesting part will be in the lower part of the result + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was starting at Imm in the argument + OpUsefulBits = OpUsefulBits.shl(Imm); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + // The interesting part will be shifted in the result + OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was at zero in the argument + OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + } + + UsefulBits &= OpUsefulBits; +} + +static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(2).getNode())->getZExtValue(); + + getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); +} + +static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t ShiftTypeAndValue = + cast(Op.getOperand(2).getNode())->getZExtValue(); + APInt Mask(UsefulBits); + Mask.clearAllBits(); + Mask.flipAllBits(); + + if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) { + // Shift Left + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.shl(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.lshr(ShiftAmt); + } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { + // Shift Right + // We do not handle AArch64_AM::ASR, because the sign will change the + // number of useful bits + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.lshr(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.shl(ShiftAmt); + } else + return; + + UsefulBits &= Mask; +} + +static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(2).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(3).getNode())->getZExtValue(); + + if (Op.getOperand(1) == Orig) + return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); + + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + UsefulBits &= ~OpUsefulBits; + getUsefulBits(Op, UsefulBits, Depth + 1); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); + getUsefulBits(Op, UsefulBits, Depth + 1); + } +} + +static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, + SDValue Orig, unsigned Depth) { + + // Users of this node should have already been instruction selected + // FIXME: Can we turn that into 
an assert? + if (!UserNode->isMachineOpcode()) + return; + + switch (UserNode->getMachineOpcode()) { + default: + return; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + // We increment Depth only when we call the getUsefulBits + return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::UBFMWri: + case AArch64::UBFMXri: + return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); + + case AArch64::ORRWrs: + case AArch64::ORRXrs: + if (UserNode->getOperand(1) != Orig) + return; + return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::BFMWri: + case AArch64::BFMXri: + return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + } +} + +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { + if (Depth >= 6) + return; + // Initialize UsefulBits + if (!Depth) { + unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); + // At the beginning, assume every produced bits is useful + UsefulBits = APInt(Bitwidth, 0); + UsefulBits.flipAllBits(); + } + APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); + + for (SDNode *Node : Op.getNode()->uses()) { + // A use cannot produce useful bits + APInt UsefulBitsForUse = APInt(UsefulBits); + getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); + UsersUsefulBits |= UsefulBitsForUse; + } + // UsefulBits contains the produced bits that are meaningful for the + // current definition, thus a user cannot make a bit meaningful at + // this point + UsefulBits &= UsersUsefulBits; +} + +/// Create a machine node performing a notional SHL of Op by ShlAmount. If +/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is +/// 0, return Op unchanged. +static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { + if (ShlAmount == 0) + return Op; + + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri; + + SDNode *ShiftNode; + if (ShlAmount > 0) { + // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, + CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); + } else { + // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 + assert(ShlAmount < 0 && "expected right shift"); + int ShrAmount = -ShlAmount; + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1, VT)); + } + + return SDValue(ShiftNode, 0); +} + +/// Does this tree qualify as an attempt to move a bitfield into position, +/// essentially "(and (shl VAL, N), Mask)". +static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + SDValue &Src, int &ShiftAmount, + int &MaskWidth) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + (void)BitWidth; + assert(BitWidth == 32 || BitWidth == 64); + + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(Op, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value + uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + + // Discard a constant AND mask if present. It's safe because the node will + // already have been factored into the computeKnownBits calculation above. 
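
The LSL/LSR-as-UBFM equivalences quoted in getLeftShift's comments can be checked on their own; a small sketch with illustrative names (not LLVM APIs), where a negative ShlAmount means a logical right shift, following the same convention as getLeftShift:

#include <cassert>
#include <utility>

// (ImmR, ImmS) of the UBFM that implements a plain shift, as quoted above:
//   LSL rD, rN, #Amt == UBFM rD, rN, #(Width-Amt), #(Width-1-Amt)
//   LSR rD, rN, #Amt == UBFM rD, rN, #Amt,         #(Width-1)
static std::pair<unsigned, unsigned> shiftAsUBFM(int ShlAmount, unsigned Width) {
  if (ShlAmount > 0)
    return {Width - ShlAmount, Width - 1 - ShlAmount};
  return {unsigned(-ShlAmount), Width - 1};
}

int main() {
  assert(shiftAsUBFM(3, 32) == std::make_pair(29u, 28u));  // LSL wD, wN, #3
  assert(shiftAsUBFM(-5, 64) == std::make_pair(5u, 63u));  // LSR xD, xN, #5
  return 0;
}
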
+ uint64_t AndImm; + if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { + assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + Op = Op.getOperand(0); + } + + uint64_t ShlImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) + return false; + Op = Op.getOperand(0); + + if (!isShiftedMask_64(NonZeroBits)) + return false; + + ShiftAmount = countTrailingZeros(NonZeroBits); + MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); + + // BFI encompasses sufficiently many nodes that it's worth inserting an extra + // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL + // amount. + Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); + + return true; +} + +// Given a OR operation, check if we have the following pattern +// ubfm c, b, imm, imm2 (or something that does the same jobs, see +// isBitfieldExtractOp) +// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and +// countTrailingZeros(mask2) == imm2 - imm + 1 +// f = d | c +// if yes, given reference arguments will be update so that one can replace +// the OR instruction with: +// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 +static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, + SDValue &Src, unsigned &ImmR, + unsigned &ImmS, SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + // Set Opc + EVT VT = N->getValueType(0); + if (VT == MVT::i32) + Opc = AArch64::BFMWri; + else if (VT == MVT::i64) + Opc = AArch64::BFMXri; + else + return false; + + // Because of simplify-demanded-bits in DAGCombine, involved masks may not + // have the expected shape. Try to undo that. + APInt UsefulBits; + getUsefulBits(SDValue(N, 0), UsefulBits); + + unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); + unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + + // OR is commutative, check both possibilities (does llvm provide a + // way to do that directely, e.g., via code matcher?) + SDValue OrOpd1Val = N->getOperand(1); + SDNode *OrOpd0 = N->getOperand(0).getNode(); + SDNode *OrOpd1 = N->getOperand(1).getNode(); + for (int i = 0; i < 2; + ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + unsigned BFXOpc; + int DstLSB, Width; + if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, + NumberOfIgnoredLowBits, true)) { + // Check that the returned opcode is compatible with the pattern, + // i.e., same type and zero extended (U and not S) + if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || + (BFXOpc != AArch64::UBFMWri && VT == MVT::i32)) + continue; + + // Compute the width of the bitfield insertion + DstLSB = 0; + Width = ImmS - ImmR + 1; + // FIXME: This constraint is to catch bitfield insertion we may + // want to widen the pattern if we want to grab general bitfied + // move case + if (Width <= 0) + continue; + + // If the mask on the insertee is correct, we have a BFXIL operation. We + // can share the ImmR and ImmS values from the already-computed UBFM. + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, + DstLSB, Width)) { + ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmS = Width - 1; + } else + continue; + + // Check the second part of the pattern + EVT VT = OrOpd1->getValueType(0); + assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); + + // Compute the Known Zero for the candidate of the first operand. 
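
isBitfieldDstMask above asks whether the destination mask and the bits being inserted form an exact partition of the register. A self-contained restatement on plain integers (names are illustrative, and the ignored-high-bits trimming is folded into the Width parameter):

#include <cassert>
#include <cstdint>

// The destination mask pairs with the bits to be inserted when the two are
// disjoint and together cover the whole (possibly truncated) register,
// which is what isBitfieldDstMask checks with APInt.
static bool isComplementaryDstMask(uint64_t DstMask, uint64_t InsertedBits,
                                   unsigned Width) {
  uint64_t Full = Width == 64 ? ~0ULL : ((1ULL << Width) - 1);
  DstMask &= Full;
  InsertedBits &= Full;
  return (DstMask & InsertedBits) == 0 && (DstMask | InsertedBits) == Full;
}

int main() {
  // Inserting bits [15:8] of a 32-bit value: 0xFFFF00FF is the exact complement.
  assert(isComplementaryDstMask(0xFFFF00FF, 0x0000FF00, 32));
  assert(!isComplementaryDstMask(0xFFFF000F, 0x0000FF00, 32)); // leaves a hole
  return 0;
}
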
+ // This allows to catch more general case than just looking for + // AND with imm. Indeed, simplify-demanded-bits may have removed + // the AND instruction because it proves it was useless. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); + + // Check if there is enough room for the second operand to appear + // in the first one + APInt BitsToBeInserted = + APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + + if ((BitsToBeInserted & ~KnownZero) != 0) + continue; + + // Set the first operand + uint64_t Imm; + if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && + isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) + // In that case, we can eliminate the AND + Dst = OrOpd1->getOperand(0); else - Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b; + // Maybe the AND has been removed by simplify-demanded-bits + // or is useful because it discards more bits + Dst = OrOpd1Val; + + // both parts match + return true; + } + + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { + if (N->getOpcode() != ISD::OR) + return nullptr; + + unsigned Opc; + unsigned LSB, MSB; + SDValue Opd0, Opd1; + + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + return nullptr; + + EVT VT = N->getValueType(0); + SDValue Ops[] = { Opd0, + Opd1, + CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT) }; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { + EVT VT = N->getValueType(0); + unsigned Variant; + unsigned Opc; + unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; + + if (VT == MVT::f32) { + Variant = 0; + } else if (VT == MVT::f64) { + Variant = 1; + } else + return nullptr; // Unrecognized argument type. Fall back on default codegen. + + // Pick the FRINTX variant needed to set the flags. + unsigned FRINTXOpc = FRINTXOpcs[Variant]; + + switch (N->getOpcode()) { + default: + return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. + case ISD::FCEIL: { + unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; + Opc = FRINTPOpcs[Variant]; break; - case 2: - if (IsExt) - Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b; - else - Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b; + } + case ISD::FFLOOR: { + unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; + Opc = FRINTMOpcs[Variant]; break; - case 3: - if (IsExt) - Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b; - else - Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b; + } + case ISD::FTRUNC: { + unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; + Opc = FRINTZOpcs[Variant]; break; - case 4: - if (IsExt) - Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b; - else - Opc = Is64Bit ? 
AArch64::TBL4_8b : AArch64::TBL4_16b; + } + case ISD::FROUND: { + unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; + Opc = FRINTAOpcs[Variant]; break; } + } + + SDLoc dl(N); + SDValue In = N->getOperand(0); + SmallVector Ops; + Ops.push_back(In); + + if (!TM.Options.UnsafeFPMath) { + SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); + Ops.push_back(SDValue(FRINTX, 1)); + } - return Opc; + return CurDAG->getMachineNode(Opc, dl, VT, Ops); } -SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs, - bool IsExt) { - assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); - SDLoc dl(N); +bool +AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + APFloat FVal(0.0); + if (ConstantFPSDNode *CN = dyn_cast(N)) + FVal = CN->getValueAPF(); + else if (LoadSDNode *LN = dyn_cast(N)) { + // Some otherwise illegal constants are allowed in this case. + if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow || + !isa(LN->getOperand(1)->getOperand(1))) + return false; + + ConstantPoolSDNode *CN = + dyn_cast(LN->getOperand(1)->getOperand(1)); + FVal = cast(CN->getConstVal())->getValueAPF(); + } else + return false; + + // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits + // is between 1 and 32 for a destination w-register, or 1 and 64 for an + // x-register. + // + // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we + // want THIS_NODE to be 2^fbits. This is much easier to deal with using + // integers. + bool IsExact; - // Check the element of look up table is 64-bit or not - unsigned Vec0Idx = IsExt ? 2 : 1; - assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() && - "The element of lookup table for vtbl and vtbx must be 128-bit"); + // fbits is between 1 and 64 in the worst-case, which means the fmul + // could have 2^64 as an actual operand. Need 65 bits of precision. + APSInt IntVal(65, true); + FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); - // Check the return value type is 64-bit or not - EVT ResVT = N->getValueType(0); - bool is64BitRes = ResVT.is64BitVector(); + // N.b. isPowerOf2 also checks for > 0. + if (!IsExact || !IntVal.isPowerOf2()) return false; + unsigned FBits = IntVal.logBase2(); - // Create new SDValue for vector list - SmallVector Regs(N->op_begin() + Vec0Idx, - N->op_begin() + Vec0Idx + NumVecs); - SDValue TblReg = createQTuple(Regs); - unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs); + // Checks above should have guaranteed that we haven't lost information in + // finding FBits, but it must still be in range. + if (FBits == 0 || FBits > RegWidth) return false; - SmallVector Ops; - if (IsExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(TblReg); - Ops.push_back(N->getOperand(Vec0Idx + NumVecs)); - return CurDAG->getMachineNode(Opc, dl, ResVT, Ops); + FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32); + return true; } SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected - DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n"); + DEBUG(errs() << "Selecting: "); + DEBUG(Node->dump(CurDAG)); + DEBUG(errs() << "\n"); + // If we have a custom node, we already have selected! 
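
SelectCVTFixedPosOperand above only accepts multipliers that are an exact power of two with 1 <= fbits <= RegWidth, so that (fp_to_int (fmul Val, 2^fbits)) can become a fixed-point FCVTZ[SU] #fbits. A simplified model of that check, using double in place of APFloat/APSInt and ignoring the constant-pool-load case:

#include <cassert>
#include <cmath>

// (fp_to_int (fmul Val, C)) can become a fixed-point FCVTZ[SU] #fbits only
// when C is exactly 2^fbits with 1 <= fbits <= RegWidth.
static bool getFixedPointBits(double C, unsigned RegWidth, unsigned &FBits) {
  if (C <= 0.0)
    return false;
  int Exp;
  double Mant = std::frexp(C, &Exp); // C == Mant * 2^Exp with Mant in [0.5, 1)
  if (Mant != 0.5)                   // not an exact power of two
    return false;
  int FB = Exp - 1;                  // C == 2^(Exp - 1)
  if (FB < 1 || unsigned(FB) > RegWidth)
    return false;
  FBits = unsigned(FB);
  return true;
}

int main() {
  unsigned FBits;
  assert(getFixedPointBits(65536.0, 32, FBits) && FBits == 16);
  assert(!getFixedPointBits(3.0, 32, FBits)); // not a power of two
  assert(!getFixedPointBits(1.0, 32, FBits)); // fbits == 0 is rejected
  return 0;
}
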
if (Node->isMachineOpcode()) { - DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); return nullptr; } - switch (Node->getOpcode()) { - case ISD::ATOMIC_LOAD_ADD: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_ADD_I8, - AArch64::ATOMIC_LOAD_ADD_I16, - AArch64::ATOMIC_LOAD_ADD_I32, - AArch64::ATOMIC_LOAD_ADD_I64); - case ISD::ATOMIC_LOAD_SUB: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_SUB_I8, - AArch64::ATOMIC_LOAD_SUB_I16, - AArch64::ATOMIC_LOAD_SUB_I32, - AArch64::ATOMIC_LOAD_SUB_I64); - case ISD::ATOMIC_LOAD_AND: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_AND_I8, - AArch64::ATOMIC_LOAD_AND_I16, - AArch64::ATOMIC_LOAD_AND_I32, - AArch64::ATOMIC_LOAD_AND_I64); - case ISD::ATOMIC_LOAD_OR: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_OR_I8, - AArch64::ATOMIC_LOAD_OR_I16, - AArch64::ATOMIC_LOAD_OR_I32, - AArch64::ATOMIC_LOAD_OR_I64); - case ISD::ATOMIC_LOAD_XOR: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_XOR_I8, - AArch64::ATOMIC_LOAD_XOR_I16, - AArch64::ATOMIC_LOAD_XOR_I32, - AArch64::ATOMIC_LOAD_XOR_I64); - case ISD::ATOMIC_LOAD_NAND: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_NAND_I8, - AArch64::ATOMIC_LOAD_NAND_I16, - AArch64::ATOMIC_LOAD_NAND_I32, - AArch64::ATOMIC_LOAD_NAND_I64); - case ISD::ATOMIC_LOAD_MIN: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_MIN_I8, - AArch64::ATOMIC_LOAD_MIN_I16, - AArch64::ATOMIC_LOAD_MIN_I32, - AArch64::ATOMIC_LOAD_MIN_I64); - case ISD::ATOMIC_LOAD_MAX: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_MAX_I8, - AArch64::ATOMIC_LOAD_MAX_I16, - AArch64::ATOMIC_LOAD_MAX_I32, - AArch64::ATOMIC_LOAD_MAX_I64); - case ISD::ATOMIC_LOAD_UMIN: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_UMIN_I8, - AArch64::ATOMIC_LOAD_UMIN_I16, - AArch64::ATOMIC_LOAD_UMIN_I32, - AArch64::ATOMIC_LOAD_UMIN_I64); - case ISD::ATOMIC_LOAD_UMAX: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_UMAX_I8, - AArch64::ATOMIC_LOAD_UMAX_I16, - AArch64::ATOMIC_LOAD_UMAX_I32, - AArch64::ATOMIC_LOAD_UMAX_I64); - case ISD::ATOMIC_SWAP: - return SelectAtomic(Node, - AArch64::ATOMIC_SWAP_I8, - AArch64::ATOMIC_SWAP_I16, - AArch64::ATOMIC_SWAP_I32, - AArch64::ATOMIC_SWAP_I64); - case ISD::ATOMIC_CMP_SWAP: - return SelectAtomic(Node, - AArch64::ATOMIC_CMP_SWAP_I8, - AArch64::ATOMIC_CMP_SWAP_I16, - AArch64::ATOMIC_CMP_SWAP_I32, - AArch64::ATOMIC_CMP_SWAP_I64); - case ISD::FrameIndex: { - int FI = cast(Node)->getIndex(); - EVT PtrTy = getTargetLowering()->getPointerTy(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy); - return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy, - TFI, CurDAG->getTargetConstant(0, PtrTy)); - } - case ISD::Constant: { - SDNode *ResNode = nullptr; - if (cast(Node)->getZExtValue() == 0) { - // XZR and WZR are probably even better than an actual move: most of the - // time they can be folded into another instruction with *no* cost. - - EVT Ty = Node->getValueType(0); - assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type"); - uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR; - ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(Node), - Register, Ty).getNode(); - } - - // Next best option is a move-immediate, see if we can do that. - if (!ResNode) { - ResNode = TrySelectToMoveImm(Node); - } + // Few custom selection stuff. 
+ SDNode *ResNode = nullptr; + EVT VT = Node->getValueType(0); - if (ResNode) - return ResNode; + switch (Node->getOpcode()) { + default: + break; - // If even that fails we fall back to a lit-pool entry at the moment. Future - // tuning may change this to a sequence of MOVZ/MOVN/MOVK instructions. - ResNode = SelectToLitPool(Node); - assert(ResNode && "We need *some* way to materialise a constant"); + case ISD::ADD: + if (SDNode *I = SelectMLAV64LaneV128(Node)) + return I; + break; - // We want to continue selection at this point since the litpool access - // generated used generic nodes for simplicity. - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); - Node = ResNode; + case ISD::LOAD: { + // Try to select as an indexed load. Fall through to normal processing + // if we can't. + bool Done = false; + SDNode *I = SelectIndexedLoad(Node, Done); + if (Done) + return I; break; } - case ISD::ConstantFP: { - if (A64Imms::isFPImm(cast(Node)->getValueAPF())) { - // FMOV will take care of it from TableGen - break; - } - SDNode *ResNode = LowerToFPLitPool(Node); - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + case ISD::SRL: + case ISD::AND: + case ISD::SRA: + if (SDNode *I = SelectBitfieldExtractOp(Node)) + return I; + break; - // We want to continue selection at this point since the litpool access - // generated used generic nodes for simplicity. - Node = ResNode; + case ISD::OR: + if (SDNode *I = SelectBitfieldInsertOp(Node)) + return I; break; + + case ISD::EXTRACT_VECTOR_ELT: { + // Extracting lane zero is a special case where we can just use a plain + // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for + // the rest of the compiler, especially the register allocator and copyi + // propagation, to reason about, so is preferred when it's possible to + // use it. + ConstantSDNode *LaneNode = cast(Node->getOperand(1)); + // Bail and use the default Select() for non-zero lanes. + if (LaneNode->getZExtValue() != 0) + break; + // If the element type is not the same as the result type, likewise + // bail and use the default Select(), as there's more to do than just + // a cross-class COPY. This catches extracts of i8 and i16 elements + // since they will need an explicit zext. 
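
The lane-zero extract handled next picks a sub-register purely from the element size; a minimal sketch of that mapping (the enum values stand in for AArch64::dsub/ssub, and 8/16-bit elements are left to default selection because they need an explicit zext):

#include <cassert>

// Stand-ins for AArch64::dsub / AArch64::ssub.
enum SubRegIdx { NoSubReg, DSub, SSub };

// Lane-zero extracts become a plain EXTRACT_SUBREG; the sub-register index
// depends only on the element size. 8/16-bit elements need a zext and are
// left to the default selector.
static SubRegIdx laneZeroSubReg(unsigned EltBits) {
  switch (EltBits) {
  case 64: return DSub;
  case 32: return SSub;
  default: return NoSubReg;
  }
}

int main() {
  assert(laneZeroSubReg(64) == DSub);
  assert(laneZeroSubReg(32) == SSub);
  assert(laneZeroSubReg(16) == NoSubReg);
  return 0;
}
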
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType()) + break; + unsigned SubReg; + switch (Node->getOperand(0) + .getValueType() + .getVectorElementType() + .getSizeInBits()) { + default: + llvm_unreachable("Unexpected vector element type!"); + case 64: + SubReg = AArch64::dsub; + break; + case 32: + SubReg = AArch64::ssub; + break; + case 16: // FALLTHROUGH + case 8: + llvm_unreachable("unexpected zext-requiring extract element!"); + } + SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, + Node->getOperand(0)); + DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); + DEBUG(Extract->dumpr(CurDAG)); + DEBUG(dbgs() << "\n"); + return Extract.getNode(); } - case AArch64ISD::NEON_LD1_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1WB_8B_fixed, AArch64::LD1WB_4H_fixed, - AArch64::LD1WB_2S_fixed, AArch64::LD1WB_1D_fixed, - AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed, - AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed - }; - return SelectVLD(Node, true, 1, Opcodes); - } - case AArch64ISD::NEON_LD2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD2WB_8B_fixed, AArch64::LD2WB_4H_fixed, - AArch64::LD2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, - AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed, - AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed - }; - return SelectVLD(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD3WB_8B_fixed, AArch64::LD3WB_4H_fixed, - AArch64::LD3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, - AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed, - AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed - }; - return SelectVLD(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD4WB_8B_fixed, AArch64::LD4WB_4H_fixed, - AArch64::LD4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, - AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed, - AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed - }; - return SelectVLD(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_LD1x2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1x2WB_8B_fixed, AArch64::LD1x2WB_4H_fixed, - AArch64::LD1x2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, - AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed, - AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed - }; - return SelectVLD(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD1x3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1x3WB_8B_fixed, AArch64::LD1x3WB_4H_fixed, - AArch64::LD1x3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, - AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed, - AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed - }; - return SelectVLD(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD1x4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1x4WB_8B_fixed, AArch64::LD1x4WB_4H_fixed, - AArch64::LD1x4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, - AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed, - AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed - }; - return SelectVLD(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_ST1_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1WB_8B_fixed, AArch64::ST1WB_4H_fixed, - AArch64::ST1WB_2S_fixed, AArch64::ST1WB_1D_fixed, - AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed, - AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed - }; - return SelectVST(Node, true, 1, Opcodes); - } - case AArch64ISD::NEON_ST2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST2WB_8B_fixed, AArch64::ST2WB_4H_fixed, 
- AArch64::ST2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, - AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed, - AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed - }; - return SelectVST(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_ST3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST3WB_8B_fixed, AArch64::ST3WB_4H_fixed, - AArch64::ST3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, - AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed, - AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed - }; - return SelectVST(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_ST4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST4WB_8B_fixed, AArch64::ST4WB_4H_fixed, - AArch64::ST4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, - AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed, - AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed - }; - return SelectVST(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_LD2DUP: { - static const uint16_t Opcodes[] = { - AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S, - AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H, - AArch64::LD2R_4S, AArch64::LD2R_2D - }; - return SelectVLDDup(Node, false, 2, Opcodes); - } - case AArch64ISD::NEON_LD3DUP: { - static const uint16_t Opcodes[] = { - AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S, - AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H, - AArch64::LD3R_4S, AArch64::LD3R_2D - }; - return SelectVLDDup(Node, false, 3, Opcodes); - } - case AArch64ISD::NEON_LD4DUP: { - static const uint16_t Opcodes[] = { - AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S, - AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H, - AArch64::LD4R_4S, AArch64::LD4R_2D - }; - return SelectVLDDup(Node, false, 4, Opcodes); - } - case AArch64ISD::NEON_LD2DUP_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed, - AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed, - AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed, - AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed - }; - return SelectVLDDup(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD3DUP_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed, - AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed, - AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed, - AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed - }; - return SelectVLDDup(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD4DUP_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed, - AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed, - AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed, - AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed - }; - return SelectVLDDup(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_LD2LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed, - AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, true, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD3LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed, - AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, true, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD4LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed, - AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, true, true, 4, Opcodes); - } - case 
AArch64ISD::NEON_ST2LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed, - AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, false, true, 2, Opcodes); - } - case AArch64ISD::NEON_ST3LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed, - AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, false, true, 3, Opcodes); - } - case AArch64ISD::NEON_ST4LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed, - AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, false, true, 4, Opcodes); - } - case AArch64ISD::NEON_ST1x2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1x2WB_8B_fixed, AArch64::ST1x2WB_4H_fixed, - AArch64::ST1x2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, - AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed, - AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed - }; - return SelectVST(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_ST1x3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1x3WB_8B_fixed, AArch64::ST1x3WB_4H_fixed, - AArch64::ST1x3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, - AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed, - AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed - }; - return SelectVST(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_ST1x4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1x4WB_8B_fixed, AArch64::ST1x4WB_4H_fixed, - AArch64::ST1x4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, - AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed, - AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed - }; - return SelectVST(Node, true, 4, Opcodes); - } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); - bool IsExt = false; - switch (IntNo) { - default: - break; - case Intrinsic::aarch64_neon_vtbx1: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl1: - return SelectVTBL(Node, 1, IsExt); - case Intrinsic::aarch64_neon_vtbx2: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl2: - return SelectVTBL(Node, 2, IsExt); - case Intrinsic::aarch64_neon_vtbx3: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl3: - return SelectVTBL(Node, 3, IsExt); - case Intrinsic::aarch64_neon_vtbx4: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl4: - return SelectVTBL(Node, 4, IsExt); + case ISD::Constant: { + // Materialize zero constants as copies from WZR/XZR. This allows + // the coalescer to propagate these into other instructions. + ConstantSDNode *ConstNode = cast(Node); + if (ConstNode->isNullValue()) { + if (VT == MVT::i32) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::WZR, MVT::i32).getNode(); + else if (VT == MVT::i64) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::XZR, MVT::i64).getNode(); } break; } - case ISD::INTRINSIC_VOID: + + case ISD::FrameIndex: { + // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. 
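
The ISD::Constant case above deliberately selects zero as a copy from the zero register of the matching width rather than a move-immediate, so the coalescer can fold it into users for free. A trivial sketch of the width-to-register mapping (register names are plain strings here, standing in for AArch64::WZR/XZR):

#include <cassert>
#include <string>

// A zero constant is selected as a copy from the zero register of the
// matching width; non-zero constants are left to the normal patterns.
static std::string zeroRegisterFor(unsigned BitWidth) {
  if (BitWidth == 32)
    return "WZR";
  if (BitWidth == 64)
    return "XZR";
  return ""; // other widths are not materialised this way
}

int main() {
  assert(zeroRegisterFor(32) == "WZR");
  assert(zeroRegisterFor(64) == "XZR");
  assert(zeroRegisterFor(16).empty());
  return 0;
}
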
+ int FI = cast(Node)->getIndex(); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + const TargetLowering *TLI = getTargetLowering(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + CurDAG->getTargetConstant(Shifter, MVT::i32) }; + return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); switch (IntNo) { default: break; - case Intrinsic::arm_neon_vld1: { - static const uint16_t Opcodes[] = { - AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D, - AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D - }; - return SelectVLD(Node, false, 1, Opcodes); - } - case Intrinsic::arm_neon_vld2: { - static const uint16_t Opcodes[] = { - AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D, - AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D - }; - return SelectVLD(Node, false, 2, Opcodes); - } - case Intrinsic::arm_neon_vld3: { - static const uint16_t Opcodes[] = { - AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D, - AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D - }; - return SelectVLD(Node, false, 3, Opcodes); - } - case Intrinsic::arm_neon_vld4: { - static const uint16_t Opcodes[] = { - AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D, - AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D - }; - return SelectVLD(Node, false, 4, Opcodes); - } - case Intrinsic::aarch64_neon_vld1x2: { - static const uint16_t Opcodes[] = { - AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S, - AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H, - AArch64::LD1x2_4S, AArch64::LD1x2_2D - }; - return SelectVLD(Node, false, 2, Opcodes); - } - case Intrinsic::aarch64_neon_vld1x3: { - static const uint16_t Opcodes[] = { - AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S, - AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H, - AArch64::LD1x3_4S, AArch64::LD1x3_2D - }; - return SelectVLD(Node, false, 3, Opcodes); - } - case Intrinsic::aarch64_neon_vld1x4: { - static const uint16_t Opcodes[] = { - AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S, - AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H, - AArch64::LD1x4_4S, AArch64::LD1x4_2D - }; - return SelectVLD(Node, false, 4, Opcodes); + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX; + SDValue MemAddr = Node->getOperand(2); + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + + SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, + MVT::Other, MemAddr, Chain); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(Ld)->setMemRefs(MemOp, MemOp + 1); + return Ld; } - case Intrinsic::arm_neon_vst1: { - static const uint16_t Opcodes[] = { - AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D, - AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D - }; - return SelectVST(Node, false, 1, Opcodes); + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_stlxp ? 
AArch64::STLXPX : AArch64::STXPX; + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + SDValue ValLo = Node->getOperand(2); + SDValue ValHi = Node->getOperand(3); + SDValue MemAddr = Node->getOperand(4); + + // Place arguments in the right order. + SmallVector Ops; + Ops.push_back(ValLo); + Ops.push_back(ValHi); + Ops.push_back(MemAddr); + Ops.push_back(Chain); + + SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; } - case Intrinsic::arm_neon_vst2: { - static const uint16_t Opcodes[] = { - AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D, - AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D - }; - return SelectVST(Node, false, 2, Opcodes); + case Intrinsic::aarch64_neon_ld1x2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv2d, 
AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + break; 
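
The ld2/ld3/ld4 and ldNr cases above (and the lane and store cases that follow) all dispatch the same way: the element size plus 64- versus 128-bit vector width picks one of eight opcode variants and a dsub0/qsub0 sub-register. A table-driven sketch of that shape for ld2, with strings standing in for the AArch64::LD2Twov* opcodes (note the patch routes the v1d forms to the LD1 opcodes instead, which this sketch does not model):

#include <cassert>
#include <string>

struct LoadSel {
  std::string Opcode; // stands in for an AArch64::LD2Twov* opcode
  std::string SubReg; // dsub0 for 64-bit vectors, qsub0 for 128-bit ones
};

// One-of-eight dispatch on element size and vector width, mirroring the
// if/else chains above for the ld2 intrinsic.
static LoadSel selectLd2Variant(unsigned EltBits, bool Is128Bit) {
  static const char *Suffix64[]  = {"8b", "4h", "2s", "1d"};
  static const char *Suffix128[] = {"16b", "8h", "4s", "2d"};
  unsigned Idx = EltBits == 8 ? 0 : EltBits == 16 ? 1 : EltBits == 32 ? 2 : 3;
  const char *Suffix = Is128Bit ? Suffix128[Idx] : Suffix64[Idx];
  return {std::string("LD2Twov") + Suffix, Is128Bit ? "qsub0" : "dsub0"};
}

int main() {
  assert(selectLd2Variant(16, /*Is128Bit=*/true).Opcode == "LD2Twov8h");
  assert(selectLd2Variant(32, /*Is128Bit=*/false).SubReg == "dsub0");
  return 0;
}
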
+ case Intrinsic::aarch64_neon_ld3r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 2, AArch64::LD2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 2, AArch64::LD2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 2, AArch64::LD2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 2, AArch64::LD2i64); + break; + case Intrinsic::aarch64_neon_ld3lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 3, AArch64::LD3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 3, AArch64::LD3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 3, AArch64::LD3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 3, AArch64::LD3i64); + break; + case Intrinsic::aarch64_neon_ld4lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 4, AArch64::LD4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 4, AArch64::LD4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 4, AArch64::LD4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 4, AArch64::LD4i64); + break; } - case Intrinsic::arm_neon_vst3: { - static const uint16_t Opcodes[] = { - AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D, - AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D - }; - return SelectVST(Node, 
false, 3, Opcodes); + } break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_tbl2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two + : AArch64::TBLv16i8Two, + false); + case Intrinsic::aarch64_neon_tbl3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + case Intrinsic::aarch64_neon_tbl4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + case Intrinsic::aarch64_neon_tbx2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two + : AArch64::TBXv16i8Two, + true); + case Intrinsic::aarch64_neon_tbx3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + case Intrinsic::aarch64_neon_tbx4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) + return N; + break; } - case Intrinsic::arm_neon_vst4: { - static const uint16_t Opcodes[] = { - AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D, - AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D - }; - return SelectVST(Node, false, 4, Opcodes); + break; + } + case ISD::INTRINSIC_VOID: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + if (Node->getNumOperands() >= 3) + VT = Node->getOperand(2)->getValueType(0); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_st1x2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST1Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST1Twov16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 2, AArch64::ST1Twov4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 2, AArch64::ST1Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST1Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST1Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST1Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; } - case Intrinsic::aarch64_neon_vst1x2: { - static const uint16_t Opcodes[] = { - AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S, - AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H, - AArch64::ST1x2_4S, AArch64::ST1x2_2D - }; - return SelectVST(Node, false, 2, Opcodes); + case Intrinsic::aarch64_neon_st1x3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST1Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST1Threev16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 3, AArch64::ST1Threev4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 3, AArch64::ST1Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST1Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST1Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST1Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; } - case Intrinsic::aarch64_neon_vst1x3: { - static const
uint16_t Opcodes[] = { - AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S, - AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H, - AArch64::ST1x3_4S, AArch64::ST1x3_2D - }; - return SelectVST(Node, false, 3, Opcodes); + case Intrinsic::aarch64_neon_st1x4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST1Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST1Fourv16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 4, AArch64::ST1Fourv4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 4, AArch64::ST1Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST1Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST1Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST1Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; } - case Intrinsic::aarch64_neon_vst1x4: { - static const uint16_t Opcodes[] = { - AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S, - AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H, - AArch64::ST1x4_4S, AArch64::ST1x4_2D - }; - return SelectVST(Node, false, 4, Opcodes); + case Intrinsic::aarch64_neon_st2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST2Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST2Twov16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 2, AArch64::ST2Twov4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 2, AArch64::ST2Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST2Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST2Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST2Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; } - case Intrinsic::arm_neon_vld2lane: { - static const uint16_t Opcodes[] = { - AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D - }; - return SelectVLDSTLane(Node, true, false, 2, Opcodes); + case Intrinsic::aarch64_neon_st3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST3Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST3Threev16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 3, AArch64::ST3Threev4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 3, AArch64::ST3Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST3Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST3Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST3Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; } - case Intrinsic::arm_neon_vld3lane: { - static const uint16_t Opcodes[] = { - AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D - }; - return SelectVLDSTLane(Node, true, false, 3, Opcodes); + case Intrinsic::aarch64_neon_st4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST4Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST4Fourv16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 4, AArch64::ST4Fourv4h); + else if (VT == 
MVT::v8i16) + return SelectStore(Node, 4, AArch64::ST4Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST4Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST4Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST4Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; } - case Intrinsic::arm_neon_vld4lane: { - static const uint16_t Opcodes[] = { - AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D - }; - return SelectVLDSTLane(Node, true, false, 4, Opcodes); + case Intrinsic::aarch64_neon_st2lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 2, AArch64::ST2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 2, AArch64::ST2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 2, AArch64::ST2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 2, AArch64::ST2i64); + break; } - case Intrinsic::arm_neon_vst2lane: { - static const uint16_t Opcodes[] = { - AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D - }; - return SelectVLDSTLane(Node, false, false, 2, Opcodes); + case Intrinsic::aarch64_neon_st3lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 3, AArch64::ST3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 3, AArch64::ST3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 3, AArch64::ST3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 3, AArch64::ST3i64); + break; } - case Intrinsic::arm_neon_vst3lane: { - static const uint16_t Opcodes[] = { - AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D - }; - return SelectVLDSTLane(Node, false, false, 3, Opcodes); + case Intrinsic::aarch64_neon_st4lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 4, AArch64::ST4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 4, AArch64::ST4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 4, AArch64::ST4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 4, AArch64::ST4i64); + break; } - case Intrinsic::arm_neon_vst4lane: { - static const uint16_t Opcodes[] = { - AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D - }; - return SelectVLDSTLane(Node, false, false, 4, Opcodes); } - } // End of switch IntNo + } + case AArch64ISD::LD2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + else if (VT == 
MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, 
AArch64::LD1Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD2DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3DUPpost: { + if (VT == 
MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + break; + } + case AArch64ISD::LD2LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + break; + } + case AArch64ISD::LD3LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + break; + } + case AArch64ISD::LD4LANEpost: { + if (VT == 
MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + break; + } + case AArch64ISD::ST2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + break; + } + case AArch64ISD::ST3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + break; + } + case AArch64ISD::ST4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + break; + } + case AArch64ISD::ST1x2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 
2, AArch64::ST1Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + break; + } + case AArch64ISD::ST1x3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + break; + } + case AArch64ISD::ST1x4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + break; + } + case AArch64ISD::ST2LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + break; + } + case AArch64ISD::ST3LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + break; + } + case AArch64ISD::ST4LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == 
MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); break; - } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN - default: - break; // Let generic code handle it } - SDNode *ResNode = SelectCode(Node); + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FROUND: + if (SDNode *I = SelectLIBM(Node)) + return I; + break; + } - DEBUG(dbgs() << "=> "; - if (ResNode == nullptr || ResNode == Node) - Node->dump(CurDAG); - else - ResNode->dump(CurDAG); - dbgs() << "\n"); + // Select the default instruction + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); return ResNode; } -/// This pass converts a legalized DAG into a AArch64-specific DAG, ready for -/// instruction scheduling. -FunctionPass *llvm::createAArch64ISelDAG(AArch64TargetMachine &TM, +/// createAArch64ISelDag - This pass converts a legalized DAG into a +/// AArch64-specific DAG, ready for instruction scheduling. +FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new AArch64DAGToDAGISel(TM, OptLevel); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index d02a03ccb2a7..e45ca4dbc0d9 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1,4 +1,4 @@ -//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===// +//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// // // The LLVM Compiler Infrastructure // @@ -7,48 +7,87 @@ // //===----------------------------------------------------------------------===// // -// This file defines the interfaces that AArch64 uses to lower LLVM code into a -// selection DAG. +// This file implements the AArch64TargetLowering class. 
// //===----------------------------------------------------------------------===// -#include "AArch64.h" #include "AArch64ISelLowering.h" -#include "AArch64MachineFunctionInfo.h" +#include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" -#include "Utils/AArch64BaseInfo.h" -#include "llvm/CodeGen/Analysis.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/Support/MathExtras.h" - +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; -#define DEBUG_TYPE "aarch64-isel" +#define DEBUG_TYPE "aarch64-lower" -static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) { - assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() && - "unknown subtarget type"); - return new AArch64ElfTargetObjectFile(); -} +STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumShiftInserts, "Number of vector shift inserts"); + +enum AlignMode { + StrictAlign, + NoStrictAlign +}; + +static cl::opt<AlignMode> +Align(cl::desc("Load/store alignment support"), + cl::Hidden, cl::init(NoStrictAlign), + cl::values( + clEnumValN(StrictAlign, "aarch64-strict-align", + "Disallow all unaligned memory accesses"), + clEnumValN(NoStrictAlign, "aarch64-no-strict-align", + "Allow unaligned memory accesses"), + clEnumValEnd)); + +// Place holder until extr generation is tested fully. +static cl::opt<bool> +EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, + cl::desc("Allow AArch64 (or (shift)(shift))->extract"), + cl::init(true)); + +static cl::opt<bool> +EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, + cl::desc("Allow AArch64 SLI/SRI formation"), + cl::init(false)); + +//===----------------------------------------------------------------------===// +// AArch64 Lowering public interface. +//===----------------------------------------------------------------------===// +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) + return new AArch64_MachoTargetObjectFile(); -AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) { + return new AArch64_ELFTargetObjectFile(); +} - const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>(); +AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { + Subtarget = &TM.getSubtarget<AArch64Subtarget>(); - // SIMD compares set the entire lane's bits to 1 + // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so + // we have to make something up. Arbitrarily, choose ZeroOrOne. + setBooleanContents(ZeroOrOneBooleanContent); + // When comparing vectors the result sets the different elements in the + // vector to all-one or all-zero.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - // Scalar register <-> type mapping - addRegisterClass(MVT::i32, &AArch64::GPR32RegClass); - addRegisterClass(MVT::i64, &AArch64::GPR64RegClass); + // Set up the register classes. + addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); + addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); @@ -58,201 +97,86 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) } if (Subtarget->hasNEON()) { - // And the vectors - addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass); - addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass); - addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass); - addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass); - addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass); - addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass); - addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass); - addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass); - addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass); - addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass); - } - + addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); + // Someone set us up the NEON. + addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + addDRTypeForNEON(MVT::v1f64); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + } + + // Compute derived properties from the register classes computeRegisterProperties(); - // We combine OR nodes for bitfield and NEON BSL operations. - setTargetDAGCombine(ISD::OR); - - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::SHL); - - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - - // AArch64 does not have i1 loads, or much of anything for i1 really. - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); - - setStackPointerRegisterToSaveRestore(AArch64::XSP); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - - // We'll lower globals to wrappers for selection. + // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); - - // A64 instructions have the comparison predicate attached to the user of the - // result, but having a separate comparison is valuable for matching. 
+ setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); - setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); - - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - setOperationAction(ISD::SETCC, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::f64, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i64, Custom); - setOperationAction(ISD::VASTART, MVT::Other, Custom); - setOperationAction(ISD::VACOPY, MVT::Other, Custom); - setOperationAction(ISD::VAEND, MVT::Other, Expand); - setOperationAction(ISD::VAARG, MVT::Other, Expand); - - setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); - - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i64, Expand); - - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - - // Legal floating-point operations. 
- setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f64, Legal); - - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - - setOperationAction(ISD::FNEG, MVT::f32, Legal); - setOperationAction(ISD::FNEG, MVT::f64, Legal); - - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - - setOperationAction(ISD::FSQRT, MVT::f32, Legal); - setOperationAction(ISD::FSQRT, MVT::f64, Legal); - - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f128, Legal); - - // Illegal floating-point operations. - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - - setOperationAction(ISD::FCOS, MVT::f32, Expand); - setOperationAction(ISD::FCOS, MVT::f64, Expand); - - setOperationAction(ISD::FEXP, MVT::f32, Expand); - setOperationAction(ISD::FEXP, MVT::f64, Expand); - - setOperationAction(ISD::FEXP2, MVT::f32, Expand); - setOperationAction(ISD::FEXP2, MVT::f64, Expand); - - setOperationAction(ISD::FLOG, MVT::f32, Expand); - setOperationAction(ISD::FLOG, MVT::f64, Expand); - - setOperationAction(ISD::FLOG2, MVT::f32, Expand); - setOperationAction(ISD::FLOG2, MVT::f64, Expand); - - setOperationAction(ISD::FLOG10, MVT::f32, Expand); - setOperationAction(ISD::FLOG10, MVT::f64, Expand); - - setOperationAction(ISD::FPOW, MVT::f32, Expand); - setOperationAction(ISD::FPOW, MVT::f64, Expand); - - setOperationAction(ISD::FPOWI, MVT::f32, Expand); - setOperationAction(ISD::FPOWI, MVT::f64, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); setOperationAction(ISD::FREM, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f80, Expand); - setOperationAction(ISD::FSIN, MVT::f32, Expand); - setOperationAction(ISD::FSIN, MVT::f64, Expand); - - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + // Custom lowering hooks are needed for XOR + // to fold it into CSINC/CSINV. + setOperationAction(ISD::XOR, MVT::i32, Custom); + setOperationAction(ISD::XOR, MVT::i64, Custom); // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. 
- setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); - setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, Custom); - setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, Custom); - setOperationAction(ISD::FNEG, MVT::f128, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand); - setOperationAction(ISD::FP_ROUND, MVT::f128, Expand); - setOperationAction(ISD::FPOW, MVT::f128, Expand); - setOperationAction(ISD::FREM, MVT::f128, Expand); - setOperationAction(ISD::FRINT, MVT::f128, Expand); - setOperationAction(ISD::FSIN, MVT::f128, Expand); - setOperationAction(ISD::FSINCOS, MVT::f128, Expand); - setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Custom); - setOperationAction(ISD::FTRUNC, MVT::f128, Expand); - setOperationAction(ISD::SETCC, MVT::f128, Custom); - setOperationAction(ISD::BR_CC, MVT::f128, Custom); - setOperationAction(ISD::SELECT, MVT::f128, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::FABS, MVT::f128, Expand); + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Expand); + setOperationAction(ISD::FPOW, MVT::f128, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); + setOperationAction(ISD::FRINT, MVT::f128, Expand); + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FTRUNC, MVT::f128, Expand); + setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::BR_CC, MVT::f128, Custom); + setOperationAction(ISD::SELECT, MVT::f128, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. @@ -268,631 +192,583 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - - // i128 shift operation support - setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - // This prevents LLVM trying to compress double constants into a floating - // constant-pool entry and trying to load from there. It's of doubtful benefit - // for A64: we'd need LDR followed by FCVT, I believe. 
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + // Variable arguments. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); - setTruncStoreAction(MVT::f128, MVT::f64, Expand); - setTruncStoreAction(MVT::f128, MVT::f32, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); + // Variable-sized objects. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + // Exception handling. + // FIXME: These are guesses. Has this been defined yet? setExceptionPointerRegister(AArch64::X0); setExceptionSelectorRegister(AArch64::X1); - if (Subtarget->hasNEON()) { - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand); - - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); - - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); - setOperationAction(ISD::CONCAT_VECTORS, 
MVT::v4i32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - - setOperationAction(ISD::SETCC, MVT::v8i8, Custom); - setOperationAction(ISD::SETCC, MVT::v16i8, Custom); - setOperationAction(ISD::SETCC, MVT::v4i16, Custom); - setOperationAction(ISD::SETCC, MVT::v8i16, Custom); - setOperationAction(ISD::SETCC, MVT::v2i32, Custom); - setOperationAction(ISD::SETCC, MVT::v4i32, Custom); - setOperationAction(ISD::SETCC, MVT::v1i64, Custom); - setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - setOperationAction(ISD::SETCC, MVT::v2f32, Custom); - setOperationAction(ISD::SETCC, MVT::v4f32, Custom); - setOperationAction(ISD::SETCC, MVT::v1f64, Custom); - setOperationAction(ISD::SETCC, MVT::v2f64, Custom); - - setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal); - setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); - - setOperationAction(ISD::FCEIL, MVT::v2f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v1f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); - - setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); - - setOperationAction(ISD::FRINT, MVT::v2f32, Legal); - setOperationAction(ISD::FRINT, MVT::v4f32, Legal); - setOperationAction(ISD::FRINT, MVT::v1f64, Legal); - setOperationAction(ISD::FRINT, MVT::v2f64, Legal); - - setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); - - setOperationAction(ISD::FROUND, MVT::v2f32, Legal); - setOperationAction(ISD::FROUND, MVT::v4f32, Legal); - setOperationAction(ISD::FROUND, MVT::v1f64, Legal); - setOperationAction(ISD::FROUND, MVT::v2f64, Legal); - - setOperationAction(ISD::SINT_TO_FP, MVT::v1i8, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - - setOperationAction(ISD::UINT_TO_FP, MVT::v1i8, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v1i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v1i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Custom); - 
- setOperationAction(ISD::FP_TO_UINT, MVT::v1i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Custom); - - // Neon does not support vector divide/remainder operations except - // floating-point divide. - setOperationAction(ISD::SDIV, MVT::v1i8, Expand); - setOperationAction(ISD::SDIV, MVT::v8i8, Expand); - setOperationAction(ISD::SDIV, MVT::v16i8, Expand); - setOperationAction(ISD::SDIV, MVT::v1i16, Expand); - setOperationAction(ISD::SDIV, MVT::v4i16, Expand); - setOperationAction(ISD::SDIV, MVT::v8i16, Expand); - setOperationAction(ISD::SDIV, MVT::v1i32, Expand); - setOperationAction(ISD::SDIV, MVT::v2i32, Expand); - setOperationAction(ISD::SDIV, MVT::v4i32, Expand); - setOperationAction(ISD::SDIV, MVT::v1i64, Expand); - setOperationAction(ISD::SDIV, MVT::v2i64, Expand); - - setOperationAction(ISD::UDIV, MVT::v1i8, Expand); - setOperationAction(ISD::UDIV, MVT::v8i8, Expand); - setOperationAction(ISD::UDIV, MVT::v16i8, Expand); - setOperationAction(ISD::UDIV, MVT::v1i16, Expand); - setOperationAction(ISD::UDIV, MVT::v4i16, Expand); - setOperationAction(ISD::UDIV, MVT::v8i16, Expand); - setOperationAction(ISD::UDIV, MVT::v1i32, Expand); - setOperationAction(ISD::UDIV, MVT::v2i32, Expand); - setOperationAction(ISD::UDIV, MVT::v4i32, Expand); - setOperationAction(ISD::UDIV, MVT::v1i64, Expand); - setOperationAction(ISD::UDIV, MVT::v2i64, Expand); - - setOperationAction(ISD::SREM, MVT::v1i8, Expand); - setOperationAction(ISD::SREM, MVT::v8i8, Expand); - setOperationAction(ISD::SREM, MVT::v16i8, Expand); - setOperationAction(ISD::SREM, MVT::v1i16, Expand); - setOperationAction(ISD::SREM, MVT::v4i16, Expand); - setOperationAction(ISD::SREM, MVT::v8i16, Expand); - setOperationAction(ISD::SREM, MVT::v1i32, Expand); - setOperationAction(ISD::SREM, MVT::v2i32, Expand); - setOperationAction(ISD::SREM, MVT::v4i32, Expand); - setOperationAction(ISD::SREM, MVT::v1i64, Expand); - setOperationAction(ISD::SREM, MVT::v2i64, Expand); - - setOperationAction(ISD::UREM, MVT::v1i8, Expand); - setOperationAction(ISD::UREM, MVT::v8i8, Expand); - setOperationAction(ISD::UREM, MVT::v16i8, Expand); - setOperationAction(ISD::UREM, MVT::v1i16, Expand); - setOperationAction(ISD::UREM, MVT::v4i16, Expand); - setOperationAction(ISD::UREM, MVT::v8i16, Expand); - setOperationAction(ISD::UREM, MVT::v1i32, Expand); - setOperationAction(ISD::UREM, MVT::v2i32, Expand); - setOperationAction(ISD::UREM, MVT::v4i32, Expand); - setOperationAction(ISD::UREM, MVT::v1i64, Expand); - setOperationAction(ISD::UREM, MVT::v2i64, Expand); - - setOperationAction(ISD::FREM, MVT::v2f32, Expand); - setOperationAction(ISD::FREM, MVT::v4f32, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v2f64, Expand); - - setOperationAction(ISD::SELECT, MVT::v8i8, Expand); - setOperationAction(ISD::SELECT, MVT::v16i8, Expand); - setOperationAction(ISD::SELECT, MVT::v4i16, Expand); - setOperationAction(ISD::SELECT, MVT::v8i16, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - setOperationAction(ISD::SELECT, MVT::v1i64, Expand); - setOperationAction(ISD::SELECT, MVT::v2i64, Expand); - setOperationAction(ISD::SELECT, MVT::v2f32, Expand); - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); - 
setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v2f64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom); - - // Vector ExtLoad and TruncStore are expanded. - for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE; - I <= MVT::LAST_VECTOR_VALUETYPE; ++I) { - MVT VT = (MVT::SimpleValueType) I; - setLoadExtAction(ISD::SEXTLOAD, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, Expand); - for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE; - II <= MVT::LAST_VECTOR_VALUETYPE; ++II) { - MVT VT1 = (MVT::SimpleValueType) II; - // A TruncStore has two vector types of the same number of elements - // and different element sizes. - if (VT.getVectorNumElements() == VT1.getVectorNumElements() && - VT.getVectorElementType().getSizeInBits() - > VT1.getVectorElementType().getSizeInBits()) - setTruncStoreAction(VT, VT1, Expand); - } - - setOperationAction(ISD::MULHS, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::MULHU, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - setOperationAction(ISD::BSWAP, VT, Expand); - } - - // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply. - // FIXME: For a v2i64 multiply, we copy VPR to GPR and do 2 i64 multiplies, - // and then copy back to VPR. This solution may be optimized by Following 3 - // NEON instructions: - // pmull v2.1q, v0.1d, v1.1d - // pmull2 v3.1q, v0.2d, v1.2d - // ins v2.d[1], v3.d[0] - // As currently we can't verify the correctness of such assumption, we can - // do such optimization in the future. - setOperationAction(ISD::MUL, MVT::v1i64, Expand); - setOperationAction(ISD::MUL, MVT::v2i64, Expand); + // Constant pool entries + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); - setOperationAction(ISD::FCOS, MVT::v2f64, Expand); - setOperationAction(ISD::FCOS, MVT::v4f32, Expand); - setOperationAction(ISD::FCOS, MVT::v2f32, Expand); - setOperationAction(ISD::FSIN, MVT::v2f64, Expand); - setOperationAction(ISD::FSIN, MVT::v4f32, Expand); - setOperationAction(ISD::FSIN, MVT::v2f32, Expand); - setOperationAction(ISD::FPOW, MVT::v2f64, Expand); - setOperationAction(ISD::FPOW, MVT::v4f32, Expand); - setOperationAction(ISD::FPOW, MVT::v2f32, Expand); - } + // BlockAddress + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::VSELECT); + // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 
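For illustration only (not part of the patch), a standalone sketch of the semantics the ADDC/ADDE and SUBC/SUBE nodes model: the carry produced by the low half of a wide addition feeds the high half, which on AArch64 becomes ADDS followed by ADC through the NZCV flags. Helper names below are illustrative.

    #include <cstdint>

    // Double-word addition by halves: the carry out of the low word (ADDC)
    // is consumed by the high word (ADDE); AArch64 emits ADDS then ADC.
    static inline uint64_t add64_by_halves(uint32_t alo, uint32_t ahi,
                                           uint32_t blo, uint32_t bhi) {
      uint32_t lo = alo + blo;               // ADDS: sets the carry flag
      uint32_t carry = (lo < alo) ? 1u : 0u; // C flag out of the low half
      uint32_t hi = ahi + bhi + carry;       // ADC: consumes the carry
      return ((uint64_t)hi << 32) | lo;
    }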
+ setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + setOperationAction(ISD::ADDC, MVT::i64, Custom); + setOperationAction(ISD::ADDE, MVT::i64, Custom); + setOperationAction(ISD::SUBC, MVT::i64, Custom); + setOperationAction(ISD::SUBE, MVT::i64, Custom); + + // AArch64 lacks both left-rotate and popcount instructions. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); - MaskAndBranchFoldingIsLegal = true; -} + // AArch64 doesn't have {U|S}MUL_LOHI. + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); -EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - // It's reasonably important that this value matches the "natural" legal - // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself - // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). - if (!VT.isVector()) return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} -static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, - unsigned &LdrOpc, - unsigned &StrOpc) { - static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, - AArch64::LDXR_word, AArch64::LDXR_dword}; - static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, - AArch64::LDAXR_word, AArch64::LDAXR_dword}; - static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, - AArch64::STXR_word, AArch64::STXR_dword}; - static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword, - AArch64::STLXR_word, AArch64::STLXR_dword}; - - const unsigned *LoadOps, *StoreOps; - if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) - LoadOps = LoadAcqs; - else - LoadOps = LoadBares; + // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero + // counterparts, which AArch64 supports directly. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) - StoreOps = StoreRels; - else - StoreOps = StoreBares; + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); - assert(isPowerOf2_32(Size) && Size <= 8 && - "unsupported size for atomic binary op!"); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); - LdrOpc = LoadOps[Log2_32(Size)]; - StrOpc = StoreOps[Log2_32(Size)]; -} + // Custom lower Add/Sub/Mul with overflow. 
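A hedged sketch of the overflow predicates these custom-lowered nodes compute, assuming the NZCV mapping used later in this patch (V for signed overflow, C/HS for unsigned carry); the helpers are illustrative, not taken from the backend.

    #include <cstdint>

    // ISD::SADDO: result plus a flag matching the V bit that ADDS sets.
    static inline bool saddo32(int32_t a, int32_t b, int32_t &res) {
      res = (int32_t)((uint32_t)a + (uint32_t)b);
      // Signed overflow iff both operands have the sign the result lacks.
      return ((a ^ res) & (b ^ res)) < 0;
    }

    // ISD::UADDO: the flag matches the C bit set by ADDS (HS condition).
    static inline bool uaddo32(uint32_t a, uint32_t b, uint32_t &res) {
      res = a + b;
      return res < a;
    }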
+ setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i32, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i32, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); -// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really -// have value type mapped, and they are both being defined as MVT::untyped. -// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost -// would fail to figure out the register pressure correctly. -std::pair -AArch64TargetLowering::findRepresentativeClass(MVT VT) const{ - const TargetRegisterClass *RRC = nullptr; - uint8_t Cost = 1; - switch (VT.SimpleTy) { - default: - return TargetLowering::findRepresentativeClass(VT); - case MVT::v4i64: - RRC = &AArch64::QPairRegClass; - Cost = 2; - break; - case MVT::v8i64: - RRC = &AArch64::QQuadRegClass; - Cost = 4; - break; + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; + for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { + MVT Ty = RoundingTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + if (Subtarget->isTargetMachO()) { + // For iOS, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } else { + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } - return std::make_pair(RRC, Cost); -} -MachineBasicBlock * -AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size, - unsigned BinOpcode) const { - // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + // AArch64 does not have floating-point extending loads, i1 sign-extending + // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 
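To make the preceding comment concrete, a minimal source-level sketch of what the expansion of an FP extending load and an FP truncating store amounts to (a plain narrow access plus an explicit convert); nothing here is taken from the patch itself.

    // f64 value from an f32 location: ordinary f32 load, then a separate
    // convert (roughly LDR s0, [x0] ; FCVT d0, s0).
    static inline double load_f32_as_f64(const float *p) {
      float narrow = *p;     // the plain load
      return (double)narrow; // the explicit extend
    }

    // Truncating store of f64 to f32: convert first, then store narrow.
    static inline void store_f64_as_f32(float *p, double v) {
      *p = (float)v;         // FCVT s0, d0 ; STR s0, [x0]
    }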
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); + // Indexed loads and stores are supported. + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedLoadAction(im, MVT::i64, Legal); + setIndexedLoadAction(im, MVT::f64, Legal); + setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i64, Legal); + setIndexedStoreAction(im, MVT::f64, Legal); + setIndexedStoreAction(im, MVT::f32, Legal); + } + + // Trap. + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // We combine OR nodes for bitfield operations. + setTargetDAGCombine(ISD::OR); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction *MF = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + // Vector add and sub nodes may conceal a high-half opportunity. + // Also, try to fold ADD into CSINC/CSINV.. + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); - unsigned dest = MI->getOperand(0).getReg(); - unsigned ptr = MI->getOperand(1).getReg(); - unsigned incr = MI->getOperand(2).getReg(); - AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); - DebugLoc dl = MI->getDebugLoc(); - - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - - unsigned ldrOpc, strOpc; - getExclusiveOperation(Size, Ord, ldrOpc, strOpc); - - MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, loopMBB); - MF->insert(It, exitMBB); - - // Transfer the remainder of BB and its successor edges to exitMBB. - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - const TargetRegisterClass *TRC - = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); - - // thisMBB: - // ... - // fallthrough --> loopMBB - BB->addSuccessor(loopMBB); - - // loopMBB: - // ldxr dest, ptr - // scratch, dest, incr - // stxr stxr_status, scratch, ptr - // cbnz stxr_status, loopMBB - // fallthrough --> exitMBB - BB = loopMBB; - BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); - if (BinOpcode) { - // All arithmetic operations we'll be creating are designed to take an extra - // shift or extend operand, which we can conveniently set to zero. - - // Operand order needs to go the other way for NAND. 
- if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl) - BuildMI(BB, dl, TII->get(BinOpcode), scratch) - .addReg(incr).addReg(dest).addImm(0); - else - BuildMI(BB, dl, TII->get(BinOpcode), scratch) - .addReg(dest).addReg(incr).addImm(0); - } + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - // From the stxr, the register is GPR32; from the cmp it's GPR32wsp - unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::STORE); - BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr); - BuildMI(BB, dl, TII->get(AArch64::CBNZw)) - .addReg(stxr_status).addMBB(loopMBB); + setTargetDAGCombine(ISD::MUL); - BB->addSuccessor(loopMBB); - BB->addSuccessor(exitMBB); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::VSELECT); - // exitMBB: - // ... - BB = exitMBB; + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - MI->eraseFromParent(); // The instruction is gone now. + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; - return BB; -} + setStackPointerRegisterToSaveRestore(AArch64::SP); -MachineBasicBlock * -AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned Size, - unsigned CmpOp, - A64CC::CondCodes Cond) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + setSchedulingPreference(Sched::Hybrid); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction *MF = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + // Enable TBZ/TBNZ + MaskAndBranchFoldingIsLegal = true; - unsigned dest = MI->getOperand(0).getReg(); - unsigned ptr = MI->getOperand(1).getReg(); - unsigned incr = MI->getOperand(2).getReg(); - AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); + setMinFunctionAlignment(2); - unsigned oldval = dest; - DebugLoc dl = MI->getDebugLoc(); + RequireStrictAlign = (Align == StrictAlign); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const TargetRegisterClass *TRC, *TRCsp; - if (Size == 8) { - TRC = &AArch64::GPR64RegClass; - TRCsp = &AArch64::GPR64xspRegClass; - } else { - TRC = &AArch64::GPR32RegClass; - TRCsp = &AArch64::GPR32wspRegClass; - } + setHasExtractBitsInsn(true); - unsigned ldrOpc, strOpc; - getExclusiveOperation(Size, Ord, ldrOpc, strOpc); + if (Subtarget->hasNEON()) { + // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to + // silliness like this: + setOperationAction(ISD::FABS, MVT::v1f64, Expand); + setOperationAction(ISD::FADD, MVT::v1f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); + setOperationAction(ISD::FCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FDIV, MVT::v1f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); + setOperationAction(ISD::FMA, MVT::v1f64, Expand); + setOperationAction(ISD::FMUL, MVT::v1f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, 
Expand); + setOperationAction(ISD::FNEG, MVT::v1f64, Expand); + setOperationAction(ISD::FPOW, MVT::v1f64, Expand); + setOperationAction(ISD::FREM, MVT::v1f64, Expand); + setOperationAction(ISD::FROUND, MVT::v1f64, Expand); + setOperationAction(ISD::FRINT, MVT::v1f64, Expand); + setOperationAction(ISD::FSIN, MVT::v1f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); + setOperationAction(ISD::FSUB, MVT::v1f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); + setOperationAction(ISD::SETCC, MVT::v1f64, Expand); + setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, loopMBB); - MF->insert(It, exitMBB); + setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - // Transfer the remainder of BB and its successor edges to exitMBB. - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); + setOperationAction(ISD::MUL, MVT::v1i64, Expand); - unsigned scratch = MRI.createVirtualRegister(TRC); - MRI.constrainRegClass(scratch, TRCsp); + // AArch64 doesn't have a direct vector ->f32 conversion instructions for + // elements smaller than i32, so promote the input to i32 first. + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); + // Similarly, there is no direct i32 -> f64 vector conversion instruction. + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); - // thisMBB: - // ... - // fallthrough --> loopMBB - BB->addSuccessor(loopMBB); + // AArch64 doesn't have MUL.2d: + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + // Likewise, narrowing and extending vector loads/stores aren't handled + // directly. 
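A sketch of what "not handled directly" means for a widening vector load, under the assumption that the legalizer splits it into a narrow load plus explicit per-element extends (later matched to UXTL/USHLL); the helper is illustrative only.

    #include <cstdint>
    #include <cstring>

    // v4i8 in memory widened to v4i32: one narrow 32-bit load, then a
    // separate zero-extend of each lane.
    static inline void load_v4i8_zext_v4i32(const uint8_t *src,
                                            uint32_t dst[4]) {
      uint8_t lanes[4];
      std::memcpy(lanes, src, 4); // the narrow load
      for (int i = 0; i < 4; ++i)
        dst[i] = lanes[i];        // the separate extend step
    }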
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Expand); + + setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + + setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction((MVT::SimpleValueType)VT, + (MVT::SimpleValueType)InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + } - // loopMBB: - // ldxr dest, ptr - // cmp incr, dest (, sign extend if necessary) - // csel scratch, dest, incr, cond - // stxr stxr_status, scratch, ptr - // cbnz stxr_status, loopMBB - // fallthrough --> exitMBB - BB = loopMBB; - BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; + for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { + MVT Ty = RoundingVecTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + } +} - // Build compare and cmov instructions. - MRI.constrainRegClass(incr, TRCsp); - BuildMI(BB, dl, TII->get(CmpOp)) - .addReg(incr).addReg(oldval).addImm(0); +void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { + if (VT == MVT::v2f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + } + + // Mark vector float intrinsics as expand. 
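For the intrinsics marked Expand just below, the net effect is scalarization into ordinary libm calls; a minimal, purely illustrative sketch for FSIN on v2f64:

    #include <cmath>

    // ISD::FSIN on v2f64 with Expand: each lane becomes a call to sin().
    static inline void sin_v2f64(const double in[2], double out[2]) {
      out[0] = std::sin(in[0]);
      out[1] = std::sin(in[1]);
    }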
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { + setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); + setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + + // CNT supports only B element sizes. + if (VT != MVT::v8i8 && VT != MVT::v16i8) + setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + + if (Subtarget->isLittleEndian()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT.getSimpleVT(), Legal); + setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + } + } +} - BuildMI(BB, dl, TII->get(Size == 8 ? 
AArch64::CSELxxxc : AArch64::CSELwwwc),
-                 scratch)
-    .addReg(oldval).addReg(incr).addImm(Cond);
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR64RegClass);
+  addTypeForNEON(VT, MVT::v2i32);
+}
-  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR128RegClass);
+  addTypeForNEON(VT, MVT::v4i32);
+}
-  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
-    .addReg(scratch).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
-    .addReg(stxr_status).addMBB(loopMBB);
+EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
+/// computeKnownBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larger doesn't need
+      // this as those are legal types and will be handled by isel directly.
+      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+      unsigned BitWidth = KnownZero.getBitWidth();
+      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+        assert(BitWidth >= 8 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+        KnownZero |= Mask;
+      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+        assert(BitWidth >= 16 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+        KnownZero |= Mask;
+      }
+      break;
+    } break;
+    }
+  }
+  }
+}
-  // exitMBB:
-  //   ...
-  BB = exitMBB;
+MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+  return MVT::i64;
+}
-  MI->eraseFromParent();   // The instruction is gone now.
+unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are 0 to 4095 * Ty.getSizeInBytes().
+  // and the offset has to be a multiple of the related size in bytes.
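The 4095 figure corresponds to the unsigned-immediate LDR/STR forms, whose 12-bit offset field is scaled by the access size; the encoding rationale and helper below are an editorial sketch, not stated in the patch.

    #include <cstdint>

    // Reachable byte offset of the unsigned-immediate form: 4095 scaled by
    // the access size (4095 for LDRB, 8190 for LDRH, 16380 for 32-bit LDR,
    // 32760 for 64-bit LDR).
    static inline uint64_t max_unsigned_imm_offset(uint64_t access_size) {
      return 4095u * access_size;
    }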
+ return 4095; +} - return BB; +FastISel * +AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return AArch64::createFastISel(funcInfo, libInfo); } -MachineBasicBlock * -AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned Size) const { - unsigned dest = MI->getOperand(0).getReg(); - unsigned ptr = MI->getOperand(1).getReg(); - unsigned oldval = MI->getOperand(2).getReg(); - unsigned newval = MI->getOperand(3).getReg(); - AtomicOrdering Ord = static_cast(MI->getOperand(4).getImm()); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc dl = MI->getDebugLoc(); - - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const TargetRegisterClass *TRCsp; - TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass; - - unsigned ldrOpc, strOpc; - getExclusiveOperation(Size, Ord, ldrOpc, strOpc); - - MachineFunction *MF = BB->getParent(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; // insert the new blocks after the current block - - MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, loop1MBB); - MF->insert(It, loop2MBB); - MF->insert(It, exitMBB); - - // Transfer the remainder of BB and its successor edges to exitMBB. - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - // thisMBB: - // ... - // fallthrough --> loop1MBB - BB->addSuccessor(loop1MBB); - - // loop1MBB: - // ldxr dest, [ptr] - // cmp dest, oldval - // b.ne exitMBB - BB = loop1MBB; - BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); - - unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl; - MRI.constrainRegClass(dest, TRCsp); - BuildMI(BB, dl, TII->get(CmpOp)) - .addReg(dest).addReg(oldval).addImm(0); - BuildMI(BB, dl, TII->get(AArch64::Bcc)) - .addImm(A64CC::NE).addMBB(exitMBB); - BB->addSuccessor(loop2MBB); - BB->addSuccessor(exitMBB); - - // loop2MBB: - // strex stxr_status, newval, [ptr] - // cbnz stxr_status, loop1MBB - BB = loop2MBB; - unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); - - BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr); - BuildMI(BB, dl, TII->get(AArch64::CBNZw)) - .addReg(stxr_status).addMBB(loop1MBB); - BB->addSuccessor(loop1MBB); - BB->addSuccessor(exitMBB); - - // exitMBB: - // ... - BB = exitMBB; - - MI->eraseFromParent(); // The instruction is gone now. 
- - return BB; +const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: + return nullptr; + case AArch64ISD::CALL: return "AArch64ISD::CALL"; + case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; + case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; + case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; + case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; + case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; + case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; + case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; + case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; + case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; + case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; + case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; + case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL"; + case AArch64ISD::ADC: return "AArch64ISD::ADC"; + case AArch64ISD::SBC: return "AArch64ISD::SBC"; + case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; + case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; + case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; + case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; + case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; + case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; + case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; + case AArch64ISD::DUP: return "AArch64ISD::DUP"; + case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; + case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; + case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; + case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; + case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; + case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; + case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; + case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; + case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; + case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; + case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; + case AArch64ISD::BICi: return "AArch64ISD::BICi"; + case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; + case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::NEG: return "AArch64ISD::NEG"; + case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; + case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; + case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; + case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; + case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; + case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; + case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; + case AArch64ISD::REV16: return "AArch64ISD::REV16"; + case AArch64ISD::REV32: return "AArch64ISD::REV32"; + case AArch64ISD::REV64: return "AArch64ISD::REV64"; + case AArch64ISD::EXT: return "AArch64ISD::EXT"; + case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; + case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; + case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; + case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; + case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; + case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; + case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; + case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; + case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; + case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; + case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; + case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; + case AArch64ISD::CMGEz: return 
"AArch64ISD::CMGEz"; + case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; + case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; + case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; + case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; + case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; + case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; + case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; + case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::NOT: return "AArch64ISD::NOT"; + case AArch64ISD::BIT: return "AArch64ISD::BIT"; + case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; + case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; + case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; + case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; + case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; + case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; + case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; + case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; + case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; + case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; + case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; + case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; + case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; + case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; + case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; + case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; + case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; + case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; + case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; + case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; + case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; + case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; + case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; + case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; + case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; + case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; + case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; + case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; + case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; + case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; + case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; + case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; + case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; + case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; + case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; + } } MachineBasicBlock * AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const { - // We materialise the F128CSEL pseudo-instruction using conditional branches - // and loads, giving an instruciton sequence like: - // str q0, [sp] - // b.ne IfTrue - // b Finish - // IfTrue: - // str q1, [sp] - // Finish: - // ldr q0, [sp] - // - // Using virtual registers would probably not be beneficial since COPY - // instructions are expensive for f128 (there's no actual instruction to - // implement them). - // - // An alternative would be to do an integer-CSEL on some address. E.g.: - // mov x0, sp - // add x1, sp, #16 - // str q0, [x0] - // str q1, [x1] - // csel x0, x0, x1, ne - // ldr q0, [x0] - // - // It's unclear which approach is actually optimal. 
+ // We materialise the F128CSEL pseudo-instruction as some control flow and a + // phi node: + + // OrigBB: + // [... previous instrs leading to comparison ...] + // b.ne TrueBB + // b EndBB + // TrueBB: + // ; Fallthrough + // EndBB: + // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); MachineFunction *MF = MBB->getParent(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); @@ -916,49 +792,24 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); - // We need somewhere to store the f128 value needed. - int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16); - - // [... start of incoming MBB ...] - // str qIFFALSE, [sp] - // b.cc IfTrue - // b Done - BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR)) - .addReg(IfFalseReg) - .addFrameIndex(ScratchFI) - .addImm(0); - BuildMI(MBB, DL, TII->get(AArch64::Bcc)) - .addImm(CondCode) - .addMBB(TrueBB); - BuildMI(MBB, DL, TII->get(AArch64::Bimm)) - .addMBB(EndBB); + BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); + BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); + // TrueBB falls through to the end. + TrueBB->addSuccessor(EndBB); + if (!NZCVKilled) { - // NZCV is live-through TrueBB. TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } - // IfTrue: - // str qIFTRUE, [sp] - BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR)) - .addReg(IfTrueReg) - .addFrameIndex(ScratchFI) - .addImm(0); - - // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the - // blocks. - TrueBB->addSuccessor(EndBB); - - // Done: - // ldr qDEST, [sp] - // [... rest of incoming MBB ...] 
- MachineInstr *StartOfEnd = EndBB->begin(); - BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg) - .addFrameIndex(ScratchFI) - .addImm(0); + BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) + .addReg(IfTrueReg) + .addMBB(TrueBB) + .addReg(IfFalseReg) + .addMBB(MBB); MI->eraseFromParent(); return EndBB; @@ -966,854 +817,1136 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineBasicBlock * AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const { + MachineBasicBlock *BB) const { switch (MI->getOpcode()) { - default: llvm_unreachable("Unhandled instruction with custom inserter"); + default: +#ifndef NDEBUG + MI->dump(); +#endif + assert(0 && "Unexpected instruction for custom inserter!"); + break; + case AArch64::F128CSEL: - return EmitF128CSEL(MI, MBB); - case AArch64::ATOMIC_LOAD_ADD_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl); - case AArch64::ATOMIC_LOAD_ADD_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl); - case AArch64::ATOMIC_LOAD_ADD_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl); - case AArch64::ATOMIC_LOAD_ADD_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl); - - case AArch64::ATOMIC_LOAD_SUB_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl); - case AArch64::ATOMIC_LOAD_SUB_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl); - case AArch64::ATOMIC_LOAD_SUB_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl); - case AArch64::ATOMIC_LOAD_SUB_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl); - - case AArch64::ATOMIC_LOAD_AND_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl); - case AArch64::ATOMIC_LOAD_AND_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl); - case AArch64::ATOMIC_LOAD_AND_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl); - case AArch64::ATOMIC_LOAD_AND_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl); - - case AArch64::ATOMIC_LOAD_OR_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl); - case AArch64::ATOMIC_LOAD_OR_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl); - case AArch64::ATOMIC_LOAD_OR_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl); - case AArch64::ATOMIC_LOAD_OR_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl); - - case AArch64::ATOMIC_LOAD_XOR_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl); - case AArch64::ATOMIC_LOAD_XOR_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl); - case AArch64::ATOMIC_LOAD_XOR_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl); - case AArch64::ATOMIC_LOAD_XOR_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl); - - case AArch64::ATOMIC_LOAD_NAND_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl); - case AArch64::ATOMIC_LOAD_NAND_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl); - case AArch64::ATOMIC_LOAD_NAND_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl); - case AArch64::ATOMIC_LOAD_NAND_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl); - - case AArch64::ATOMIC_LOAD_MIN_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT); - case AArch64::ATOMIC_LOAD_MIN_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT); - case AArch64::ATOMIC_LOAD_MIN_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT); - case 
AArch64::ATOMIC_LOAD_MIN_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT); - - case AArch64::ATOMIC_LOAD_MAX_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT); - case AArch64::ATOMIC_LOAD_MAX_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT); - case AArch64::ATOMIC_LOAD_MAX_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT); - case AArch64::ATOMIC_LOAD_MAX_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT); - - case AArch64::ATOMIC_LOAD_UMIN_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI); - case AArch64::ATOMIC_LOAD_UMIN_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI); - case AArch64::ATOMIC_LOAD_UMIN_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI); - case AArch64::ATOMIC_LOAD_UMIN_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI); - - case AArch64::ATOMIC_LOAD_UMAX_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO); - case AArch64::ATOMIC_LOAD_UMAX_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO); - case AArch64::ATOMIC_LOAD_UMAX_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO); - case AArch64::ATOMIC_LOAD_UMAX_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO); - - case AArch64::ATOMIC_SWAP_I8: - return emitAtomicBinary(MI, MBB, 1, 0); - case AArch64::ATOMIC_SWAP_I16: - return emitAtomicBinary(MI, MBB, 2, 0); - case AArch64::ATOMIC_SWAP_I32: - return emitAtomicBinary(MI, MBB, 4, 0); - case AArch64::ATOMIC_SWAP_I64: - return emitAtomicBinary(MI, MBB, 8, 0); - - case AArch64::ATOMIC_CMP_SWAP_I8: - return emitAtomicCmpSwap(MI, MBB, 1); - case AArch64::ATOMIC_CMP_SWAP_I16: - return emitAtomicCmpSwap(MI, MBB, 2); - case AArch64::ATOMIC_CMP_SWAP_I32: - return emitAtomicCmpSwap(MI, MBB, 4); - case AArch64::ATOMIC_CMP_SWAP_I64: - return emitAtomicCmpSwap(MI, MBB, 8); + return EmitF128CSEL(MI, BB); + + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); } + llvm_unreachable("Unexpected instruction for custom inserter!"); } +//===----------------------------------------------------------------------===// +// AArch64 Lowering private implementation. 
+//===----------------------------------------------------------------------===// -const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC"; - case AArch64ISD::Call: return "AArch64ISD::Call"; - case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV"; - case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad"; - case AArch64ISD::BFI: return "AArch64ISD::BFI"; - case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; - case AArch64ISD::Ret: return "AArch64ISD::Ret"; - case AArch64ISD::SBFX: return "AArch64ISD::SBFX"; - case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC"; - case AArch64ISD::SETCC: return "AArch64ISD::SETCC"; - case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; - case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; - case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL"; - case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; - case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; - - case AArch64ISD::NEON_MOVIMM: - return "AArch64ISD::NEON_MOVIMM"; - case AArch64ISD::NEON_MVNIMM: - return "AArch64ISD::NEON_MVNIMM"; - case AArch64ISD::NEON_FMOVIMM: - return "AArch64ISD::NEON_FMOVIMM"; - case AArch64ISD::NEON_CMP: - return "AArch64ISD::NEON_CMP"; - case AArch64ISD::NEON_CMPZ: - return "AArch64ISD::NEON_CMPZ"; - case AArch64ISD::NEON_TST: - return "AArch64ISD::NEON_TST"; - case AArch64ISD::NEON_QSHLs: - return "AArch64ISD::NEON_QSHLs"; - case AArch64ISD::NEON_QSHLu: - return "AArch64ISD::NEON_QSHLu"; - case AArch64ISD::NEON_VDUP: - return "AArch64ISD::NEON_VDUP"; - case AArch64ISD::NEON_VDUPLANE: - return "AArch64ISD::NEON_VDUPLANE"; - case AArch64ISD::NEON_REV16: - return "AArch64ISD::NEON_REV16"; - case AArch64ISD::NEON_REV32: - return "AArch64ISD::NEON_REV32"; - case AArch64ISD::NEON_REV64: - return "AArch64ISD::NEON_REV64"; - case AArch64ISD::NEON_UZP1: - return "AArch64ISD::NEON_UZP1"; - case AArch64ISD::NEON_UZP2: - return "AArch64ISD::NEON_UZP2"; - case AArch64ISD::NEON_ZIP1: - return "AArch64ISD::NEON_ZIP1"; - case AArch64ISD::NEON_ZIP2: - return "AArch64ISD::NEON_ZIP2"; - case AArch64ISD::NEON_TRN1: - return "AArch64ISD::NEON_TRN1"; - case AArch64ISD::NEON_TRN2: - return "AArch64ISD::NEON_TRN2"; - case AArch64ISD::NEON_LD1_UPD: - return "AArch64ISD::NEON_LD1_UPD"; - case AArch64ISD::NEON_LD2_UPD: - return "AArch64ISD::NEON_LD2_UPD"; - case AArch64ISD::NEON_LD3_UPD: - return "AArch64ISD::NEON_LD3_UPD"; - case AArch64ISD::NEON_LD4_UPD: - return "AArch64ISD::NEON_LD4_UPD"; - case AArch64ISD::NEON_ST1_UPD: - return "AArch64ISD::NEON_ST1_UPD"; - case AArch64ISD::NEON_ST2_UPD: - return "AArch64ISD::NEON_ST2_UPD"; - case AArch64ISD::NEON_ST3_UPD: - return "AArch64ISD::NEON_ST3_UPD"; - case AArch64ISD::NEON_ST4_UPD: - return "AArch64ISD::NEON_ST4_UPD"; - case AArch64ISD::NEON_LD1x2_UPD: - return "AArch64ISD::NEON_LD1x2_UPD"; - case AArch64ISD::NEON_LD1x3_UPD: - return "AArch64ISD::NEON_LD1x3_UPD"; - case AArch64ISD::NEON_LD1x4_UPD: - return "AArch64ISD::NEON_LD1x4_UPD"; - case AArch64ISD::NEON_ST1x2_UPD: - return "AArch64ISD::NEON_ST1x2_UPD"; - case AArch64ISD::NEON_ST1x3_UPD: - return "AArch64ISD::NEON_ST1x3_UPD"; - case AArch64ISD::NEON_ST1x4_UPD: - return "AArch64ISD::NEON_ST1x4_UPD"; - case AArch64ISD::NEON_LD2DUP: - return "AArch64ISD::NEON_LD2DUP"; - case AArch64ISD::NEON_LD3DUP: - return "AArch64ISD::NEON_LD3DUP"; - case AArch64ISD::NEON_LD4DUP: - return "AArch64ISD::NEON_LD4DUP"; - case AArch64ISD::NEON_LD2DUP_UPD: - return 
"AArch64ISD::NEON_LD2DUP_UPD"; - case AArch64ISD::NEON_LD3DUP_UPD: - return "AArch64ISD::NEON_LD3DUP_UPD"; - case AArch64ISD::NEON_LD4DUP_UPD: - return "AArch64ISD::NEON_LD4DUP_UPD"; - case AArch64ISD::NEON_LD2LN_UPD: - return "AArch64ISD::NEON_LD2LN_UPD"; - case AArch64ISD::NEON_LD3LN_UPD: - return "AArch64ISD::NEON_LD3LN_UPD"; - case AArch64ISD::NEON_LD4LN_UPD: - return "AArch64ISD::NEON_LD4LN_UPD"; - case AArch64ISD::NEON_ST2LN_UPD: - return "AArch64ISD::NEON_ST2LN_UPD"; - case AArch64ISD::NEON_ST3LN_UPD: - return "AArch64ISD::NEON_ST3LN_UPD"; - case AArch64ISD::NEON_ST4LN_UPD: - return "AArch64ISD::NEON_ST4LN_UPD"; - case AArch64ISD::NEON_VEXTRACT: - return "AArch64ISD::NEON_VEXTRACT"; - default: - return nullptr; +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 +/// CC +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { + switch (CC) { + default: + llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: + return AArch64CC::NE; + case ISD::SETEQ: + return AArch64CC::EQ; + case ISD::SETGT: + return AArch64CC::GT; + case ISD::SETGE: + return AArch64CC::GE; + case ISD::SETLT: + return AArch64CC::LT; + case ISD::SETLE: + return AArch64CC::LE; + case ISD::SETUGT: + return AArch64CC::HI; + case ISD::SETUGE: + return AArch64CC::HS; + case ISD::SETULT: + return AArch64CC::LO; + case ISD::SETULE: + return AArch64CC::LS; } } -static const MCPhysReg AArch64FPRArgRegs[] = { - AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, - AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7 -}; -static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs); - -static const MCPhysReg AArch64ArgRegs[] = { - AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, - AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 -}; -static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs); - -static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - // Mark all remaining general purpose registers as allocated. We don't - // backtrack: if (for example) an i128 gets put on the stack, no subsequent - // i64 will go in registers (C.11). - for (unsigned i = 0; i < NumArgRegs; ++i) - State.AllocateReg(AArch64ArgRegs[i]); - - return false; +/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 
+static void changeFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: + CondCode = AArch64CC::EQ; + break; + case ISD::SETGT: + case ISD::SETOGT: + CondCode = AArch64CC::GT; + break; + case ISD::SETGE: + case ISD::SETOGE: + CondCode = AArch64CC::GE; + break; + case ISD::SETOLT: + CondCode = AArch64CC::MI; + break; + case ISD::SETOLE: + CondCode = AArch64CC::LS; + break; + case ISD::SETONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case ISD::SETO: + CondCode = AArch64CC::VC; + break; + case ISD::SETUO: + CondCode = AArch64CC::VS; + break; + case ISD::SETUEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case ISD::SETUGT: + CondCode = AArch64CC::HI; + break; + case ISD::SETUGE: + CondCode = AArch64CC::PL; + break; + case ISD::SETLT: + case ISD::SETULT: + CondCode = AArch64CC::LT; + break; + case ISD::SETLE: + case ISD::SETULE: + CondCode = AArch64CC::LE; + break; + case ISD::SETNE: + case ISD::SETUNE: + CondCode = AArch64CC::NE; + break; + } } -#include "AArch64GenCallingConv.inc" - -CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { - - switch(CC) { - default: llvm_unreachable("Unsupported calling convention"); - case CallingConv::Fast: - case CallingConv::C: - return CC_A64_APCS; +/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 +/// CC usable with the vector instructions. Fewer operations are available +/// without a real NZCV register, so we have to use less efficient combinations +/// to get the same effect. +static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2, + bool &Invert) { + Invert = false; + switch (CC) { + default: + // Mostly the scalar mappings work fine. + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GE; + break; + case ISD::SETUEQ: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETUGT: + case ISD::SETUGE: + // All of the compare-mask comparisons are ordered, but we can switch + // between the two by a double inversion. E.g. ULE == !OGT. + Invert = true; + changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); + break; } } -void -AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - SDLoc DL, SDValue &Chain) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); - - SmallVector MemOps; - - unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs, - NumArgRegs); - unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs, - NumFPRArgRegs); - - unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR); - int GPRIdx = 0; - if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); +static bool isLegalArithImmed(uint64_t C) { + // Matches AArch64DAGToDAGISel::SelectArithImmed(). 
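The predicate on the next line encodes the ADD/SUB/CMP immediate rule: a 12-bit value, optionally shifted left by 12. A standalone restatement of the same check (illustrative helper name, not from the patch):

    #include <cstdint>

    static inline bool is_arith_immed(uint64_t c) {
      return (c < (1u << 12)) ||                    // plain 12-bit immediate
             ((c & 0xFFFu) == 0 && c < (1u << 24)); // 12-bit immediate, LSL #12
    }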
+ return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); +} - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); +static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) { + EVT VT = LHS.getValueType(); + + if (VT.isFloatingPoint()) + return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + + // The CMP instruction is just an alias for SUBS, and representing it as + // SUBS means that it's possible to get CSE with subtract operations. + // A later phase can perform the optimization of setting the destination + // register to WZR/XZR if it ends up being unused. + unsigned Opcode = AArch64ISD::SUBS; + + if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && + cast(RHS.getOperand(0))->getZExtValue() == 0 && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on + // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags + // can be set differently by this operation. It comes down to whether + // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then + // everything is fine. If not then the optimization is wrong. Thus general + // comparisons are only valid if op2 != 0. + + // So, finally, the only LLVM-native comparisons that don't mention C and V + // are SETEQ and SETNE. They're the only ones we can safely use CMN for in + // the absence of information about op2. + Opcode = AArch64ISD::ADDS; + RHS = RHS.getOperand(1); + } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && + cast(RHS)->getZExtValue() == 0 && + !isUnsignedIntSetCC(CC)) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. + Opcode = AArch64ISD::ANDS; + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + } + + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + .getValue(1); +} - for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); +static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { + EVT VT = RHS.getValueType(); + uint64_t C = RHSC->getZExtValue(); + if (!isLegalArithImmed(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: + break; + case ISD::SETLT: + case ISD::SETGE: + if ((VT == MVT::i32 && C != 0x80000000 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0x80000000ULL && + isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if ((VT == MVT::i32 && C != 0 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + C = (VT == MVT::i32) ? 
(uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if ((VT == MVT::i32 && C != 0x7fffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0x7ffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if ((VT == MVT::i32 && C != 0xffffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0xfffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + } } } - if (getSubtarget()->hasFPARMv8()) { - unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); - int FPRIdx = 0; - // According to the AArch64 Procedure Call Standard, section B.1/B.3, we - // can omit a register save area if we know we'll never use registers of - // that class. - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); + return Cmp; +} - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], - &AArch64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); +static std::pair +getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { + assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && + "Unsupported value type"); + SDValue Value, Overflow; + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Opc = 0; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::VS; + break; + case ISD::UADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::HS; + break; + case ISD::SSUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::VS; + break; + case ISD::USUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::LO; + break; + // Multiply needs a little bit extra work. + case ISD::SMULO: + case ISD::UMULO: { + CC = AArch64CC::NE; + bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; + if (Op.getValueType() == MVT::i32) { + unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + // For a 32 bit multiply with overflow check we want the instruction + // selector to generate a widening multiply (SMADDL/UMADDL). For that we + // need to generate the following pattern: + // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) + LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); + RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, + DAG.getConstant(0, MVT::i64)); + // On AArch64 the upper 32 bits are always zero extended for a 32 bit + // operation. 
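
A side note on the immediate handling above: isLegalArithImmed accepts exactly what ADDS/SUBS can encode (a 12-bit immediate, optionally shifted left by 12), and getAArch64Cmp tries to rescue an unencodable constant by nudging it by one and flipping the comparison. A self-contained sketch of that idea; the helper name here is illustrative, not from the patch:

#include <cassert>
#include <cstdint>

// Same shape as isLegalArithImmed(): 12-bit immediate, optionally shifted
// left by 12 bits.
static bool isArithImmed(uint64_t C) {
  return (C >> 12) == 0 || ((C & 0xFFFULL) == 0 && (C >> 24) == 0);
}

int main() {
  // 0xFFF001 is not encodable directly...
  assert(!isArithImmed(0xFFF001));
  // ...but for unsigned x, "x < 0xFFF001" is "x <= 0xFFF000", and 0xFFF000 is
  // a legal shifted immediate -- the SETULT -> SETULE adjustment above.
  assert(isArithImmed(0xFFF000));
  return 0;
}
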
We need to clear out the upper 32 bits, because we used a + // widening multiply that wrote all 64 bits. In the end this should be a + // noop. + Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); + if (IsSigned) { + // The signed overflow check requires more than just a simple check for + // any bit set in the upper 32 bits of the result. These bits could be + // just the sign bits of a negative number. To perform the overflow + // check we have to arithmetic shift right the 32nd bit of the result by + // 31 bits. Then we compare the result to the upper 32 bits. + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, + DAG.getConstant(32, MVT::i64)); + UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, + DAG.getConstant(31, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + // The overflow check for unsigned multiply is easy. We only need to + // check if any of the upper 32 bits are set. This can be done with a + // CMP (shifted register). For that we need to generate the following + // pattern: + // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, + DAG.getConstant(32, MVT::i64)); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); } + break; } - FuncInfo->setVariadicFPRIdx(FPRIdx); - FuncInfo->setVariadicFPRSize(FPRSaveSize); + assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); + // For the 64 bit multiply + Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + if (IsSigned) { + SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, + DAG.getConstant(63, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); + } + break; } + } // switch (...) - unsigned StackOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), 8); - int StackIdx = MFI->CreateFixedObject(8, StackOffset, true); - - FuncInfo->setVariadicStackIdx(StackIdx); - FuncInfo->setVariadicGPRIdx(GPRIdx); - FuncInfo->setVariadicGPRSize(GPRSaveSize); + if (Opc) { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - if (!MemOps.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + // Emit the AArch64 operation with overflow check. 
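
For the signed-multiply overflow test above: a 32-bit product fits iff the top 32 bits of the widened 64-bit product equal the sign-extension of bit 31 of the low half, which is what the SRL/SRA/SUBS sequence checks. A plain-integer sketch of the same test (hypothetical helper, not the DAG code):

#include <cassert>
#include <cstdint>

// Widen, multiply, then compare the upper 32 bits against the sign bits of
// the truncated result -- the NE outcome plays the role of the overflow flag.
static bool smulo32(int32_t A, int32_t B, int32_t &Result) {
  int64_t Wide = (int64_t)A * (int64_t)B;
  Result = (int32_t)Wide;                // the TRUNCATE of the widened multiply
  int32_t Upper = (int32_t)(Wide >> 32); // UpperBits
  int32_t Lower = Result >> 31;          // LowerBits (arithmetic shift, as SRA)
  return Upper != Lower;                 // SUBS ... + AArch64CC::NE
}

int main() {
  int32_t R;
  assert(!smulo32(46340, 46340, R)); // 2147395600 still fits in i32
  assert(smulo32(46341, 46341, R));  // 2147488281 overflows
  return 0;
}
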
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); + Overflow = Value.getValue(1); } + return std::make_pair(Value, Overflow); } +SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); -SDValue -AArch64TargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); +static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { + SDValue Sel = Op.getOperand(0); + SDValue Other = Op.getOperand(1); - SmallVector ArgValues; + // If neither operand is a SELECT_CC, give up. + if (Sel.getOpcode() != ISD::SELECT_CC) + std::swap(Sel, Other); + if (Sel.getOpcode() != ISD::SELECT_CC) + return Op; - SDValue ArgValue; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - ISD::ArgFlagsTy Flags = Ins[i].Flags; + // The folding we want to perform is: + // (xor x, (select_cc a, b, cc, 0, -1) ) + // --> + // (csel x, (xor x, -1), cc ...) + // + // The latter will get matched to a CSINV instruction. - if (Flags.isByVal()) { - // Byval is used for small structs and HFAs in the PCS, but the system - // should work in a non-compliant manner for larger structs. - EVT PtrTy = getPointerTy(); - int Size = Flags.getByValSize(); - unsigned NumRegs = (Size + 7) / 8; + ISD::CondCode CC = cast(Sel.getOperand(4))->get(); + SDValue LHS = Sel.getOperand(0); + SDValue RHS = Sel.getOperand(1); + SDValue TVal = Sel.getOperand(2); + SDValue FVal = Sel.getOperand(3); + SDLoc dl(Sel); - uint32_t BEAlign = 0; - if (Size < 8 && !getSubtarget()->isLittle()) - BEAlign = 8-Size; - unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, - VA.getLocMemOffset() + BEAlign, - false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); - InVals.push_back(FrameIdxN); + // FIXME: This could be generalized to non-integer comparisons. + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return Op; - continue; - } else if (VA.isRegLoc()) { - MVT RegVT = VA.getLocVT(); - const TargetRegisterClass *RC = getRegClassFor(RegVT); - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ConstantSDNode *CFVal = dyn_cast(FVal); + ConstantSDNode *CTVal = dyn_cast(TVal); - ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); - } else { // VA.isRegLoc() - assert(VA.isMemLoc()); + // The the values aren't constants, this isn't the pattern we're looking for. + if (!CFVal || !CTVal) + return Op; - int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, - VA.getLocMemOffset(), true); + // We can commute the SELECT_CC by inverting the condition. This + // might be needed to make this fit into a CSINV pattern. 
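
The LowerXOR fold above is the scalar identity x ^ (cond ? 0 : -1) == (cond ? x : ~x), and the right-hand side is a single CSINV. A tiny standalone check of the identity, with illustrative names:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// The form the patch starts from: xor of x with a 0/-1 select.
static uint64_t xorOfSelect(uint64_t X, bool Cond) {
  return X ^ (Cond ? 0ULL : ~0ULL);
}

// The form it rewrites to: select between x and ~x, i.e. what CSINV computes.
static uint64_t cselForm(uint64_t X, bool Cond) {
  return Cond ? X : ~X;
}

int main() {
  for (uint64_t X : {0ULL, 1ULL, 0xdeadbeefULL})
    for (bool Cond : {false, true})
      assert(xorOfSelect(X, Cond) == cselForm(X, Cond));
  return 0;
}
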
+ if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + // If the constants line up, perform the transform! + if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + FVal = Other; + TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, + DAG.getConstant(-1ULL, Other.getValueType())); - } + return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, + CCVal, Cmp); + } - switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: break; - case CCValAssign::BCvt: - ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue); - break; - case CCValAssign::SExt: - case CCValAssign::ZExt: - case CCValAssign::AExt: - case CCValAssign::FPExt: { - unsigned DestSize = VA.getValVT().getSizeInBits(); - unsigned DestSubReg; - - switch (DestSize) { - case 8: DestSubReg = AArch64::sub_8; break; - case 16: DestSubReg = AArch64::sub_16; break; - case 32: DestSubReg = AArch64::sub_32; break; - case 64: DestSubReg = AArch64::sub_64; break; - default: llvm_unreachable("Unexpected argument promotion"); - } + return Op; +} - ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, - VA.getValVT(), ArgValue, - DAG.getTargetConstant(DestSubReg, MVT::i32)), - 0); - break; - } - } +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); - InVals.push_back(ArgValue); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Invalid code"); + case ISD::ADDC: + Opc = AArch64ISD::ADDS; + break; + case ISD::SUBC: + Opc = AArch64ISD::SUBS; + break; + case ISD::ADDE: + Opc = AArch64ISD::ADCS; + ExtraOp = true; + break; + case ISD::SUBE: + Opc = AArch64ISD::SBCS; + ExtraOp = true; + break; } - if (isVarArg) - SaveVarArgRegisters(CCInfo, DAG, dl, Chain); + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2)); +} - unsigned StackArgSize = CCInfo.getNextStackOffset(); - if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { - // This is a non-standard ABI so by fiat I say we're allowed to make full - // use of the stack area to be popped, which must be aligned to 16 bytes in - // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) + return SDValue(); - // If we're expected to restore the stack (e.g. fastcc) then we'll be adding - // a multiple of 16. - FuncInfo->setArgumentStackToRestore(StackArgSize); + AArch64CC::CondCode CC; + // The actual operation that sets the overflow or carry flag. + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); - // This realignment carries over to the available bytes below. 
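
On the ADDC/ADDE/SUBC/SUBE mapping above: ADDS produces the carry and ADCS consumes it, which is how, for example, an i128 add gets split across two 64-bit registers. A minimal sketch of those semantics, not the lowering code itself:

#include <cassert>
#include <cstdint>

// 128-bit add from 64-bit halves: the flag-setting add (ADDS) feeds the
// add-with-carry (ADCS).
static void add128(uint64_t ALo, uint64_t AHi, uint64_t BLo, uint64_t BHi,
                   uint64_t &Lo, uint64_t &Hi) {
  Lo = ALo + BLo;              // ADDS: low halves, sets C
  uint64_t Carry = Lo < ALo;   // the C flag
  Hi = AHi + BHi + Carry;      // ADCS: high halves plus carry
}

int main() {
  uint64_t Lo, Hi;
  add128(~0ULL, 0, 1, 0, Lo, Hi); // (2^64 - 1) + 1
  assert(Lo == 0 && Hi == 1);
  return 0;
}
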
Our own - // callers will guarantee the space is free by giving an aligned value to - // CALLSEQ_START. - } - // Even if we're not expected to free up the space, it's useful to know how - // much is there while considering tail calls (because we can reuse it). - FuncInfo->setBytesInStackArgArea(StackArgSize); + // We use 0 and 1 as false and true values. + SDValue TVal = DAG.getConstant(1, MVT::i32); + SDValue FVal = DAG.getConstant(0, MVT::i32); - return Chain; + // We use an inverted condition, because the conditional select is inverted + // too. This will allow it to be selected to a single instruction: + // CSINC Wd, WZR, WZR, invert(cond). + SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); + Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, + CCVal, Overflow); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); } -SDValue -AArch64TargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const { - // CCValAssign - represent the assignment of the return value to a location. - SmallVector RVLocs; +// Prefetch operands are: +// 1: Address to prefetch +// 2: bool isWrite +// 3: int locality (0 = no locality ... 3 = extreme locality) +// 4: bool isDataCache +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); + unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); + // The data thing is not used. + // unsigned isData = cast(Op.getOperand(4))->getZExtValue(); + + bool IsStream = !Locality; + // When the locality number is set + if (Locality) { + // The front-end should have filtered out the out-of-range values + assert(Locality <= 3 && "Prefetch locality out-of-range"); + // The locality degree is the opposite of the cache speed. + // Put the number the other way around. + // The encoding starts at 0 for level 1 + Locality = 3 - Locality; + } + + // built the mask value encoding the expected behavior. + unsigned PrfOp = (IsWrite << 4) | // Load/Store bit + (Locality << 1) | // Cache level bits + (unsigned)IsStream; // Stream bit + return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), + DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); +} - // CCState - Info about the registers and stack slots. - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); +SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); - // Analyze outgoing return values. - CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv)); + RTLIB::Libcall LC; + LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); - SDValue Flag; - SmallVector RetOps(1, Chain); + return LowerF128Call(Op, DAG, LC); +} - for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { - // PCS: "If the type, T, of the result of a function is such that - // void func(T arg) would require that arg be passed as a value in a - // register (or set of registers) according to the rules in 5.4, then the - // result is returned in the same registers as would be used for such an - // argument. - // - // Otherwise, the caller shall reserve a block of memory of sufficient - // size and alignment to hold the result. 
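
About the PrfOp value built in LowerPREFETCH above: it packs the PRFM operand as (write << 4) | (level << 1) | stream, where the level is 3 minus the locality hint, so a read prefetch with maximal locality ends up as level-1 "keep" (PLDL1KEEP, encoding 0). A small sketch of the same packing; the helper name is made up:

#include <cassert>

// Mirrors the PrfOp computation: load/store bit, cache-level bits, stream bit.
static unsigned encodePrfOp(bool IsWrite, unsigned Locality /* 0..3 */) {
  bool IsStream = (Locality == 0);              // no locality -> streaming
  unsigned Level = Locality ? 3 - Locality : 0; // 0 encodes L1
  return ((unsigned)IsWrite << 4) | (Level << 1) | (unsigned)IsStream;
}

int main() {
  assert(encodePrfOp(false, 3) == 0x00); // PLDL1KEEP
  assert(encodePrfOp(false, 0) == 0x01); // PLDL1STRM
  assert(encodePrfOp(true, 3) == 0x10);  // PSTL1KEEP
  return 0;
}
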
The address of the memory block - // shall be passed as an additional argument to the function in x8." - // - // This is implemented in two places. The register-return values are dealt - // with here, more complex returns are passed as an sret parameter, which - // means we don't have to worry about it during actual return. - CCValAssign &VA = RVLocs[i]; - assert(VA.isRegLoc() && "Only register-returns should be created by PCS"); +SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + RTLIB::Libcall LC; + LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); - SDValue Arg = OutVals[i]; + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, + /*isSigned*/ false, SDLoc(Op)).first; +} - // There's no convenient note in the ABI about this as there is for normal - // arguments, but it says return values are passed in the same registers as - // an argument would be. I believe that includes the comments about - // unspecified higher bits, putting the burden of widening on the *caller* - // for return values. - switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info"); - case CCValAssign::Full: break; - case CCValAssign::SExt: - case CCValAssign::ZExt: - case CCValAssign::AExt: - // Floating-point values should only be extended when they're going into - // memory, which can't happen here so an integer extend is acceptable. - Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - break; - } +static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT InVT = Op.getOperand(0).getValueType(); + EVT VT = Op.getValueType(); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + if (VT.getSizeInBits() < InVT.getSizeInBits()) { + SDLoc dl(Op); + SDValue Cv = + DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), + Op.getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } - RetOps[0] = Chain; // Update chain. - - // Add the flag if we have it. - if (Flag.getNode()) - RetOps.push_back(Flag); + if (VT.getSizeInBits() > InVT.getSizeInBits()) { + SDLoc dl(Op); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, VT, Ext); + } - return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other, RetOps); + // Type changing conversions are illegal. + return Op; } -unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const { - // This is a new backend. For anything more precise than this a FE should - // set an explicit alignment. 
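
On LowerVectorFP_TO_INT above: when the integer result is narrower than the FP source (say v2f64 to v2i32), the conversion is done in the wide integer type and then truncated, which is equivalent for every value the narrow result could legally hold. The scalar picture, as a rough sketch:

#include <cassert>
#include <cstdint>

// v2f64 -> v2i32 per lane: convert to i64 first, then narrow.
static int32_t narrowingConvert(double D) {
  int64_t Wide = (int64_t)D; // the wide FP-to-int conversion
  return (int32_t)Wide;      // the TRUNCATE back to the requested width
}

int main() {
  assert(narrowingConvert(42.9) == 42);
  assert(narrowingConvert(-7.5) == -7);
  return 0;
}
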
- return 4; -} +SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType().isVector()) + return LowerVectorFP_TO_INT(Op, DAG); -SDValue -AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &dl = CLI.DL; - SmallVectorImpl &Outs = CLI.Outs; - SmallVectorImpl &OutVals = CLI.OutVals; - SmallVectorImpl &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; - bool IsVarArg = CLI.IsVarArg; + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } - MachineFunction &MF = DAG.getMachineFunction(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet(); - bool IsSibCall = false; + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - if (IsTailCall) { - IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), - Outs, OutVals, Ins, DAG); + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); - if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); + return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} - // A sibling call is one where we're under the usual C ABI and not planning - // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall) - IsSibCall = true; - } +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + EVT InVT = In.getValueType(); - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); + if (VT.getSizeInBits() < InVT.getSizeInBits()) { + MVT CastVT = + MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), + InVT.getVectorNumElements()); + In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); + return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); + } - // On AArch64 (and all other architectures I'm aware of) the most this has to - // do is adjust the stack pointer. - unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16); - if (IsSibCall) { - // Since we're not changing the ABI to make this a tail call, the memory - // operands are already available in the caller's incoming argument space. - NumBytes = 0; + if (VT.getSizeInBits() > InVT.getSizeInBits()) { + unsigned CastOpc = + Op.getOpcode() == ISD::SINT_TO_FP ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + EVT CastVT = VT.changeVectorElementTypeToInteger(); + In = DAG.getNode(CastOpc, dl, CastVT, In); + return DAG.getNode(Op.getOpcode(), dl, VT, In); } - // FPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int FPDiff = 0; + return Op; +} - if (IsTailCall && !IsSibCall) { - unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); +SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorINT_TO_FP(Op, DAG); - // FPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - FPDiff = NumReusableBytes - NumBytes; + // i128 conversions are libcalls. + if (Op.getOperand(0).getValueType() == MVT::i128) + return SDValue(); - // The stack pointer must be 16-byte aligned at all times it's used for a - // memory operation, which in practice means at *all* times and in - // particular across call boundaries. Therefore our own arguments started at - // a 16-byte aligned SP and the delta applied for the tail call should - // satisfy the same constraint. - assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + // Other conversions are legal, unless it's to the completely software-based + // fp128. + if (Op.getValueType() != MVT::f128) + return Op; + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::SINT_TO_FP) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, + SelectionDAG &DAG) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = + (ArgVT == MVT::f64) ? 
"__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); + + std::pair CallResult = LowerCallTo(CLI); + return CallResult.first; +} + +SDValue AArch64TargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operand"); + return SDValue(); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::SETCC: + return LowerSETCC(Op, DAG); + case ISD::BR_CC: + return LowerBR_CC(Op, DAG); + case ISD::SELECT: + return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + case ISD::VACOPY: + return LowerVACOPY(Op, DAG); + case ISD::VAARG: + return LowerVAARG(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: + return LowerXALUO(Op, DAG); + case ISD::FADD: + return LowerF128Call(Op, DAG, RTLIB::ADD_F128); + case ISD::FSUB: + return LowerF128Call(Op, DAG, RTLIB::SUB_F128); + case ISD::FMUL: + return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: + return LowerF128Call(Op, DAG, RTLIB::DIV_F128); + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + return LowerFP_EXTEND(Op, DAG); + case ISD::FRAMEADDR: + return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: + return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: + return LowerVectorSRA_SRL_SHL(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG); + case ISD::CTPOP: + return LowerCTPOP(Op, DAG); + case ISD::FCOPYSIGN: + return LowerFCOPYSIGN(Op, DAG); + case ISD::AND: + return LowerVectorAND(Op, DAG); + case ISD::OR: + return LowerVectorOR(Op, DAG); + case ISD::XOR: + return LowerXOR(Op, DAG); + case ISD::PREFETCH: + return LowerPREFETCH(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); + case ISD::FSINCOS: + return LowerFSINCOS(Op, DAG); } +} - if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), - dl); +/// getFunctionAlignment - Return the Log2 alignment of this function. 
+unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { + return 2; +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP, - getPointerTy()); +#include "AArch64GenCallingConv.inc" - SmallVector MemOpChains; - SmallVector, 8> RegsToPass; +/// Selects the correct CCAssignFn for a the given CallingConvention +/// value. +CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention."); + case CallingConv::WebKit_JS: + return CC_AArch64_WebKit_JS; + case CallingConv::C: + case CallingConv::Fast: + if (!Subtarget->isTargetDarwin()) + return CC_AArch64_AAPCS; + return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + } +} + +SDValue AArch64TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Assign locations to all of the incoming arguments. + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + // At this point, Ins[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeFormalArguments to pass in ValVT and + // LocVT. + unsigned NumArgs = Ins.size(); + Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Ins[i].VT; + std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[i].OrigArgIndex; + + // Get type of the original argument. + EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Ins.size()); + SmallVector ArgValues; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; - SDValue Arg = OutVals[i]; - - // Callee does the actual widening, so all extensions just use an implicit - // definition of the rest of the Loc. Aesthetically, this would be nicer as - // an ANY_EXTEND, but that isn't valid for floating-point types and this - // alternative works on integer types too. 
- switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: break; - case CCValAssign::SExt: - case CCValAssign::ZExt: - case CCValAssign::AExt: - case CCValAssign::FPExt: { - unsigned SrcSize = VA.getValVT().getSizeInBits(); - unsigned SrcSubReg; - - switch (SrcSize) { - case 8: SrcSubReg = AArch64::sub_8; break; - case 16: SrcSubReg = AArch64::sub_16; break; - case 32: SrcSubReg = AArch64::sub_32; break; - case 64: SrcSubReg = AArch64::sub_64; break; - default: llvm_unreachable("Unexpected argument promotion"); - } - Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VA.getLocVT(), - DAG.getUNDEF(VA.getLocVT()), - Arg, - DAG.getTargetConstant(SrcSubReg, MVT::i32)), - 0); + if (Ins[i].Flags.isByVal()) { + // Byval is used for HFAs in the PCS, but the system should work in a + // non-compliant manner for larger structs. + EVT PtrTy = getPointerTy(); + int Size = Ins[i].Flags.getByValSize(); + unsigned NumRegs = (Size + 7) / 8; - break; - } - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - break; - } + // FIXME: This works on big-endian for composite byvals, which are the common + // case. It should also work for fundamental types too. + unsigned FrameIdx = + MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + InVals.push_back(FrameIdxN); - if (VA.isRegLoc()) { - // A normal register (sub-) argument. For now we just note it down because - // we want to copy things into registers as late as possible to avoid - // register-pressure (and possibly worse). - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); continue; } + + if (VA.isRegLoc()) { + // Arguments stored in registers. + EVT RegVT = VA.getLocVT(); + + SDValue ArgValue; + const TargetRegisterClass *RC; + + if (RegVT == MVT::i32) + RC = &AArch64::GPR32RegClass; + else if (RegVT == MVT::i64) + RC = &AArch64::GPR64RegClass; + else if (RegVT == MVT::f32) + RC = &AArch64::FPR32RegClass; + else if (RegVT == MVT::f64 || RegVT.is64BitVector()) + RC = &AArch64::FPR64RegClass; + else if (RegVT == MVT::f128 || RegVT.is128BitVector()) + RC = &AArch64::FPR128RegClass; + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); - assert(VA.isMemLoc() && "unexpected argument location"); + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); - SDValue DstAddr; - MachinePointerInfo DstInfo; - if (IsTailCall) { - uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() : - VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; - int32_t Offset = VA.getLocMemOffset() + FPDiff; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + // If this is an 8, 16 or 32-bit value, it is really passed promoted + // to 64 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::AExt: + case CCValAssign::SExt: + case CCValAssign::ZExt: + // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt + // nodes after our lowering. 
+ assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + } - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); - DstInfo = MachinePointerInfo::getFixedStack(FI); + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - // Make sure any stack arguments overlapping with where we're storing are - // loaded before this eventual operation. Otherwise they'll be clobbered. - Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); - } else { - uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize()*8 : - VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; uint32_t BEAlign = 0; - if (OpSize < 8 && !getSubtarget()->isLittle()) - BEAlign = 8-OpSize; - SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + BEAlign); + if (ArgSize < 8 && !Subtarget->isLittleEndian()) + BEAlign = 8 - ArgSize; - DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset()); - } + int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); - if (Flags.isByVal()) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64); - SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode, - Flags.getByValAlign(), - /*isVolatile = */ false, - /*alwaysInline = */ false, - DstInfo, MachinePointerInfo()); - MemOpChains.push_back(Cpy); - } else { - // Normal stack argument, put it where it's needed. - SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo, - false, false, 0); - MemOpChains.push_back(Store); - } - } + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue ArgValue; - // The loads and stores generated above shouldn't clash with each - // other. Combining them with this TokenFactor notes that fact for the rest of - // the backend. - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); - // Most of the rest of the instructions need to be glued together; we don't - // want assignments to actual registers used by a call to be rearranged by a - // well-meaning scheduler. - SDValue InFlag; + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } + ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + MemVT, false, false, false, nullptr); - // The linker is responsible for inserting veneers when necessary to put a - // function call destination in range, so we don't need to bother with a - // wrapper here. 
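
A note on the BEAlign adjustment above: on big-endian targets an argument smaller than 8 bytes sits in the most-significant end of its 8-byte stack slot, so the load offset is bumped by 8 - ArgSize; on little-endian it is read from the slot's base. Illustrative arithmetic only, with a made-up helper:

#include <cassert>

// Where within its 8-byte slot a small stack argument actually lives.
static unsigned loadOffset(unsigned SlotOffset, unsigned ArgSize,
                           bool LittleEndian) {
  unsigned BEAlign = (!LittleEndian && ArgSize < 8) ? 8 - ArgSize : 0;
  return SlotOffset + BEAlign;
}

int main() {
  assert(loadOffset(16, 2, /*LittleEndian=*/true) == 16);  // low end of the slot
  assert(loadOffset(16, 2, /*LittleEndian=*/false) == 22); // high end of the slot
  assert(loadOffset(16, 8, /*LittleEndian=*/false) == 16); // full slots need no shift
  return 0;
}
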
- if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + InVals.push_back(ArgValue); + } } - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, dl); - InFlag = Chain.getValue(1); + // varargs + if (isVarArg) { + if (!Subtarget->isTargetDarwin()) { + // The AAPCS variadic function ABI is identical to the non-variadic + // one. As a result there may be more arguments in registers and we should + // save them for future reference. + saveVarArgRegisters(CCInfo, DAG, DL, Chain); + } + + AArch64FunctionInfo *AFI = MF.getInfo(); + // This will point to the next argument passed via stack. + unsigned StackOffset = CCInfo.getNextStackOffset(); + // We currently pass all varargs at 8-byte alignment. + StackOffset = ((StackOffset + 7) & ~7); + AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } - // We produce the following DAG scheme for the actual call instruction: - // (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag? - // - // Most arguments aren't going to be used and just keep the values live as - // far as LLVM is concerned. It's expected to be selected as simply "bl - // callee" (for a direct, non-tail call). - std::vector Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + unsigned StackArgSize = CCInfo.getNextStackOffset(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { + // This is a non-standard ABI so by fiat I say we're allowed to make full + // use of the stack area to be popped, which must be aligned to 16 bytes in + // any case: + StackArgSize = RoundUpToAlignment(StackArgSize, 16); - if (IsTailCall) { - // Each tail call may have to adjust the stack by a different amount, so - // this information must travel along with the operation for eventual - // consumption by emitEpilogue. - Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); + // If we're expected to restore the stack (e.g. fastcc) then we'll be adding + // a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackArgSize); + + // This realignment carries over to the available bytes below. Our own + // callers will guarantee the space is free by giving an aligned value to + // CALLSEQ_START. } + // Even if we're not expected to free up the space, it's useful to know how + // much is there while considering tail calls (because we can reuse it). 
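
Both round-ups above ((StackOffset + 7) & ~7 for the varargs stack index, RoundUpToAlignment(StackArgSize, 16) for the popped area) are the usual add-then-mask idiom for power-of-two alignments. A quick standalone illustration:

#include <cassert>
#include <cstdint>

// Round Value up to a power-of-two Align (equivalent to the two round-ups
// above for their power-of-two alignments).
static uint64_t roundUp(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  assert(roundUp(13, 8) == 16);  // next varargs slot lands on an 8-byte boundary
  assert(roundUp(20, 16) == 32); // the area to pop stays 16-byte aligned
  assert(roundUp(32, 16) == 32); // already-aligned values are left alone
  return 0;
}
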
+ FuncInfo->setBytesInStackArgArea(StackArgSize); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + return Chain; +} +void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, + SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); - // Add a register mask operand representing the call-preserved registers. This - // is used later in codegen to constrain register-allocation. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + SmallVector MemOps; - // If we needed glue, put it in as the last argument. - if (InFlag.getNode()) - Ops.push_back(InFlag); + static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); + unsigned FirstVariadicGPR = + CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); + int GPRIdx = 0; + if (GPRSaveSize != 0) { + GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - if (IsTailCall) { - return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, Ops); + SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 8), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(8, getPointerTy())); + } } + FuncInfo->setVarArgsGPRIndex(GPRIdx); + FuncInfo->setVarArgsGPRSize(GPRSaveSize); - Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, Ops); - InFlag = Chain.getValue(1); + if (Subtarget->hasFPARMv8()) { + static const MCPhysReg FPRArgRegs[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, + AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; + static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); + unsigned FirstVariadicFPR = + CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + + unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); + int FPRIdx = 0; + if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - // Now we can reclaim the stack, just as well do it before working out where - // our return value is. - if (!IsSibCall) { - uint64_t CalleePopBytes - = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? 
NumBytes : 0; + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag, dl); - InFlag = Chain.getValue(1); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } + } + FuncInfo->setVarArgsFPRIndex(FPRIdx); + FuncInfo->setVarArgsFPRSize(FPRSaveSize); } - return LowerCallResult(Chain, InFlag, CallConv, - IsVarArg, Ins, dl, DAG, InVals); + if (!MemOps.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } } -SDValue -AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool IsVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue AArch64TargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv)); + CCInfo.AnalyzeCallResult(Ins, RetCC); + // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; - // Return values that are too big to fit into registers should use an sret - // pointer, so this can be a lot simpler than the main argument code. - assert(VA.isRegLoc() && "Memory locations not expected for call return"); + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } - SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), - InFlag); + SDValue Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; - case CCValAssign::ZExt: - case CCValAssign::SExt: - case CCValAssign::AExt: - // Floating-point arguments only get extended/truncated if they're going - // in memory, so using the integer operation is acceptable here. 
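
On the register save areas built by saveVarArgRegisters above: the sizes are 8 bytes per X register and 16 bytes per Q register that the named arguments did not consume. For instance, an AAPCS variadic function whose fixed arguments use only X0-X2 and no FP registers (with FP available) saves X3-X7 and all of Q0-Q7. In the same spirit, a small sketch:

#include <cassert>

// Save-area sizes given the first unallocated register index (0..8).
static unsigned gprSaveSize(unsigned FirstVariadicGPR) {
  return 8 * (8 - FirstVariadicGPR);   // X0..X7, 8 bytes each
}
static unsigned fprSaveSize(unsigned FirstVariadicFPR) {
  return 16 * (8 - FirstVariadicFPR);  // Q0..Q7, 16 bytes each
}

int main() {
  assert(gprSaveSize(3) == 40);   // X3..X7 still have to be spilled
  assert(fprSaveSize(0) == 128);  // all of Q0..Q7
  assert(gprSaveSize(8) == 0);    // nothing variadic left in registers
  return 0;
}
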
- Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } @@ -1823,17 +1956,12 @@ AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, return Chain; } -bool -AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool IsVarArg, - bool IsCalleeStructRet, - bool IsCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, - SelectionDAG& DAG) const { - +bool AArch64TargetLowering::isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const { // For CallingConv::C this function knows whether the ABI needs // changing. That's not true for other conventions so they will have to opt in // manually. @@ -1849,7 +1977,8 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF->arg_begin(), - e = CallerF->arg_end(); i != e; ++i) + e = CallerF->arg_end(); + i != e; ++i) if (i->hasByValAttr()) return false; @@ -1865,10 +1994,10 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // I want anyone implementing a new calling convention to think long and hard // about this assert. - assert((!IsVarArg || CalleeCC == CallingConv::C) - && "Unexpected variadic calling convention"); + assert((!isVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); - if (IsVarArg && !Outs.empty()) { + if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. @@ -1876,10 +2005,10 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. 
SmallVector ArgLocs; - CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; @@ -1891,12 +2020,12 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), getTargetMachine(), RVLocs1, *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC)); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); SmallVector RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), getTargetMachine(), RVLocs2, *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC)); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) return false; @@ -1920,28 +2049,18 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return true; SmallVector ArgLocs; - CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); - const AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); + const AArch64FunctionInfo *FuncInfo = MF.getInfo(); // If the stack arguments for this call would fit into our own save area then // the call can be made tail. return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); } -bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, - bool TailCallOpt) const { - return CallCC == CallingConv::Fast && TailCallOpt; -} - -bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; -} - SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, @@ -1957,7 +2076,8 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, // Add a chain value for each stack argument corresponding for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), - UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { @@ -1970,636 +2090,607 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, ArgChains.push_back(SDValue(L, 1)); } - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); + // Build a tokenfactor for all the chains. 
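
The final eligibility test above boils down to: the prospective tail call may only be made if its stack-passed arguments fit inside the incoming stack-argument area our own caller already reserved. Schematically, with a made-up helper:

#include <cassert>

// A tail call is stack-compatible if the callee's outgoing stack arguments fit
// in the caller's existing incoming-argument area.
static bool stackArgsFit(unsigned CalleeStackBytes, unsigned BytesInStackArgArea) {
  return CalleeStackBytes <= BytesInStackArgArea;
}

int main() {
  assert(stackArgsFit(0, 0));    // everything in registers: always fine
  assert(stackArgsFit(16, 32));  // reuses part of the existing area
  assert(!stackArgsFit(48, 32)); // would need to grow the frame: not a tail call
  return 0;
}
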
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } -static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { - switch (CC) { - case ISD::SETEQ: return A64CC::EQ; - case ISD::SETGT: return A64CC::GT; - case ISD::SETGE: return A64CC::GE; - case ISD::SETLT: return A64CC::LT; - case ISD::SETLE: return A64CC::LE; - case ISD::SETNE: return A64CC::NE; - case ISD::SETUGT: return A64CC::HI; - case ISD::SETUGE: return A64CC::HS; - case ISD::SETULT: return A64CC::LO; - case ISD::SETULE: return A64CC::LS; - default: llvm_unreachable("Unexpected condition code"); - } +bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, + bool TailCallOpt) const { + return CallCC == CallingConv::Fast && TailCallOpt; } -bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { - // icmp is implemented using adds/subs immediate, which take an unsigned - // 12-bit immediate, optionally shifted left by 12 bits. +bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { + return CallCC == CallingConv::Fast; +} - // Symmetric by using adds/subs - if (Val < 0) - Val = -Val; +/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, +/// and add input and output parameter nodes. +SDValue +AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVector &Outs = CLI.Outs; + SmallVector &OutVals = CLI.OutVals; + SmallVector &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; - return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; -} + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsThisReturn = false; -SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, - ISD::CondCode CC, SDValue &A64cc, - SelectionDAG &DAG, SDLoc &dl) const { - if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { - int64_t C = 0; - EVT VT = RHSC->getValueType(0); - bool knownInvalid = false; - - // I'm not convinced the rest of LLVM handles these edge cases properly, but - // we can at least get it right. - if (isSignedIntSetCC(CC)) { - C = RHSC->getSExtValue(); - } else if (RHSC->getZExtValue() > INT64_MAX) { - // A 64-bit constant not representable by a signed 64-bit integer is far - // too big to fit into a SUBS immediate anyway. - knownInvalid = true; - } else { - C = RHSC->getZExtValue(); - } + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsSibCall = false; - if (!knownInvalid && !isLegalICmpImmediate(C)) { - // Constant does not fit, try adjusting it by one? - switch (CC) { - default: break; - case ISD::SETLT: - case ISD::SETGE: - if (isLegalICmpImmediate(C-1)) { - CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; - RHS = DAG.getConstant(C-1, VT); - } - break; - case ISD::SETULT: - case ISD::SETUGE: - if (isLegalICmpImmediate(C-1)) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - RHS = DAG.getConstant(C-1, VT); - } - break; - case ISD::SETLE: - case ISD::SETGT: - if (isLegalICmpImmediate(C+1)) { - CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; - RHS = DAG.getConstant(C+1, VT); - } - break; - case ISD::SETULE: - case ISD::SETUGT: - if (isLegalICmpImmediate(C+1)) { - CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; - RHS = DAG.getConstant(C+1, VT); - } - break; - } - } + if (IsTailCall) { + // Check if it's really possible to do a tail call. + IsTailCall = isEligibleForTailCallOptimization( + Callee, CallConv, IsVarArg, IsStructRet, + MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + + if (IsTailCall) + ++NumTailCalls; } - A64CC::CondCodes CondCode = IntCCToA64CC(CC); - A64cc = DAG.getConstant(CondCode, MVT::i32); - return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); -} + // Analyze operands of the call, assigning locations to each operand. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); -static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, - A64CC::CondCodes &Alternative) { - A64CC::CondCodes CondCode = A64CC::Invalid; - Alternative = A64CC::Invalid; + if (IsVarArg) { + // Handle fixed and variable vector arguments differently. + // Variable vector arguments always go into memory. + unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, + /*IsVarArg=*/ !Outs[i].IsFixed); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } else { + // At this point, Outs[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeCallOperands to pass in ValVT and + // LocVT. + unsigned NumArgs = Outs.size(); + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Outs[i].VT; + // Get type of the original argument. + EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
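// In short (an illustrative summary of the adjustment just below):
//   i1, i8  -> ValVT = MVT::i8
//   i16     -> ValVT = MVT::i16
//   others  -> ValVT stays as the (possibly promoted) Outs[i].VT
// so an i8 argument that ends up on the stack is later stored with an i8
// store (see the matching truncate before DAG.getStore further down) rather
// than as the promoted i32.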
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } - switch (CC) { - default: llvm_unreachable("Unknown FP condition!"); - case ISD::SETEQ: - case ISD::SETOEQ: CondCode = A64CC::EQ; break; - case ISD::SETGT: - case ISD::SETOGT: CondCode = A64CC::GT; break; - case ISD::SETGE: - case ISD::SETOGE: CondCode = A64CC::GE; break; - case ISD::SETOLT: CondCode = A64CC::MI; break; - case ISD::SETOLE: CondCode = A64CC::LS; break; - case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; - case ISD::SETO: CondCode = A64CC::VC; break; - case ISD::SETUO: CondCode = A64CC::VS; break; - case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; - case ISD::SETUGT: CondCode = A64CC::HI; break; - case ISD::SETUGE: CondCode = A64CC::PL; break; - case ISD::SETLT: - case ISD::SETULT: CondCode = A64CC::LT; break; - case ISD::SETLE: - case ISD::SETULE: CondCode = A64CC::LE; break; - case ISD::SETNE: - case ISD::SETUNE: CondCode = A64CC::NE; break; + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; } - return CondCode; -} -SDValue -AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT PtrVT = getPointerTy(); - const BlockAddress *BA = cast(Op)->getBlockAddress(); + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; - switch(getTargetMachine().getCodeModel()) { - case CodeModel::Small: - // The most efficient code is PC-relative anyway for the small memory model, - // so we don't need to worry about relocation model. - return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, - AArch64II::MO_NO_FLAG), - DAG.getTargetBlockAddress(BA, PtrVT, 0, - AArch64II::MO_LO12), - DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); - case CodeModel::Large: - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); - default: - llvm_unreachable("Only small and large code models supported now"); + if (IsTailCall && !IsSibCall) { + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + + // Since callee will pop argument stack as a tail call, we must keep the + // popped size 16-byte aligned. + NumBytes = RoundUpToAlignment(NumBytes, 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. 
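// For example (illustrative numbers only): a caller that itself received
// 48 bytes of stack arguments (NumReusableBytes = 48) making a tail call
// that needs 24 bytes of outgoing stack arguments gets
//   NumBytes = RoundUpToAlignment(24, 16) = 32,  FPDiff = 48 - 32 = +16
// (the stack can shrink by 16 bytes), while a callee needing 72 bytes gets
//   NumBytes = 80,  FPDiff = 48 - 80 = -32;
// both deltas are multiples of 16, as the assert below requires.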
+ FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } -} + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) + Chain = + DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); -// (BRCOND chain, val, dest) -SDValue -AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - SDValue TheBit = Op.getOperand(1); - SDValue DestBB = Op.getOperand(2); + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); - // AArch64 BooleanContents is the default UndefinedBooleanContent, which means - // that as the consumer we are responsible for ignoring rubbish in higher - // bits. - TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, - DAG.getConstant(1, MVT::i32)); + SmallVector, 8> RegsToPass; + SmallVector MemOpChains; - SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, - DAG.getConstant(0, TheBit.getValueType()), - DAG.getCondCode(ISD::SETNE)); + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, - A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), - DestBB); -} + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + if (Outs[realArgIdx].ArgVT == MVT::i1) { + // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); + } + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + } -// (BR_CC chain, condcode, lhs, rhs, dest) -SDValue -AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue DestBB = Op.getOperand(4); + if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { + assert(VA.getLocVT() == MVT::i64 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i64 && + "unexpected use of 'returned'"); + IsThisReturn = true; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); - if (LHS.getValueType() == MVT::f128) { - // f128 comparisons are lowered to runtime calls by a routine which sets - // LHS, RHS and CC appropriately for the rest of this function to continue. - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + SDValue DstAddr; + MachinePointerInfo DstInfo; - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; + // FIXME: This works on big-endian for composite byvals, which are the + // common case. It should also work for fundamental types too. + uint32_t BEAlign = 0; + unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getLocVT().getSizeInBits(); + OpSize = (OpSize + 7) / 8; + if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (OpSize < 8) + BEAlign = 8 - OpSize; + } + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset + BEAlign; + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + + if (IsTailCall) { + Offset = Offset + FPDiff; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstInfo = MachinePointerInfo::getFixedStack(FI); + + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + + DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVolatile = */ false, + /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); + + MemOpChains.push_back(Cpy); + } else { + // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already + // promoted to a legal register type i32, we should truncate Arg back to + // i1/i8/i16. 
+ if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || + VA.getValVT() == MVT::i16) + Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); + + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + MemOpChains.push_back(Store); + } } } - if (LHS.getValueType().isInteger()) { - SDValue A64cc; + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); - // Integers are handled in a separate function because the combinations of - // immediates and tests can get hairy and we may want to fiddle things. - SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } - return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, - Chain, CmpOp, A64cc, DestBB); + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Subtarget->isTargetMachO()) { + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); + bool InternalLinkage = GV->hasInternalLinkage(); + if (InternalLinkage) + Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + else { + Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, + AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + } + } else if (ExternalSymbolSDNode *S = + dyn_cast(Callee)) { + const char *Sym = S->getSymbol(); + Callee = + DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + } + } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); } - // Note that some LLVM floating-point CondCodes can't be lowered to a single - // conditional branch, hence FPCCToA64CC can set a second test, where either - // passing is sufficient. - A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, - Chain, SetCC, A64cc, DestBB); + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. 
+ if (IsTailCall && !IsSibCall) { + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag, DL); + InFlag = Chain.getValue(1); + } - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, - A64BR_CC, SetCC, A64cc, DestBB); + std::vector Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + if (IsTailCall) { + // Each tail call may have to adjust the stack by a different amount, so + // this information must travel along with the operation for eventual + // consumption by emitEpilogue. + Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); } - return A64BR_CC; -} + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); -SDValue -AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - ArgListTy Args; - ArgListEntry Entry; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { - EVT ArgVT = Op.getOperand(i).getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; - Args.push_back(Entry); - } - SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); - - Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); - - // By default, the input chain to this libcall is the entry node of the - // function. If the libcall is going to be emitted as a tail call then - // isUsedByReturnOnly will change it to the right chain if the return - // node which is being folded has a non-entry input chain. - SDValue InChain = DAG.getEntryNode(); - - // isTailCall may be true since the callee does not reference caller stack - // frame. Check if it's in the right position. - SDValue TCChain = InChain; - bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); - if (isTailCall) - InChain = TCChain; + // Add a register mask operand representing the call-preserved registers. + const uint32_t *Mask; + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + if (IsThisReturn) { + // For 'this' returns, use the X0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(CallConv); + if (!Mask) { + IsThisReturn = false; + Mask = ARI->getCallPreservedMask(CallConv); + } + } else + Mask = ARI->getCallPreservedMask(CallConv); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(SDLoc(Op)).setChain(InChain) - .setCallee(getLibcallCallingConv(Call), RetTy, Callee, &Args, 0) - .setTailCall(isTailCall); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + // If we're doing a tall call, use a TC_RETURN here rather than an + // actual call instruction. + if (IsTailCall) + return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); - std::pair CallInfo = LowerCallTo(CLI); + // Returns a chain and a flag for retval copy to use. + Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + InFlag = Chain.getValue(1); + + uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) + ? 
RoundUpToAlignment(NumBytes, 16) + : 0; + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(CalleePopBytes, true), + InFlag, DL); + if (!Ins.empty()) + InFlag = Chain.getValue(1); - if (!CallInfo.second.getNode()) - // It's a tailcall, return the chain (which is the DAG root). - return DAG.getRoot(); + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); +} - return CallInfo.first; +bool AArch64TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC); } SDValue -AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; +AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC); + + // Copy the result values into the output registers. + SDValue Flag; + SmallVector RetOps(1, Chain); + for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + SDValue Arg = OutVals[realRVLocIdx]; + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + if (Outs[i].ArgVT == MVT::i1) { + // AAPCS requires i1 to be zero-extended to i8 by the producer of the + // value. This is strictly redundant on Darwin (which uses "zeroext + // i1"), but will be optimised out before ISel. + Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + } + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + RetOps[0] = Chain; // Update chain. - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + // Add the flag if we have it. 
+ if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); } -SDValue -AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); +//===----------------------------------------------------------------------===// +// Other Lowering Code +//===----------------------------------------------------------------------===// - RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); +SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + const GlobalValue *GV = cast(Op)->getGlobal(); + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - return LowerF128ToCall(Op, DAG, LC); -} + assert(cast(Op)->getOffset() == 0 && + "unexpected offset in global node"); -static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG, - bool IsSigned) { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - SDValue Vec = Op.getOperand(0); - EVT OpVT = Vec.getValueType(); - unsigned Opc = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; - - if (VT.getVectorNumElements() == 1) { - assert(OpVT == MVT::v1f64 && "Unexpected vector type!"); - if (VT.getSizeInBits() == OpVT.getSizeInBits()) - return Op; - return DAG.UnrollVectorOp(Op.getNode()); - } - - if (VT.getSizeInBits() > OpVT.getSizeInBits()) { - assert(Vec.getValueType() == MVT::v2f32 && VT == MVT::v2i64 && - "Unexpected vector type!"); - Vec = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Vec); - return DAG.getNode(Opc, dl, VT, Vec); - } else if (VT.getSizeInBits() < OpVT.getSizeInBits()) { - EVT CastVT = EVT::getIntegerVT(*DAG.getContext(), - OpVT.getVectorElementType().getSizeInBits()); - CastVT = - EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements()); - Vec = DAG.getNode(Opc, dl, CastVT, Vec); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Vec); - } - return DAG.getNode(Opc, dl, VT, Vec); -} - -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - // We custom lower concat_vectors with 4, 8, or 16 operands that are all the - // same operand and of type v1* using the DUP instruction. - unsigned NumOps = Op->getNumOperands(); - if (NumOps == 2) { - assert(Op.getValueType().getSizeInBits() == 128 && "unexpected concat"); - return Op; + // This also catched the large code model case for Darwin. + if ((OpFlags & AArch64II::MO_GOT) != 0) { + SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. + return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } - if (NumOps != 4 && NumOps != 8 && NumOps != 16) - return SDValue(); + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and + // the only correct model on Darwin. 
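// Concretely (illustrative symbol name), the small-model sequence built here
// is:
//   adrp x0, var               // MO_PAGE: address of the 4KB page of 'var'
//   add  x0, x0, :lo12:var     // MO_PAGEOFF|MO_NC: page offset of 'var'
// while the GOT path above instead materialises the address with an
// adrp + ldr pair via AArch64ISD::LOADgot.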
+ SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + OpFlags | AArch64II::MO_PAGE); + unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - // Must be a single value for VDUP. - SDValue Op0 = Op.getOperand(0); - for (unsigned i = 1; i < NumOps; ++i) { - SDValue OpN = Op.getOperand(i); - if (Op0 != OpN) - return SDValue(); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } +} - // Verify the value type. - EVT EltVT = Op0.getValueType(); - switch (NumOps) { - default: llvm_unreachable("Unexpected number of operands"); - case 4: - if (EltVT != MVT::v1i16 && EltVT != MVT::v1i32) - return SDValue(); - break; - case 8: - if (EltVT != MVT::v1i8 && EltVT != MVT::v1i16) - return SDValue(); - break; - case 16: - if (EltVT != MVT::v1i8) - return SDValue(); - break; - } - - SDLoc DL(Op); - EVT VT = Op.getValueType(); - // VDUP produces better code for constants. - if (Op0->getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Op0->getOperand(0)); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, Op0, - DAG.getConstant(0, MVT::i64)); -} +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address (for Darwin, currently) and +/// return an SDValue containing the final node. +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i64] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first xword, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "x0". +/// +/// Since this descriptor may be in a different unit, in general even the +/// descriptor must be accessed via an indirect load. The "ideal" code sequence +/// is: +/// adrp x0, _var@TLVPPAGE +/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor +/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, +/// ; the function pointer +/// blr x1 ; Uses descriptor address in x0 +/// ; Address of _var is now in x0. +/// +/// If the address of _var's descriptor *is* known to the linker, then it can +/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for +/// a slight efficiency gain. 
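/// A rough picture of that descriptor (field names after the first are
/// illustrative; only the leading function pointer matters to the compiler):
///
///   struct TLVDescriptor {                  // the [3 x i64] above
///     void *(*thunk)(TLVDescriptor *self);  // called with x0 = self,
///                                           // returns &variable
///     uint64_t key, offset;                 // runtime bookkeeping
///   };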
SDValue -AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, - bool IsSigned) const { - if (Op.getValueType().isVector()) - return LowerVectorFP_TO_INT(Op, DAG, IsSigned); - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - if (IsSigned) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128ToCall(Op, DAG, LC); -} - -SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MFI->setReturnAddressIsTaken(true); - - if (verifyReturnAddressArgumentIsConstant(Op, DAG)) - return SDValue(); +AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); - EVT VT = Op.getValueType(); - SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - if (Depth) { - SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, MVT::i64); - return DAG.getLoad(VT, dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); - } + SDLoc DL(Op); + MVT PtrVT = getPointerTy(); + const GlobalValue *GV = cast(Op)->getGlobal(); - // Return X30, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64)); - return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64); -} + SDValue TLVPAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. + SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), + false, true, true, 8); + Chain = FuncTLVGet.getValue(1); -SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) - const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = AArch64::X29; - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); - while (Depth--) - FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), - false, false, false, 0); - return FrameAddr; -} - -// FIXME? Maybe this could be a TableGen attribute on some registers and -// this table could be generated automatically from RegInfo. 
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { - unsigned Reg = StringSwitch(RegName) - .Case("sp", AArch64::XSP) - .Default(0); - if (Reg) - return Reg; - report_fatal_error("Invalid register name global variable"); -} - -SDValue -AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, - SelectionDAG &DAG) const { - assert(getTargetMachine().getCodeModel() == CodeModel::Large); - assert(getTargetMachine().getRelocationModel() == Reloc::Static); - - EVT PtrVT = getPointerTy(); - SDLoc dl(Op); - const GlobalAddressSDNode *GN = cast(Op); - const GlobalValue *GV = GN->getGlobal(); - - SDValue GlobalAddr = DAG.getNode( - AArch64ISD::WrapperLarge, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); - - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), PtrVT)); - - return GlobalAddr; -} - -SDValue -AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, - SelectionDAG &DAG) const { - assert(getTargetMachine().getCodeModel() == CodeModel::Small); - - EVT PtrVT = getPointerTy(); - SDLoc dl(Op); - const GlobalAddressSDNode *GN = cast(Op); - const GlobalValue *GV = GN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - Reloc::Model RelocM = getTargetMachine().getRelocationModel(); - if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) { - // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate - // to zero when they remain undefined. In PIC mode the GOT can take care of - // this, but in absolute mode we use a constant pool load. - SDValue PoolAddr; - PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, - DAG.getTargetConstantPool(GV, PtrVT, 0, 0, - AArch64II::MO_NO_FLAG), - DAG.getTargetConstantPool(GV, PtrVT, 0, 0, - AArch64II::MO_LO12), - DAG.getConstant(8, MVT::i32)); - SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), PtrVT)); - - return GlobalAddr; - } - - if (Alignment == 0) { - const PointerType *GVPtrTy = cast(GV->getType()); - if (GVPtrTy->getElementType()->isSized()) { - Alignment - = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); - } else { - // Be conservative if we can't guess, not that it really matters: - // functions and labels aren't valid for loads, and the methods used to - // actually calculate an address work with any alignment. - Alignment = 1; - } - } - - unsigned char HiFixup, LoFixup; - bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); - - if (UseGOT) { - HiFixup = AArch64II::MO_GOT; - LoFixup = AArch64II::MO_GOT_LO12; - Alignment = 8; - } else { - HiFixup = AArch64II::MO_NO_FLAG; - LoFixup = AArch64II::MO_LO12; - } - - // AArch64's small model demands the following sequence: - // ADRP x0, somewhere - // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). 
- SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - HiFixup), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - LoFixup), - DAG.getConstant(Alignment, MVT::i32)); - - if (UseGOT) { - GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), - GlobalRef); - } - - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, - DAG.getConstant(GN->getOffset(), PtrVT)); - - return GlobalRef; -} + MFI->setAdjustsStack(true); -SDValue -AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, - SelectionDAG &DAG) const { - // TableGen doesn't have easy access to the CodeModel or RelocationModel, so - // we make those distinctions here. - - switch (getTargetMachine().getCodeModel()) { - case CodeModel::Small: - return LowerGlobalAddressELFSmall(Op, DAG); - case CodeModel::Large: - return LowerGlobalAddressELFLarge(Op, DAG); - default: - llvm_unreachable("Only small and large code models supported now"); - } -} - -SDValue -AArch64TargetLowering::LowerConstantPool(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT PtrVT = getPointerTy(); - ConstantPoolSDNode *CN = cast(Op); - const Constant *C = CN->getConstVal(); - - switch(getTargetMachine().getCodeModel()) { - case CodeModel::Small: - // The most efficient code is PC-relative anyway for the small memory model, - // so we don't need to worry about relocation model. - return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - DAG.getTargetConstantPool(C, PtrVT, 0, 0, - AArch64II::MO_NO_FLAG), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, - AArch64II::MO_LO12), - DAG.getConstant(CN->getAlignment(), MVT::i32)); - case CodeModel::Large: - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC)); - default: - llvm_unreachable("Only small and large code models supported now"); - } + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: x0 takes the address of the descriptor, and + // returns the address of the variable in this thread. + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); + Chain = + DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } -SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, - SDValue DescAddr, - SDLoc DL, - SelectionDAG &DAG) const { +/// When accessing thread-local variables under either the general-dynamic or +/// local-dynamic system, we make a "TLS-descriptor" call. The variable will +/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry +/// is a function pointer to carry out the resolution. 
This function takes the +/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All +/// other registers (except LR, NZCV) are preserved. +/// +/// Thus, the ideal call sequence on AArch64 is: +/// +/// adrp x0, :tlsdesc:thread_var +/// ldr x8, [x0, :tlsdesc_lo12:thread_var] +/// add x0, x0, :tlsdesc_lo12:thread_var +/// .tlsdesccall thread_var +/// blr x8 +/// (TPIDR_EL0 offset now in x0). +/// +/// The ".tlsdesccall" directive instructs the assembler to insert a particular +/// relocation to help the linker relax this sequence if it turns out to be too +/// conservative. +/// +/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this +/// is harmless. +SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, + SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); // The function we need to call is simply the first entry in the GOT for this // descriptor, load it in preparation. - SDValue Func, Chain; - Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), - DescAddr); + SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); // The function takes only one argument: the address of the descriptor itself // in X0. - SDValue Glue; + SDValue Glue, Chain; Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); Glue = Chain.getValue(1); - // Finally, there's a special calling-convention which means that the lookup - // must preserve all registers (except X0, obviously). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const AArch64RegisterInfo *A64RI - = static_cast(TRI); - const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); - // We're now ready to populate the argument list, as with a normal call: - std::vector Ops; + SmallVector Ops; Ops.push_back(Chain); Ops.push_back(Func); Ops.push_back(SymAddr); @@ -2608,21 +2699,18 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, Ops.push_back(Glue); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, Ops); + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); Glue = Chain.getValue(1); - // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it - // back to the generic handling code. 
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue -AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(getSubtarget()->isTargetELF() && - "TLS not implemented for non-ELF targets"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small - && "TLS only supported in small memory model"); +AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && "This function expects an ELF target"); + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "ELF TLS only supported in small memory model"); const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2634,39 +2722,22 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); - if (Model == TLSModel::InitialExec) { - TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_GOTTPREL), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_GOTTPREL_LO12), - DAG.getConstant(8, MVT::i32)); - TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), - TPOff); - } else if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_TPREL_G1); - SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_TPREL_G0_NC); - - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, - DAG.getTargetConstant(1, MVT::i32)), 0); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, - TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), 0); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_TLSDESC); - SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_TLSDESC_LO12); - SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - HiDesc, LoDesc, - DAG.getConstant(8, MVT::i32)); - SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); - - TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); + if (Model == TLSModel::LocalExec) { + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + } else if (Model == TLSModel::InitialExec) { + TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate @@ -2674,573 +2745,758 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, // calculation. // These accesses will need deduplicating if there's more than one. 
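// Roughly (illustrative variable name), the second phase built below is
//   movz x1, #:dtprel_g1:ld_var     // bits [31:16] of the DTPREL offset
//   movk x1, #:dtprel_g0_nc:ld_var  // bits [15:0]
//   add  x0, x0, x1                 // combine with the module offset from
//                                   // the _TLS_MODULE_BASE_ descriptor call
// giving ld_var's offset from TPIDR_EL0, which is then added to the thread
// pointer like any other TLS offset.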
- AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() - .getInfo(); + AArch64FunctionInfo *MFI = + DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); - - // Get the location of _TLS_MODULE_BASE_: - SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - AArch64II::MO_TLSDESC); - SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - AArch64II::MO_TLSDESC_LO12); - SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - HiDesc, LoDesc, - DAG.getConstant(8, MVT::i32)); - SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); - - ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); - - // Get the variable's offset from _TLS_MODULE_BASE_ - SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_DTPREL_G1); - SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_DTPREL_G0_NC); - - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, - DAG.getTargetConstant(0, MVT::i32)), 0); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, - TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), 0); + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. + SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS); + + // Now we can calculate the offset from TPIDR_EL0 to this module's + // thread-local area. + TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + + // Now use :dtprel_whatever: operations to calculate this variable's offset + // in its thread-storage area. + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + SDValue DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + + TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); + } else if (Model == TLSModel::GeneralDynamic) { + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. 
+ SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + + // Finally we can make a call to calculate the offset from tpidr_el0. + TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); } else - llvm_unreachable("Unsupported TLS access model"); - + llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } -static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG, - bool IsSigned) { +SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerDarwinGlobalTLSAddress(Op, DAG); + else if (Subtarget->isTargetELF()) + return LowerELFGlobalTLSAddress(Op, DAG); + + llvm_unreachable("Unexpected platform trying to use TLS"); +} +SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); SDLoc dl(Op); - EVT VT = Op.getValueType(); - SDValue Vec = Op.getOperand(0); - unsigned Opc = IsSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; - if (VT.getVectorNumElements() == 1) { - assert(VT == MVT::v1f64 && "Unexpected vector type!"); - if (VT.getSizeInBits() == Vec.getValueSizeInBits()) - return Op; - return DAG.UnrollVectorOp(Op.getNode()); + // Handle f128 first, since lowering it will result in comparing the return + // value of a libcall against zero, which is just what the rest of LowerBR_CC + // is expecting to deal with. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } } - if (VT.getSizeInBits() < Vec.getValueSizeInBits()) { - assert(Vec.getValueType() == MVT::v2i64 && VT == MVT::v2f32 && - "Unexpected vector type!"); - Vec = DAG.getNode(Opc, dl, MVT::v2f64, Vec); - return DAG.getNode(ISD::FP_ROUND, dl, VT, Vec, DAG.getIntPtrConstant(0)); - } else if (VT.getSizeInBits() > Vec.getValueSizeInBits()) { - unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - EVT CastVT = EVT::getIntegerVT(*DAG.getContext(), - VT.getVectorElementType().getSizeInBits()); - CastVT = - EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements()); - Vec = DAG.getNode(CastOpc, dl, CastVT, Vec); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch + // instruction. + unsigned Opc = LHS.getOpcode(); + if (LHS.getResNo() == 1 && isa(RHS) && + cast(RHS)->isOne() && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Unexpected condition code."); + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) + return SDValue(); + + // The actual operation with overflow check. 
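// For instance (illustrative C source), a pattern such as
//   int r; if (__builtin_sadd_overflow(a, b, &r)) goto overflow;
// reaches this path as an SADDO node feeding the branch and can be emitted
// as "adds w8, w0, w1; b.vs .Loverflow" instead of first materialising the
// overflow bit and then testing it.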
+ AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); + + if (CC == ISD::SETNE) + OFCC = getInvertedCondCode(OFCC); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, + CCVal, Overflow); } - return DAG.getNode(Opc, dl, VT, Vec); -} + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + // If the RHS of the comparison is zero, we can potentially fold this + // to a specialized branch. + const ConstantSDNode *RHSC = dyn_cast(RHS); + if (RHSC && RHSC->getZExtValue() == 0) { + if (CC == ISD::SETEQ) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } -SDValue -AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, - bool IsSigned) const { - if (Op.getValueType().isVector()) - return LowerVectorINT_TO_FP(Op, DAG, IsSigned); - if (Op.getValueType() != MVT::f128) { - // Legal for everything except f128. - return Op; + return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETNE) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBNZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } + + return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); } - RTLIB::Libcall LC; - if (IsSigned) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - return LowerF128ToCall(Op, DAG, LC); + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. 
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue BR1 = + DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, + Cmp); + } + + return BR1; } +SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); -SDValue -AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { - JumpTableSDNode *JT = cast(Op); - SDLoc dl(JT); - EVT PtrVT = getPointerTy(); + SDValue In1 = Op.getOperand(0); + SDValue In2 = Op.getOperand(1); + EVT SrcVT = In2.getValueType(); + if (SrcVT != VT) { + if (SrcVT == MVT::f32 && VT == MVT::f64) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT == MVT::f64 && VT == MVT::f32) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); + else + // FIXME: Src type is different, bail out for now. Can VT really be a + // vector type? + return SDValue(); + } - // When compiling PIC, jump tables get put in the code section so a static - // relocation-style is acceptable for both cases. - switch (getTargetMachine().getCodeModel()) { - case CodeModel::Small: - return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_LO12), - DAG.getConstant(1, MVT::i32)); - case CodeModel::Large: - return DAG.getNode( - AArch64ISD::WrapperLarge, dl, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC)); - default: - llvm_unreachable("Only small and large code models supported now"); + EVT VecVT; + EVT EltVT; + SDValue EltMask, VecVal1, VecVal2; + if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { + EltVT = MVT::i32; + VecVT = MVT::v4i32; + EltMask = DAG.getConstant(0x80000000ULL, EltVT); + + if (!VT.isVector()) { + VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + DAG.getUNDEF(VecVT), In1); + VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + DAG.getUNDEF(VecVT), In2); + } else { + VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); + VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + } + } else if (VT == MVT::f64 || VT == MVT::v2f64) { + EltVT = MVT::i64; + VecVT = MVT::v2i64; + + // We want to materialize a mask with the the high bit set, but the AdvSIMD + // immediate moves cannot materialize that in a single instruction for + // 64-bit elements. Instead, materialize zero and then negate it. 
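// The BIT node built further down then copies exactly the sign bit. For
// example (illustrative f32 bit patterns), copysign(1.5f, -2.0f) computes
//   (0x3FC00000 & ~0x80000000) | (0xC0000000 & 0x80000000) = 0xBFC00000
// i.e. -1.5f: In1's magnitude with In2's sign.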
+ EltMask = DAG.getConstant(0, EltVT); + + if (!VT.isVector()) { + VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, + DAG.getUNDEF(VecVT), In1); + VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, + DAG.getUNDEF(VecVT), In2); + } else { + VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); + VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + } + } else { + llvm_unreachable("Invalid type for copysign!"); } -} -// (SELECT testbit, iftrue, iffalse) -SDValue -AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue TheBit = Op.getOperand(0); - SDValue IfTrue = Op.getOperand(1); - SDValue IfFalse = Op.getOperand(2); + std::vector BuildVectorOps; + for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) + BuildVectorOps.push_back(EltMask); + + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); + + // If we couldn't materialize the mask above, then the mask vector will be + // the zero vector, and we need to negate it here. + if (VT == MVT::f64 || VT == MVT::v2f64) { + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + } - // AArch64 BooleanContents is the default UndefinedBooleanContent, which means - // that as the consumer we are responsible for ignoring rubbish in higher - // bits. - TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, - DAG.getConstant(1, MVT::i32)); - SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, - DAG.getConstant(0, TheBit.getValueType()), - DAG.getCondCode(ISD::SETNE)); + SDValue Sel = + DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), - A64CMP, IfTrue, IfFalse, - DAG.getConstant(A64CC::NE, MVT::i32)); + if (VT == MVT::f32) + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); + else if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); + else + return DAG.getNode(ISD::BITCAST, DL, VT, Sel); } -static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { +SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + return SDValue(); + + // While there is no integer popcount instruction, it can + // be more efficiently lowered to the following sequence that uses + // AdvSIMD registers/instructions as long as the copies to/from + // the AdvSIMD registers are cheap. 
+ // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd + // CNT V0.8B, V0.8B // 8xbyte pop-counts + // ADDV B0, V0.8B // sum 8xbyte pop-counts + // UMOV X0, V0.B[0] // copy byte result back to integer reg + SDValue Val = Op.getOperand(0); SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); + + SDValue VecVal; + if (VT == MVT::i32) { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); + VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, + VecVal); + } else { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); + } + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); + + if (VT == MVT::i64) + UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); + return UaddLV; +} + +SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + if (Op.getValueType().isVector()) + return LowerVSETCC(Op, DAG); + SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDLoc dl(Op); + + // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); - bool Invert = false; - SDValue Op0, Op1; - unsigned Opcode; + SDValue TVal = DAG.getConstant(1, VT); + SDValue FVal = DAG.getConstant(0, VT); - if (LHS.getValueType().isInteger()) { + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets picked up by the next if statement. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - // Attempt to use Vector Integer Compare Mask Test instruction. - // TST = icmp ne (and (op0, op1), zero). - if (CC == ISD::SETNE) { - if (((LHS.getOpcode() == ISD::AND) && - ISD::isBuildVectorAllZeros(RHS.getNode())) || - ((RHS.getOpcode() == ISD::AND) && - ISD::isBuildVectorAllZeros(LHS.getNode()))) { - - SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; - SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); - SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); - return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); - } + // If softenSetCCOperands returned a scalar, use it. + if (!RHS.getNode()) { + assert(LHS.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return LHS; } + } - // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). - // Note: Compare against Zero does not support unsigned predicates. - if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || - ISD::isBuildVectorAllZeros(LHS.getNode())) && - !isUnsignedIntSetCC(CC)) { - - // If LHS is the zero value, swap operands and CondCode. - if (ISD::isBuildVectorAllZeros(LHS.getNode())) { - CC = getSetCCSwappedOperands(CC); - Op0 = RHS; - } else - Op0 = LHS; - - // Ensure valid CondCode for Compare Mask against Zero instruction: - // EQ, GE, GT, LE, LT. - if (ISD::SETNE == CC) { - Invert = true; - CC = ISD::SETEQ; - } + if (LHS.getValueType().isInteger()) { + SDValue CCVal; + SDValue Cmp = + getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); - // Using constant type to differentiate integer and FP compares with zero. - Op1 = DAG.getConstant(0, MVT::i32); - Opcode = AArch64ISD::NEON_CMPZ; + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. 
This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + } - } else { - // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). - // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. - bool Swap = false; - switch (CC) { - default: - llvm_unreachable("Illegal integer comparison."); - case ISD::SETEQ: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - break; - case ISD::SETNE: - Invert = true; - CC = ISD::SETEQ; - break; - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETLT: - case ISD::SETLE: - Swap = true; - CC = getSetCCSwappedOperands(CC); - } + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - if (Swap) - std::swap(LHS, RHS); + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - Opcode = AArch64ISD::NEON_CMP; - Op0 = LHS; - Op1 = RHS; - } + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + if (CC2 == AArch64CC::AL) { + changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - // Generate Compare Mask instr or Compare Mask against Zero instr. - SDValue NeonCmp = - DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + } else { + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't + // totally clean. Some of them require two CSELs to implement. As is in + // this case, we emit the first CSEL and then emit a second using the output + // of the first as the RHS. We're effectively OR'ing the two CC's together. - if (Invert) - NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + // FIXME: It would be nice if we could match the two CSELs to two CSINCs. + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - return NeonCmp; + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } +} - // Now handle Floating Point cases. - // Attempt to use Vector Floating Point Compare Mask against Zero instruction. - if (ISD::isBuildVectorAllZeros(RHS.getNode()) || - ISD::isBuildVectorAllZeros(LHS.getNode())) { +/// A SELECT_CC operation is really some kind of max or min if both values being +/// compared are, in some sense, equal to the results in either case. However, +/// it is permissible to compare f32 values and produce directly extended f64 +/// values. +/// +/// Extending the comparison operands would also be allowed, but is less likely +/// to happen in practice since their use is right here. Note that truncate +/// operations would *not* be semantically equivalent. 
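As an illustration of the property described above, the following IR (sketched here, not taken from the patch or its tests) is still recognisable as an fmax even though the comparison is done on f32 and the selected results are f64, because each result is just the extension of the corresponding compare operand:

    ; the compare is f32, the select returns the f64 extensions of its operands
    %c  = fcmp ogt float %a, %b
    %ae = fpext float %a to double
    %be = fpext float %b to double
    %r  = select i1 %c, double %ae, double %be
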
+static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { + if (Cmp == Result) + return true; + + ConstantFPSDNode *CCmp = dyn_cast(Cmp); + ConstantFPSDNode *CResult = dyn_cast(Result); + if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && + Result.getValueType() == MVT::f64) { + bool Lossy; + APFloat CmpVal = CCmp->getValueAPF(); + CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); + return CResult->getValueAPF().bitwiseIsEqual(CmpVal); + } - // If LHS is the zero value, swap operands and CondCode. - if (ISD::isBuildVectorAllZeros(LHS.getNode())) { - CC = getSetCCSwappedOperands(CC); - Op0 = RHS; - } else - Op0 = LHS; + return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; +} - // Using constant type to differentiate integer and FP compares with zero. - Op1 = DAG.getConstantFP(0, MVT::f32); - Opcode = AArch64ISD::NEON_CMPZ; - } else { - // Attempt to use Vector Floating Point Compare Mask instruction. - Op0 = LHS; - Op1 = RHS; - Opcode = AArch64ISD::NEON_CMP; +SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CC = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + unsigned Opc = CC.getOpcode(); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select + // instruction. + if (CC.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) + return SDValue(); + + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, + CCVal, Overflow); } - SDValue NeonCmpAlt; - // Some register compares have to be implemented with swapped CC and operands, - // e.g.: OLT implemented as OGT with swapped operands. - bool SwapIfRegArgs = false; + if (CC.getOpcode() == ISD::SETCC) + return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, + cast(CC.getOperand(2))->get()); + else + return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, + FVal, ISD::SETNE); +} - // Ensure valid CondCode for FP Compare Mask against Zero instruction: - // EQ, GE, GT, LE, LT. - // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. - switch (CC) { - default: - llvm_unreachable("Illegal FP comparison"); - case ISD::SETUNE: - case ISD::SETNE: - Invert = true; // Fallthrough - case ISD::SETOEQ: - case ISD::SETEQ: - CC = ISD::SETEQ; - break; - case ISD::SETOLT: - case ISD::SETLT: - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - case ISD::SETOGT: - case ISD::SETGT: - CC = ISD::SETGT; - break; - case ISD::SETOLE: - case ISD::SETLE: - CC = ISD::SETLE; - SwapIfRegArgs = true; - break; - case ISD::SETOGE: - case ISD::SETGE: - CC = ISD::SETGE; - break; - case ISD::SETUGE: - Invert = true; - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - case ISD::SETULE: - Invert = true; - CC = ISD::SETGT; - break; - case ISD::SETUGT: - Invert = true; - CC = ISD::SETLE; - SwapIfRegArgs = true; - break; - case ISD::SETULT: - Invert = true; - CC = ISD::SETGE; - break; - case ISD::SETUEQ: - Invert = true; // Fallthrough - case ISD::SETONE: - // Expand this to (OGT |OLT). 
- NeonCmpAlt = - DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - case ISD::SETUO: - Invert = true; // Fallthrough - case ISD::SETO: - // Expand this to (OGE | OLT). - NeonCmpAlt = - DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE)); - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - } - - if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) { - CC = getSetCCSwappedOperands(CC); - std::swap(Op0, Op1); - } - - // Generate FP Compare Mask instr or FP Compare Mask against Zero instr - SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); - - if (NeonCmpAlt.getNode()) - NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt); - - if (Invert) - NeonCmp = DAG.getNOT(DL, NeonCmp, VT); - - return NeonCmp; -} - -// (SETCC lhs, rhs, condcode) -SDValue -AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); +SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - EVT VT = Op.getValueType(); - - if (VT.isVector()) - return LowerVectorSETCC(Op, DAG); + SDValue TVal = Op.getOperand(2); + SDValue FVal = Op.getOperand(3); + SDLoc dl(Op); + // Handle f128 first, because it will result in a comparison of some RTLIB + // call result against zero. if (LHS.getValueType() == MVT::f128) { - // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS - // for the rest of the function (some i32 or i64 values). softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - // If softenSetCCOperands returned a scalar, use it. + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. if (!RHS.getNode()) { - assert(LHS.getValueType() == Op.getValueType() && - "Unexpected setcc expansion!"); - return LHS; + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; } } + // Handle integers first. if (LHS.getValueType().isInteger()) { - SDValue A64cc; + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + unsigned Opcode = AArch64ISD::CSEL; + + // If both the TVal and the FVal are constants, see if we can swap them in + // order to for a CSINV or CSINC out of them. + ConstantSDNode *CFVal = dyn_cast(FVal); + ConstantSDNode *CTVal = dyn_cast(TVal); + + if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (TVal.getOpcode() == ISD::XOR) { + // If TVal is a NOT we want to swap TVal and FVal so that we can match + // with a CSINV rather than a CSEL. + ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); + + if (CVal && CVal->isAllOnesValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (TVal.getOpcode() == ISD::SUB) { + // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so + // that we can match with a CSNEG rather than a CSEL. 
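For reference, CSNEG Wd, Wn, Wm, cond returns Wn when the condition holds and -Wm otherwise, so the negated value has to end up in the false-operand position; swapping the operands and inverting the condition is what puts it there. A scalar sketch of the rewrite (names are illustrative, not from the patch):

    // select(c, 0 - X, Y)  ==  select(!c, Y, 0 - X)  ==  CSNEG(Y, X, !c)
    int selectAsCSNEG(bool C, int X, int Y) {
      bool Inv = !C;        // corresponds to ISD::getSetCCInverse(CC, true)
      return Inv ? Y : -X;  // matches CSNEG: cond ? Wn : -Wm
    }
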
+ ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); + + if (CVal && CVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (CTVal && CFVal) { + const int64_t TrueVal = CTVal->getSExtValue(); + const int64_t FalseVal = CFVal->getSExtValue(); + bool Swap = false; + + // If both TVal and FVal are constants, see if FVal is the + // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC + // instead of a CSEL in that case. + if (TrueVal == ~FalseVal) { + Opcode = AArch64ISD::CSINV; + } else if (TrueVal == -FalseVal) { + Opcode = AArch64ISD::CSNEG; + } else if (TVal.getValueType() == MVT::i32) { + // If our operands are only 32-bit wide, make sure we use 32-bit + // arithmetic for the check whether we can use CSINC. This ensures that + // the addition in the check will wrap around properly in case there is + // an overflow (which would not be the case if we do the check with + // 64-bit arithmetic). + const uint32_t TrueVal32 = CTVal->getZExtValue(); + const uint32_t FalseVal32 = CFVal->getZExtValue(); + + if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal32 > FalseVal32) { + Swap = true; + } + } + // 64-bit check whether we can use CSINC. + } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { + Opcode = AArch64ISD::CSINC; - // Integers are handled in a separate function because the combinations of - // immediates and tests can get hairy and we may want to fiddle things. - SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + if (TrueVal > FalseVal) { + Swap = true; + } + } - return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, - CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT), - A64cc); - } + // Swap TVal and FVal if necessary. + if (Swap) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } - // Note that some LLVM floating-point CondCodes can't be lowered to a single - // conditional branch, hence FPCCToA64CC can set a second test, where either - // passing is sufficient. - A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, - CmpOp, DAG.getConstant(1, VT), - DAG.getConstant(0, VT), A64cc); + if (Opcode != AArch64ISD::CSEL) { + // Drop FVal since we can get its value by simply inverting/negating + // TVal. + FVal = TVal; + } + } - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - DAG.getConstant(1, VT), A64SELECT_CC, A64cc); - } + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - return A64SELECT_CC; -} + EVT VT = Op.getValueType(); + return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); + } -static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) { - SDLoc dl(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue IfTrue = Op.getOperand(2); - SDValue IfFalse = Op.getOperand(3); - EVT IfTrueVT = IfTrue.getValueType(); - EVT CondVT = IfTrueVT.changeVectorElementTypeToInteger(); - ISD::CondCode CC = cast(Op.getOperand(4))->get(); + // Now we know we're dealing with FP values. 
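Before moving on to the FP path, one note on the 32-bit CSINC width check in the integer path above: the case it protects is a true value of INT32_MIN paired with a false value of INT32_MAX, which is a valid CSINC pair only if the +1 is allowed to wrap in 32 bits. A standalone sketch of that arithmetic (not part of the patch):

    #include <cstdint>

    // TrueVal = INT32_MIN, FalseVal = INT32_MAX.
    static_assert(uint32_t(INT32_MAX) + 1u == uint32_t(INT32_MIN),
                  "in 32-bit arithmetic the increment wraps, so CSINC applies");
    static_assert(int64_t(INT32_MAX) + 1 != int64_t(INT32_MIN),
                  "a 64-bit check on the sign-extended values would miss it");
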
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == RHS.getValueType()); + EVT VT = Op.getValueType(); - // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will - // use NEON compare. - if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) { - EVT EltVT = LHS.getValueType(); - unsigned EltNum = 128 / EltVT.getSizeInBits(); - EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum); - unsigned SubConstant = - (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 :AArch64::sub_64; - EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64; - EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum); - - LHS - = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, - VT, DAG.getTargetConstant(0, MVT::i32), LHS, - DAG.getTargetConstant(SubConstant, MVT::i32)), 0); - RHS - = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, - VT, DAG.getTargetConstant(0, MVT::i32), RHS, - DAG.getTargetConstant(SubConstant, MVT::i32)), 0); - - SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC); - SDValue ResCC = LowerVectorSETCC(VSetCC, DAG); - if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) { - EVT DUPVT = - EVT::getVectorVT(*DAG.getContext(), CEltT, - IfTrueVT.getSizeInBits() / CEltT.getSizeInBits()); - ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC, - DAG.getConstant(0, MVT::i64, false)); - - ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC); - } else { - // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function - // can't handle them and will hit this assert. - assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() && - "Vector of IfTrue & IfFalse is too small."); - - unsigned ExEltNum = - EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits(); - EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum); - ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC, - DAG.getConstant(0, MVT::i64, false)); - ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC); + // Try to match this select into a max/min operation, which have dedicated + // opcode in the instruction set. + // FIXME: This is not correct in the presence of NaNs, so we only enable this + // in no-NaNs mode. + if (getTargetMachine().Options.NoNaNsFPMath) { + SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { + CC = ISD::getSetCCSwappedOperands(CC); + std::swap(MinMaxLHS, MinMaxRHS); } - SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(), - ResCC, IfTrue, IfFalse); - return VSelect; - } - - // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are - // vectors. 
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - EVT SEVT = MVT::i32; - if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32) - SEVT = MVT::i64; - SDValue AllOne = DAG.getConstant(-1, SEVT); - SDValue AllZero = DAG.getConstant(0, SEVT); - SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC, - AllOne, AllZero, A64cc); - - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), - SetCC, AllOne, A64SELECT_CC, A64cc); - } - SDValue VDup; - if (IfTrue.getValueType().getVectorNumElements() == 1) - VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, CondVT, A64SELECT_CC); - else - VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, CondVT, A64SELECT_CC); - SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(), - VDup, IfTrue, IfFalse); - return VSelect; -} -// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) -SDValue -AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue IfTrue = Op.getOperand(2); - SDValue IfFalse = Op.getOperand(3); - ISD::CondCode CC = cast(Op.getOperand(4))->get(); + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { + switch (CC) { + default: + break; + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + case ISD::SETOGT: + case ISD::SETOGE: + return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); + break; + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETOLT: + case ISD::SETOLE: + return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); + break; + } + } + } - if (IfTrue.getValueType().isVector()) - return LowerVectorSELECT_CC(Op, DAG); + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - if (LHS.getValueType() == MVT::f128) { - // f128 comparisons are lowered to libcalls, but slot in nicely here - // afterwards. - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two CSELs to implement. + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } + // If we need a second CSEL, emit it, using the output of the first as the + // RHS. We're effectively OR'ing the two CC's together. + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } - if (LHS.getValueType().isInteger()) { - SDValue A64cc; + // Otherwise, return the output of the first CSEL. 
+ return CS1; +} - // Integers are handled in a separate function because the combinations of - // immediates and tests can get hairy and we may want to fiddle things. - SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); +SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // Jump table entries as PC relative offsets. No additional tweaking + // is necessary here. Just get the address of the jump table. + JumpTableSDNode *JT = cast(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); - return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp, - IfTrue, IfFalse, A64cc); - } + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_G0 | MO_NC)); + } + + SDValue Hi = + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); +} - // Note that some LLVM floating-point CondCodes can't be lowered to a single - // conditional branch, hence FPCCToA64CC can set a second test, where either - // passing is sufficient. - A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, - Op.getValueType(), - SetCC, IfTrue, IfFalse, A64cc); +SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), - SetCC, IfTrue, A64SELECT_CC, A64cc); + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + // Use the GOT for the large code model on iOS. + if (Subtarget->isTargetMachO()) { + SDValue GotAddr = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_GOT); + return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G3), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G2 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G1 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on + // ELF, the only valid one on Darwin. 
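The MO_PAGE / MO_PAGEOFF|MO_NC pairs built in this and the preceding lowering correspond to the usual two-instruction small-code-model address materialization, roughly the following (register and label names are illustrative):

    adrp x8, .LJTI0_0               // AArch64ISD::ADRP:   4KB page of the symbol
    add  x8, x8, :lo12:.LJTI0_0     // AArch64ISD::ADDlow: low 12 bits within the page
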
+ SDValue Hi = + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } +} - return A64SELECT_CC; +SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast(Op)->getBlockAddress(); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } } -SDValue -AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { - const Value *DestSV = cast(Op.getOperand(3))->getValue(); - const Value *SrcSV = cast(Op.getOperand(4))->getValue(); +SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo(); - // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes - // rather than just 8. - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), - Op.getOperand(1), Op.getOperand(2), - DAG.getConstant(32, MVT::i32), 8, false, false, - MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); + SDLoc DL(Op); + SDValue FR = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); } -SDValue -AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, + SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. 
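As a reference for the stores that follow (offsets 0, 8, 16, 24 and 28, 32 bytes in total), the AAPCS64 va_list being populated has this C-level shape (a sketch using the field names from the comments below):

    struct AAPCS64VaList {   // 32 bytes on AArch64 / LP64
      void *__stack;         // offset 0
      void *__gr_top;        // offset 8
      void *__vr_top;        // offset 16
      int __gr_offs;         // offset 24
      int __vr_offs;         // offset 28
    };
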
MachineFunction &MF = DAG.getMachineFunction(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); @@ -3249,1486 +3505,1923 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SmallVector MemOps; // void *__stack at offset 0 - SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(), - getPointerTy()); + SDValue Stack = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 0)); + MachinePointerInfo(SV), false, false, 8)); // void *__gr_top at offset 8 - int GPRSize = FuncInfo->getVariadicGPRSize(); + int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(8, getPointerTy())); - GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy()); + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, DAG.getConstant(GPRSize, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), - false, false, 0)); + MachinePointerInfo(SV, 8), false, false, 8)); } // void *__vr_top at offset 16 - int FPRSize = FuncInfo->getVariadicFPRSize(); + int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(16, getPointerTy())); - VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy()); + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, DAG.getConstant(FPRSize, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), - false, false, 0)); + MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(24, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), - false, false, 0)); + GROffsAddr, MachinePointerInfo(SV, 24), false, + false, 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(28, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), - false, false, 0)); + VROffsAddr, MachinePointerInfo(SV, 28), false, + false, 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } -SDValue -AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: llvm_unreachable("Don't know how to custom lower this!"); - case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128); - case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128); - case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128); - case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true); - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false); - case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true); - case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); - 
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); - case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - - case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); - case ISD::SRL_PARTS: - case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - - case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::BR_CC: return LowerBR_CC(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG); - case ISD::ConstantPool: return LowerConstantPool(Op, DAG); - case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); - case ISD::JumpTable: return LowerJumpTable(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::VACOPY: return LowerVACOPY(Op, DAG); - case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); - } +SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) + : LowerAAPCS_VASTART(Op, DAG); +} - return SDValue(); +SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, + SelectionDAG &DAG) const { + // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single + // pointer. + unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + const Value *DestSV = cast(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast(Op.getOperand(4))->getValue(); + + return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), + Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), + 8, false, false, MachinePointerInfo(DestSV), + MachinePointerInfo(SrcSV)); } -/// Check if the specified splat value corresponds to a valid vector constant -/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If -/// so, return the encoded 8-bit immediate and the OpCmode instruction fields -/// values. -static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, - unsigned SplatBitSize, SelectionDAG &DAG, - bool is128Bits, NeonModImmType type, EVT &VT, - unsigned &Imm, unsigned &OpCmode) { - switch (SplatBitSize) { - default: - llvm_unreachable("unexpected size for isNeonModifiedImm"); - case 8: { - if (type != Neon_Mov_Imm) - return false; - assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); - // Neon movi per byte: Op=0, Cmode=1110. - OpCmode = 0xe; - Imm = SplatBits; - VT = is128Bits ? MVT::v16i8 : MVT::v8i8; - break; - } - case 16: { - // Neon move inst per halfword - VT = is128Bits ? 
MVT::v8i16 : MVT::v4i16; - if ((SplatBits & ~0xff) == 0) { - // Value = 0x00nn is 0x00nn LSL 0 - // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 - // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 - // Op=x, Cmode=100y - Imm = SplatBits; - OpCmode = 0x8; - break; - } - if ((SplatBits & ~0xff00) == 0) { - // Value = 0xnn00 is 0x00nn LSL 8 - // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 - // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 - // Op=x, Cmode=101x - Imm = SplatBits >> 8; - OpCmode = 0xa; - break; - } - // can't handle any other - return false; - } +SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && + "automatic va_arg instruction only works on Darwin"); - case 32: { - // First the LSL variants (MSL is unusable by some interested instructions). + const Value *V = cast(Op.getOperand(2))->getValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + unsigned Align = Op.getConstantOperandVal(3); + + SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, + MachinePointerInfo(V), false, false, false, 0); + Chain = VAList.getValue(1); + + if (Align > 8) { + assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(Align - 1, getPointerTy())); + VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, + DAG.getConstant(-(int64_t)Align, getPointerTy())); + } + + Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + + // Scalar integer and FP values smaller than 64 bits are implicitly extended + // up to 64 bits. At the very least, we have to increase the striding of the + // vaargs list to match this, and for FP values we need to introduce + // FP_ROUND nodes as well. + if (VT.isInteger() && !VT.isVector()) + ArgSize = 8; + bool NeedFPTrunc = false; + if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { + ArgSize = 8; + NeedFPTrunc = true; + } + + // Increment the pointer, VAList, to the next vaarg + SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(ArgSize, getPointerTy())); + // Store the incremented VAList to the legalized pointer + SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), + false, false, 0); + + // Load the actual argument out of the pointer VAList + if (NeedFPTrunc) { + // Load the value as an f64. + SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, + MachinePointerInfo(), false, false, false, 0); + // Round the value down to an f32. + SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), + DAG.getIntPtrConstant(1)); + SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; + // Merge the rounded value with the chain output of the load. + return DAG.getMergeValues(Ops, DL); + } + + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, + false, false, 0); +} - // Neon move instr per word, shift zeros - VT = is128Bits ? 
MVT::v4i32 : MVT::v2i32; - if ((SplatBits & ~0xff) == 0) { - // Value = 0x000000nn is 0x000000nn LSL 0 - // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 - // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 - // Op=x, Cmode=000x - Imm = SplatBits; - OpCmode = 0; - break; - } - if ((SplatBits & ~0xff00) == 0) { - // Value = 0x0000nn00 is 0x000000nn LSL 8 - // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 - // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 - // Op=x, Cmode=001x - Imm = SplatBits >> 8; - OpCmode = 0x2; - break; - } - if ((SplatBits & ~0xff0000) == 0) { - // Value = 0x00nn0000 is 0x000000nn LSL 16 - // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 - // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 - // Op=x, Cmode=010x - Imm = SplatBits >> 16; - OpCmode = 0x4; - break; - } - if ((SplatBits & ~0xff000000) == 0) { - // Value = 0xnn000000 is 0x000000nn LSL 24 - // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 - // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 - // Op=x, Cmode=011x - Imm = SplatBits >> 24; - OpCmode = 0x6; - break; - } +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); - // Now the MSL immediates. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, false, 0); + return FrameAddr; +} - // Neon move instr per word, shift ones - if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) { - // Value = 0x0000nnff is 0x000000nn MSL 8 - // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 - // Op=x, Cmode=1100 - Imm = SplatBits >> 8; - OpCmode = 0xc; - break; - } - if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { - // Value = 0x00nnffff is 0x000000nn MSL 16 - // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 - // Op=x, Cmode=1101 - Imm = SplatBits >> 16; - OpCmode = 0xd; - break; - } - // can't handle any other - return false; - } +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, + EVT VT) const { + unsigned Reg = StringSwitch(RegName) + .Case("sp", AArch64::SP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} - case 64: { - if (type != Neon_Mov_Imm) - return false; - // Neon move instr bytemask, where each byte is either 0x00 or 0xff. - // movi Op=1, Cmode=1110. - OpCmode = 0x1e; - uint64_t BitMask = 0xff; - uint64_t Val = 0; - unsigned ImmMask = 1; - Imm = 0; - for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) { - Val |= BitMask; - Imm |= ImmMask; - } else if ((SplatBits & BitMask) != 0) { - return false; - } - BitMask <<= 8; - ImmMask <<= 1; - } - SplatBits = Val; - VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; - break; - } +SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(8, getPointerTy()); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); } - return true; + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); + return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } -static SDValue PerformANDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { +/// LowerShiftRightParts - Lower SRA_PARTS, which returns two +/// i64 values and take a 2 x i64 value to shift plus a shift amount. +SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - // We're looking for an SRA/SHL pair which form an SBFX. + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, MVT::i64), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, MVT::i64)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + ISD::SETGE, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + + SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + + // AArch64 shifts larger than the register width are wrapped rather than + // clamped, so we can't just emit "hi >> x". + SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue TrueValHi = Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, MVT::i64)) + : DAG.getConstant(0, VT); + SDValue Hi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); - if (!isa(N->getOperand(1))) - return SDValue(); + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} - uint64_t TruncMask = N->getConstantOperandVal(1); - if (!isMask_64(TruncMask)) - return SDValue(); +/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two +/// i64 values and take a 2 x i64 value to shift plus a shift amount. 
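Before the left-shift counterpart below, a reference model of what the SRL_PARTS expansion above computes may help; this is a standalone sketch of the semantics, not the DAG code, and it deliberately excludes a shift amount of zero so the 64 - Amt sub-shift stays well defined in C++:

    #include <cstdint>
    #include <utility>

    // Shift the 128-bit value {Hi,Lo} logically right by Amt (1..127);
    // returns {Lo, Hi} of the result.
    std::pair<uint64_t, uint64_t> srl128(uint64_t Lo, uint64_t Hi,
                                         unsigned Amt) {
      if (Amt >= 64)                        // ExtraShAmt = Amt - 64 >= 0 (GE arm)
        return {Hi >> (Amt - 64), 0};
      // Otherwise the low half picks up the bits shifted out of the high half
      // (RevShAmt = 64 - Amt), exactly the OR of Tmp1 and Tmp2 above.
      return {(Lo >> Amt) | (Hi << (64 - Amt)), Hi >> Amt};
    }
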
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; - uint64_t Width = CountPopulation_64(TruncMask); - SDValue Shift = N->getOperand(0); + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, MVT::i64), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, MVT::i64)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - if (Shift.getOpcode() != ISD::SRL) - return SDValue(); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - if (!isa(Shift->getOperand(1))) - return SDValue(); - uint64_t LSB = Shift->getConstantOperandVal(1); + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + ISD::SETGE, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + SDValue Hi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); - if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) - return SDValue(); - - return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0), - DAG.getConstant(LSB, MVT::i64), - DAG.getConstant(LSB + Width - 1, MVT::i64)); -} - -/// For a true bitfield insert, the bits getting into that contiguous mask -/// should come from the low part of an existing value: they must be formed from -/// a compatible SHL operation (unless they're already low). This function -/// checks that condition and returns the least-significant bit that's -/// intended. If the operation not a field preparation, -1 is returned. -static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT, - SDValue &MaskedVal, uint64_t Mask) { - if (!isShiftedMask_64(Mask)) - return -1; - - // Now we need to alter MaskedVal so that it is an appropriate input for a BFI - // instruction. BFI will do a left-shift by LSB before applying the mask we've - // spotted, so in general we should pre-emptively "undo" that by making sure - // the incoming bits have had a right-shift applied to them. - // - // This right shift, however, will combine with existing left/right shifts. In - // the simplest case of a completely straight bitfield operation, it will be - // expected to completely cancel out with an existing SHL. More complicated - // cases (e.g. bitfield to bitfield copy) may still need a real shift before - // the BFI. - - uint64_t LSB = countTrailingZeros(Mask); - int64_t ShiftRightRequired = LSB; - if (MaskedVal.getOpcode() == ISD::SHL && - isa(MaskedVal.getOperand(1))) { - ShiftRightRequired -= MaskedVal.getConstantOperandVal(1); - MaskedVal = MaskedVal.getOperand(0); - } else if (MaskedVal.getOpcode() == ISD::SRL && - isa(MaskedVal.getOperand(1))) { - ShiftRightRequired += MaskedVal.getConstantOperandVal(1); - MaskedVal = MaskedVal.getOperand(0); - } - - if (ShiftRightRequired > 0) - MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal, - DAG.getConstant(ShiftRightRequired, MVT::i64)); - else if (ShiftRightRequired < 0) { - // We could actually end up with a residual left shift, for example with - // "struc.bitfield = val << 1". 
- MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal, - DAG.getConstant(-ShiftRightRequired, MVT::i64)); - } - - return LSB; -} - -/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by -/// a mask and an extension. Returns true if a BFI was found and provides -/// information on its surroundings. -static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, - bool &Extended) { - Extended = false; - if (N.getOpcode() == ISD::ZERO_EXTEND) { - Extended = true; - N = N.getOperand(0); - } - - if (N.getOpcode() == ISD::AND && isa(N.getOperand(1))) { - Mask = N->getConstantOperandVal(1); - N = N.getOperand(0); - } else { - // Mask is the whole width. - Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); - } + // AArch64 shifts of larger than register sizes are wrapped rather than + // clamped, so we can't just emit "lo << a" if a is too big. + SDValue TrueValLo = DAG.getConstant(0, VT); + SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - if (N.getOpcode() == AArch64ISD::BFI) { - BFI = N; - return true; - } + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} +bool AArch64TargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { + // The AArch64 target doesn't support folding offsets into global addresses. return false; } -/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which -/// is roughly equivalent to (and (BFI ...), mask). This form is used because it -/// can often be further combined with a larger mask. Ultimately, we want mask -/// to be 2^32-1 or 2^64-1 so the AND can be skipped. -static SDValue tryCombineToBFI(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - assert(N->getOpcode() == ISD::OR && "Unexpected root"); - - // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or - // abandon the effort. - SDValue LHS = N->getOperand(0); - if (LHS.getOpcode() != ISD::AND) - return SDValue(); - - uint64_t LHSMask; - if (isa(LHS.getOperand(1))) - LHSMask = LHS->getConstantOperandVal(1); - else - return SDValue(); +bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. + // FIXME: We should be able to handle f128 as well with a clever lowering. + if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) + return true; - // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask - // is or abandon the effort. - SDValue RHS = N->getOperand(1); - if (RHS.getOpcode() != ISD::AND) - return SDValue(); + if (VT == MVT::f64) + return AArch64_AM::getFP64Imm(Imm) != -1; + else if (VT == MVT::f32) + return AArch64_AM::getFP32Imm(Imm) != -1; + return false; +} - uint64_t RHSMask; - if (isa(RHS.getOperand(1))) - RHSMask = RHS->getConstantOperandVal(1); - else - return SDValue(); +//===----------------------------------------------------------------------===// +// AArch64 Optimization Hooks +//===----------------------------------------------------------------------===// - // Can't do anything if the masks are incompatible. 
- if (LHSMask & RHSMask) - return SDValue(); +//===----------------------------------------------------------------------===// +// AArch64 Inline Assembly Support +//===----------------------------------------------------------------------===// - // Now we need one of the masks to be a contiguous field. Without loss of - // generality that should be the RHS one. - SDValue Bitfield = LHS.getOperand(0); - if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { - // We know that LHS is a candidate new value, and RHS isn't already a better - // one. - std::swap(LHS, RHS); - std::swap(LHSMask, RHSMask); +// Table of Constraints +// TODO: This is the current set of constraints supported by ARM for the +// compiler, not all of them may make sense, e.g. S may be difficult to support. +// +// r - A general register +// w - An FP/SIMD register of some size in the range v0-v31 +// x - An FP/SIMD register of some size in the range v0-v15 +// I - Constant that can be used with an ADD instruction +// J - Constant that can be used with a SUB instruction +// K - Constant that can be used with a 32-bit logical instruction +// L - Constant that can be used with a 64-bit logical instruction +// M - Constant that can be used as a 32-bit MOV immediate +// N - Constant that can be used as a 64-bit MOV immediate +// Q - A memory reference with base register and no offset +// S - A symbolic address +// Y - Floating point constant zero +// Z - Integer constant zero +// +// Note that general register operands will be output using their 64-bit x +// register name, whatever the size of the variable, unless the asm operand +// is prefixed by the %w modifier. Floating-point and SIMD register operands +// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or +// %q modifier. + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +AArch64TargetLowering::ConstraintType +AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'z': + return C_Other; + case 'x': + case 'w': + return C_RegisterClass; + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as 'r'. + case 'Q': + return C_Memory; + } } + return TargetLowering::getConstraintType(Constraint); +} - // We've done our best to put the right operands in the right places, all we - // can do now is check whether a BFI exists. - Bitfield = RHS.getOperand(0); - int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); - if (LSB == -1) - return SDValue(); - - uint32_t Width = CountPopulation_64(RHSMask); - assert(Width && "Expected non-zero bitfield width"); +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +AArch64TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
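As context for the constraint letters documented above, a couple of hypothetical user-level uses (not part of the patch; they assume a GNU-style inline asm front end targeting AArch64):

    int add_imm(int a) {
      int r;
      // 'I': an immediate valid for ADD; %w prints the 32-bit register name.
      asm("add %w0, %w1, %2" : "=r"(r) : "r"(a), "I"(4095));
      return r;
    }

    float fadd_reg(float a, float b) {
      float r;
      // 'w': an FP/SIMD register; %s prints the 32-bit (single) form.
      asm("fadd %s0, %s1, %s2" : "=w"(r) : "w"(a), "w"(b));
      return r;
    }
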
+ switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'x': + case 'w': + if (type->isFloatingPointTy() || type->isVectorTy()) + weight = CW_Register; + break; + case 'z': + weight = CW_Constant; + break; + } + return weight; +} - SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, - LHS.getOperand(0), Bitfield, - DAG.getConstant(LSB, MVT::i64), - DAG.getConstant(Width, MVT::i64)); +std::pair +AArch64TargetLowering::getRegForInlineAsmConstraint( + const std::string &Constraint, MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::GPR64commonRegClass); + return std::make_pair(0U, &AArch64::GPR32commonRegClass); + case 'w': + if (VT == MVT::f32) + return std::make_pair(0U, &AArch64::FPR32RegClass); + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::FPR64RegClass); + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128RegClass); + break; + // The instructions that this constraint is designed for can + // only take 128-bit registers so just use that regclass. + case 'x': + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128_loRegClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); - // Mask is trivial - if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) - return BFI; + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair Res; + Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + unsigned Size = Constraint.size(); + if ((Size == 4 || Size == 5) && Constraint[0] == '{' && + tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { + const std::string Reg = + std::string(&Constraint[2], &Constraint[Size - 1]); + int RegNo = atoi(Reg.c_str()); + if (RegNo >= 0 && RegNo <= 31) { + // v0 - v31 are aliases of q0 - q31. + // By default we'll emit v0-v31 for this unless there's a modifier where + // we'll emit the correct register as well. + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } + } + } - return DAG.getNode(ISD::AND, DL, VT, BFI, - DAG.getConstant(LHSMask | RHSMask, VT)); + return Res; } -/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its -/// original input. This is surprisingly common because SROA splits things up -/// into i8 chunks, so the originally detected MaskedBFI may actually only act -/// on the low (say) byte of a word. This is then orred into the rest of the -/// word afterwards. -/// -/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). -/// -/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the -/// MaskedBFI. We can also deal with a certain amount of extend/truncate being -/// involved. -static SDValue tryCombineToLargerBFI(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // First job is to hunt for a MaskedBFI on either the left or right. Swap - // operands if it's actually on the right. 
- SDValue BFI; - SDValue PossExtraMask; - uint64_t ExistingMask = 0; - bool Extended = false; - if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) - PossExtraMask = N->getOperand(1); - else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) - PossExtraMask = N->getOperand(0); - else - return SDValue(); +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void AArch64TargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector &Ops, + SelectionDAG &DAG) const { + SDValue Result; - // We can only combine a BFI with another compatible mask. - if (PossExtraMask.getOpcode() != ISD::AND || - !isa(PossExtraMask.getOperand(1))) - return SDValue(); + // Currently only support length 1 constraints. + if (Constraint.length() != 1) + return; - uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: + break; - // Masks must be compatible. - if (ExtraMask & ExistingMask) - return SDValue(); + // This set of constraints deal with valid constants for various instructions. + // Validate and return a target constant for them if we can. + case 'z': { + // 'z' maps to xzr or wzr so it needs an input of 0. + ConstantSDNode *C = dyn_cast(Op); + if (!C || C->getZExtValue() != 0) + return; - SDValue OldBFIVal = BFI.getOperand(0); - SDValue NewBFIVal = BFI.getOperand(1); - if (Extended) { - // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be - // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments - // need to be made compatible. - assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 - && "Invalid types for BFI"); - OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); - NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); + if (Op.getValueType() == MVT::i64) + Result = DAG.getRegister(AArch64::XZR, MVT::i64); + else + Result = DAG.getRegister(AArch64::WZR, MVT::i32); + break; } - // We need the MaskedBFI to be combined with a mask of the *same* value. - if (PossExtraMask.getOperand(0) != OldBFIVal) - return SDValue(); + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + ConstantSDNode *C = dyn_cast(Op); + if (!C) + return; + + // Grab the value and do some validation. + uint64_t CVal = C->getZExtValue(); + switch (ConstraintLetter) { + // The I constraint applies only to simple ADD or SUB immediate operands: + // i.e. 0 to 4095 with optional shift by 12 + // The J constraint applies only to ADD or SUB immediates that would be + // valid when negated, i.e. if [an add pattern] were to be output as a SUB + // instruction [or vice versa], in other words -1 to -4095 with optional + // left shift by 12. + case 'I': + if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) + break; + return; + case 'J': { + uint64_t NVal = -C->getSExtValue(); + if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) + break; + return; + } + // The K and L constraints apply *only* to logical immediates, including + // what used to be the MOVI alias for ORR (though the MOVI alias has now + // been removed and MOV should be used). So these constraints have to + // distinguish between bit patterns that are valid 32-bit or 64-bit + // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but + // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice + // versa. 
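The immediate-constraint validation in this function reduces to simple bit tests. Below is a standalone sketch of the 'I'/'J' rule and of the extra single-MOVZ/MOVN patterns that 'M'/'N' accept beyond the 'K'/'L' logical immediates (which the patch checks architecturally via AArch64_AM::isLogicalImmediate). Plain C++ mirroring isUInt<12>/isShiftedUInt<12, 12> and the 16-bit-chunk checks; not LLVM code:

#include <cstdint>

// 'I': an ADD/SUB immediate is any 12-bit value, optionally shifted left by 12.
static bool isAddSubImmediate(uint64_t V) {
  return V <= 0xFFFULL || ((V & 0xFFFULL) == 0 && (V >> 12) <= 0xFFFULL);
}

// 'J': the negated value must qualify, so (add x, -N) can become (sub x, N).
static bool matchesConstraintJ(int64_t CVal) {
  return isAddSubImmediate(0ULL - static_cast<uint64_t>(CVal));
}

// 'M'/'N' additionally accept anything a single MOVZ or MOVN can materialise:
// one 16-bit chunk at a 16-bit-aligned position, or the complement of one
// (e.g. 0x12340000 and 0xffffedca for 32 bits, 0x1234000000000000 for 64 bits).
static bool isSingleMovzImm(uint64_t V, unsigned RegWidth) {
  for (unsigned Shift = 0; Shift < RegWidth; Shift += 16)
    if ((V & ~(0xFFFFULL << Shift)) == 0)
      return true;                      // all set bits live in one 16-bit slot
  return false;
}

static bool isSingleMovzOrMovnImm(uint64_t V, unsigned RegWidth) {
  uint64_t RegMask = (RegWidth == 64) ? ~0ULL : 0xFFFFFFFFULL;
  V &= RegMask;
  return isSingleMovzImm(V, RegWidth) ||
         isSingleMovzImm(~V & RegMask, RegWidth);   // MOVN covers the complement
}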
+ case 'K': + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + return; + case 'L': + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + return; + // The M and N constraints are a superset of K and L respectively, for use + // with the MOV (immediate) alias. As well as the logical immediates they + // also match 32 or 64-bit immediates that can be loaded either using a + // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca + // (M) or 64-bit 0x1234000000000000 (N) etc. + // As a note some of this code is liberally stolen from the asm parser. + case 'M': { + if (!isUInt<32>(CVal)) + return; + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + if ((CVal & 0xFFFF) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + uint64_t NCVal = ~(uint32_t)CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + return; + } + case 'N': { + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + if ((CVal & 0xFFFFULL) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + if ((CVal & 0xFFFF00000000ULL) == CVal) + break; + if ((CVal & 0xFFFF000000000000ULL) == CVal) + break; + uint64_t NCVal = ~CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF00000000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF000000000000ULL) == NCVal) + break; + return; + } + default: + return; + } - BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, - OldBFIVal, NewBFIVal, - BFI.getOperand(2), BFI.getOperand(3)); + // All assembler immediates are 64-bit integers. + Result = DAG.getTargetConstant(CVal, MVT::i64); + break; + } - // If the masking is trivial, we don't need to create it. - if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) - return BFI; + if (Result.getNode()) { + Ops.push_back(Result); + return; + } - return DAG.getNode(ISD::AND, DL, VT, BFI, - DAG.getConstant(ExtraMask | ExistingMask, VT)); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -/// An EXTR instruction is made up of two shifts, ORed together. This helper -/// searches for and classifies those shifts. -static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, - bool &FromHi) { - if (N.getOpcode() == ISD::SHL) - FromHi = false; - else if (N.getOpcode() == ISD::SRL) - FromHi = true; - else - return false; - - if (!isa(N.getOperand(1))) - return false; +//===----------------------------------------------------------------------===// +// AArch64 Advanced SIMD Support +//===----------------------------------------------------------------------===// - ShiftAmount = N->getConstantOperandVal(1); - Src = N->getOperand(0); - return true; +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), + V64Reg, DAG.getConstant(0, MVT::i32)); } -/// EXTR instruction extracts a contiguous chunk of bits from two existing -/// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. 
Can't quite be done in TableGen because the two immediates aren't -/// independent. -static SDValue tryCombineToEXTR(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); +/// getExtFactor - Determine the adjustment factor for the position when +/// generating an "extract from vector registers" instruction. +static unsigned getExtFactor(SDValue &V) { + EVT EltType = V.getValueType().getVectorElementType(); + return EltType.getSizeInBits() / 8; +} - assert(N->getOpcode() == ISD::OR && "Unexpected root"); +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + SDLoc DL(V128Reg); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); +} - SDValue LHS; - uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) - return SDValue(); +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. +SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); - SDValue RHS; - uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) - return SDValue(); + SmallVector SourceVecs; + SmallVector MinElts; + SmallVector MaxElts; - // If they're both trying to come from the high part of the register, they're - // not really an EXTR. - if (LHSFromHi == RHSFromHi) - return SDValue(); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } - if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) - return SDValue(); + // Record this extraction against the appropriate vector if possible... + SDValue SourceVec = V.getOperand(0); + unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + bool FoundSource = false; + for (unsigned j = 0; j < SourceVecs.size(); ++j) { + if (SourceVecs[j] == SourceVec) { + if (MinElts[j] > EltNo) + MinElts[j] = EltNo; + if (MaxElts[j] < EltNo) + MaxElts[j] = EltNo; + FoundSource = true; + break; + } + } - if (LHSFromHi) { - std::swap(LHS, RHS); - std::swap(ShiftLHS, ShiftRHS); + // Or record a new source if not... + if (!FoundSource) { + SourceVecs.push_back(SourceVec); + MinElts.push_back(EltNo); + MaxElts.push_back(EltNo); + } } - return DAG.getNode(AArch64ISD::EXTR, DL, VT, - LHS, RHS, - DAG.getConstant(ShiftRHS, MVT::i64)); -} - -/// Target-specific dag combine xforms for ISD::OR -static SDValue PerformORCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + // Currently only do something sane when at most two source vectors + // involved. 
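A minimal sketch of the per-source bookkeeping above (hypothetical standalone types rather than the SDValue-based code): each lane that is an extract is credited to its source vector, and the recorded span of lanes later decides whether one half of the source suffices or a real EXT is needed.

#include <vector>

struct SourceSpan {
  const void *Vec;          // stand-in for the source vector node
  unsigned MinElt, MaxElt;  // span of lanes extracted from this source
};

static void noteExtract(std::vector<SourceSpan> &Sources, const void *Vec,
                        unsigned EltNo) {
  for (SourceSpan &S : Sources) {
    if (S.Vec == Vec) {                 // known source: widen its span
      if (EltNo < S.MinElt) S.MinElt = EltNo;
      if (EltNo > S.MaxElt) S.MaxElt = EltNo;
      return;
    }
  }
  Sources.push_back({Vec, EltNo, EltNo});   // first extract from this source
}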
+ if (SourceVecs.size() > 2) return SDValue(); - // Attempt to recognise bitfield-insert operations. - SDValue Res = tryCombineToBFI(N, DCI, Subtarget); - if (Res.getNode()) - return Res; + SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; + int VEXTOffsets[2] = { 0, 0 }; + int OffsetMultipliers[2] = { 1, 1 }; + + // This loop extracts the usage patterns of the source vectors + // and prepares appropriate SDValues for a shuffle if possible. + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements(); + SDValue CurSource = SourceVecs[i]; + if (SourceVecs[i].getValueType().getVectorElementType() != + VT.getVectorElementType()) { + // It may hit this case if SourceVecs[i] is AssertSext/AssertZext. + // Then bitcast it to the vector which holds asserted element type, + // and record the multiplier of element width between SourceVecs and + // Build_vector which is needed to extract the correct lanes later. + EVT CastVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + SourceVecs[i].getValueSizeInBits() / + VT.getVectorElementType().getSizeInBits()); + + CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]); + OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts; + NumSrcElts *= OffsetMultipliers[i]; + MaxElts[i] *= OffsetMultipliers[i]; + MinElts[i] *= OffsetMultipliers[i]; + } - // Attempt to combine an existing MaskedBFI operation into one with a larger - // mask. - Res = tryCombineToLargerBFI(N, DCI, Subtarget); - if (Res.getNode()) - return Res; + if (CurSource.getValueType() == VT) { + // No VEXT necessary + ShuffleSrcs[i] = CurSource; + VEXTOffsets[i] = 0; + continue; + } else if (NumSrcElts < NumElts) { + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource, + DAG.getUNDEF(CurSource.getValueType())); + continue; + } - Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) - return Res; + // Since only 64-bit and 128-bit vectors are legal on ARM and + // we've eliminated the other cases... + assert(NumSrcElts == 2 * NumElts && + "unexpected vector sizes in ReconstructShuffle"); - if (!Subtarget->hasNEON()) - return SDValue(); + if (MaxElts[i] - MinElts[i] >= NumElts) { + // Span too large for a VEXT to cope + return SDValue(); + } - // Attempt to use vector immediate-form BSL - // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 
+ if (MinElts[i] >= NumElts) { + // The extraction can just take the second half + VEXTOffsets[i] = NumElts; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(NumElts)); + } else if (MaxElts[i] < NumElts) { + // The extraction can just take the first half + VEXTOffsets[i] = 0; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(0)); + } else { + // An actual VEXT is needed + VEXTOffsets[i] = MinElts[i]; + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(NumElts)); + unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); + ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, + DAG.getConstant(Imm, MVT::i32)); + } + } - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); + SmallVector Mask; - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } - if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { - APInt SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(1)); - APInt SplatBits0; - if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, - HasAnyUndefs) && - !HasAnyUndefs) { - BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); - APInt SplatBits1; - if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, - HasAnyUndefs) && !HasAnyUndefs && - SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && - SplatBits0 == ~SplatBits1) { - - return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1), - N0->getOperand(0), N1->getOperand(0)); - } + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = + cast(Op.getOperand(i).getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]); + } else { + Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts - + VEXTOffsets[1]); } } + // Final check before we try to produce nonsense... + if (isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], + &Mask[0]); + return SDValue(); } -/// Target-specific dag combine xforms for ISD::SRA -static SDValue PerformSRACombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // We're looking for an SRA/SHL pair which form an SBFX. +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; - if (!isa(N->getOperand(1))) - return SDValue(); + Imm = M[0]; - uint64_t ExtraSignBits = N->getConstantOperandVal(1); - SDValue Shift = N->getOperand(0); + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. 
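isSingletonEXTMask above is a rotation test: the mask must be 0..NumElts-1 rotated by Imm, with undef lanes accepted anywhere except lane 0. A standalone restatement (plain C++, not LLVM code):

// Returns true if Mask is a rotation of the identity; Imm receives the
// rotation amount in elements (the EXT immediate is Imm * element-size bytes).
static bool isRotatedIdentity(const int *Mask, unsigned NumElts, unsigned &Imm) {
  if (Mask[0] < 0)
    return false;                        // be conservative about a leading undef
  Imm = static_cast<unsigned>(Mask[0]);
  unsigned Expected = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    Expected = (Expected + 1) % NumElts; // wrap back to lane 0 and keep going
    if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Expected)
      return false;
  }
  return true;
}
// e.g. {1, 2, 3, 0} -> Imm = 1; on a <4 x i32> source that becomes EXT #4.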
+ unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, just follow it + // back to index zero and keep going. + ++ExpectedElt; + if (ExpectedElt == NumElts) + ExpectedElt = 0; - if (Shift.getOpcode() != ISD::SHL) - return SDValue(); + if (M[i] < 0) + continue; // ignore UNDEF indices + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } - if (!isa<ConstantSDNode>(Shift->getOperand(1))) - return SDValue(); + return true; +} - uint64_t BitsOnLeft = Shift->getConstantOperandVal(1); - uint64_t Width = VT.getSizeInBits() - ExtraSignBits; - uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft; +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are different. +static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, + unsigned &Imm) { + // Look for the first non-undef element. + const int *FirstRealElt = std::find_if(M.begin(), M.end(), + [](int Elt) {return Elt >= 0;}); - if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) - return SDValue(); + // Benefit from APInt to handle overflow when calculating the expected element. + unsigned NumElts = VT.getVectorNumElements(); + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + // The following shuffle indices must be the successive elements after the + // first real element. + const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), + [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); + if (FirstWrongElt != M.end()) + return false; - return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0), - DAG.getConstant(LSB, MVT::i64), - DAG.getConstant(LSB + Width - 1, MVT::i64)); -} + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. E.g. + // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. + // ExpectedElt is the last mask index plus 1. + Imm = ExpectedElt.getZExtValue(); + + // There are two different cases that require reversing the input vectors. + // For example, for vector <4 x i32> we have the following cases: + // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) + // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) + // For both cases, we finally use mask <5, 6, 7, 0>, which requires + // reversing the two input vectors. + if (Imm < NumElts) + ReverseEXT = true; + else + Imm -= NumElts; -/// Check if this is a valid build_vector for the immediate operand of -/// a vector shift operation, where all the elements of the build_vector -/// must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); return true; } -/// Check if this is a valid build_vector for the immediate operand of -/// a vector shift left operation.
That value must be in the range: -/// 0 <= Value < ElementBits -static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && Cnt < ElementBits); -} +/// isREVMask - Check if a vector shuffle corresponds to a REV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); -/// Check if this is a valid build_vector for the immediate operand of a -/// vector shift right operation. The value must be in the range: -/// 1 <= Value <= ElementBits -static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) return false; - return (Cnt >= 1 && Cnt <= ElementBits); -} -static SDValue GenForSextInreg(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - EVT SrcVT, EVT DestVT, EVT SubRegVT, - const int *Mask, SDValue Src) { - SelectionDAG &DAG = DCI.DAG; - SDValue Bitcast - = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src); - SDValue Sext - = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast); - SDValue ShuffleVec - = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask); - SDValue ExtractSubreg - = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), - SubRegVT, ShuffleVec, - DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0); - return ExtractSubreg; -} - -/// Checks for vector shifts and lowers them. -static SDValue PerformShiftCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *ST) { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64)) - return PerformSRACombine(N, DCI); + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; - // We're looking for an SRA/SHL pair to help generating instruction - // sshll v0.8h, v0.8b, #0 - // The instruction STXL is also the alias of this instruction. - // - // For example, for DAG like below, - // v2i32 = sra (v2i32 (shl v2i32, 16)), 16 - // we can transform it into - // v2i32 = EXTRACT_SUBREG - // (v4i32 (suffle_vector - // (v4i32 (sext (v4i16 (bitcast v2i32))), - // undef, (0, 2, u, u)), - // sub_64 - // - // With this transformation we expect to generate "SSHLL + UZIP1" - // Sometimes UZIP1 can be optimized away by combining with other context. 
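The per-lane check inside isREVMask encodes "reverse the elements within each block of BlockSize bits". A small generator for the fully-specified masks it accepts, using the same index formula as the patch (standalone C++, not LLVM code):

#include <vector>

static std::vector<int> makeRevMask(unsigned NumElts, unsigned EltBits,
                                    unsigned BlockBits) {
  unsigned BlockElts = BlockBits / EltBits;
  std::vector<int> Mask(NumElts);
  for (unsigned i = 0; i < NumElts; ++i)
    Mask[i] = (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts);
  return Mask;
}
// makeRevMask(8, 8, 32)  -> {3,2,1,0, 7,6,5,4}      (REV32 on v8i8)
// makeRevMask(8, 16, 64) -> {3,2,1,0, 7,6,5,4}      (REV64 on v8i16)
// makeRevMask(16, 8, 64) -> {7,...,0, 15,...,8}     (REV64 on v16i8)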
- int64_t ShrCnt, ShlCnt; - if (N->getOpcode() == ISD::SRA - && (VT == MVT::v2i32 || VT == MVT::v4i16) - && isVShiftRImm(N->getOperand(1), VT, ShrCnt) - && N->getOperand(0).getOpcode() == ISD::SHL - && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) { - SDValue Src = N->getOperand(0).getOperand(0); - if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) { - // sext_inreg(v2i32, v2i16) - // We essentially only care the Mask {0, 2, u, u} - int Mask[4] = {0, 2, 4, 6}; - return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32, - Mask, Src); - } - else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) { - // sext_inreg(v2i16, v2i8) - // We essentially only care the Mask {0, u, 4, u, u, u, u, u, u, u, u, u} - int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14}; - return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32, - Mask, Src); - } - else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) { - // sext_inreg(v4i16, v4i8) - // We essentially only care the Mask {0, 2, 4, 6, u, u, u, u, u, u, u, u} - int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14}; - return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16, - Mask, Src); - } - } + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; - // Nothing to be done for scalar shifts. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!VT.isVector() || !TLI.isTypeLegal(VT)) - return SDValue(); + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } - assert(ST->hasNEON() && "unexpected vector shift"); - int64_t Cnt; + return true; +} - switch (N->getOpcode()) { - default: - llvm_unreachable("unexpected shift opcode"); +static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) + return false; + Idx += 1; + } - case ISD::SHL: - if (isVShiftLImm(N->getOperand(1), VT, Cnt)) { - SDValue RHS = - DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, - DAG.getConstant(Cnt, MVT::i32)); - return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS); - } - break; + return true; +} - case ISD::SRA: - case ISD::SRL: - if (isVShiftRImm(N->getOperand(1), VT, Cnt)) { - SDValue RHS = - DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, - DAG.getConstant(Cnt, MVT::i32)); - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS); - } - break; +static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != 2 * i + WhichResult) + return false; } - return SDValue(); + return true; } -/// ARM-specific DAG combining for intrinsics. -static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { - unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); +static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 
0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) + return false; + } + return true; +} - switch (IntNo) { - default: - // Don't do anything for most intrinsics. - break; +/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. +static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) + return false; + Idx += 1; + } - case Intrinsic::arm_neon_vqshifts: - case Intrinsic::arm_neon_vqshiftu: - EVT VT = N->getOperand(1).getValueType(); - int64_t Cnt; - if (!isVShiftLImm(N->getOperand(2), VT, Cnt)) - break; - unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts) - ? AArch64ISD::NEON_QSHLs - : AArch64ISD::NEON_QSHLu; - return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), - N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + return true; +} + +/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned)MIdx != Idx) + return false; + Idx += 2; + } } - return SDValue(); + return true; } -/// Target-specific DAG combine function for NEON load/store intrinsics -/// to merge base address updates. -static SDValue CombineBaseUpdate(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); +/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) + return false; + } + return true; +} - SelectionDAG &DAG = DCI.DAG; - bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || - N->getOpcode() == ISD::INTRINSIC_W_CHAIN); - unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); - SDValue Addr = N->getOperand(AddrOpIdx); +static bool isINSMask(ArrayRef M, int NumInputElements, + bool &DstIsLeft, int &Anomaly) { + if (M.size() != static_cast(NumInputElements)) + return false; - // Search for a use of the address operand that is an increment. 
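For a concrete picture of the two-operand masks classified above, with <4 x i32> operands numbered 0-3 (first input) and 4-7 (second input): ZIP1 = <0,4,1,5>, ZIP2 = <2,6,3,7>, UZP1 = <0,2,4,6>, UZP2 = <1,3,5,7>, TRN1 = <0,4,2,6>, TRN2 = <1,5,3,7>; the *_v_undef variants accept the same shapes folded onto a single input, e.g. <0,0,1,1> for ZIP1 of a vector with itself. A tiny self-check of one of them (standalone illustration, not LLVM code):

#include <cassert>

// Mirrors isZIPMask for the concrete case NumElts == 4, WhichResult == 0.
static bool isZip1MaskV4(const int M[4]) {
  const int Expected[4] = {0, 4, 1, 5};   // interleave the low halves
  for (int i = 0; i < 4; ++i)
    if (M[i] >= 0 && M[i] != Expected[i]) // -1 (undef) lanes always match
      return false;
  return true;
}

int main() {
  const int Zip1[4] = {0, 4, 1, 5};
  const int Uzp1[4] = {0, 2, 4, 6};
  assert(isZip1MaskV4(Zip1) && !isZip1MaskV4(Uzp1));
  return 0;
}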
- for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD || - UI.getUse().getResNo() != Addr.getResNo()) - continue; + int NumLHSMatch = 0, NumRHSMatch = 0; + int LastLHSMismatch = -1, LastRHSMismatch = -1; - // Check that the add is independent of the load/store. Otherwise, folding - // it would create a cycle. - if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + for (int i = 0; i < NumInputElements; ++i) { + if (M[i] == -1) { + ++NumLHSMatch; + ++NumRHSMatch; continue; - - // Find the new opcode for the updating load/store. - bool isLoad = true; - bool isLaneOp = false; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - if (isIntrinsic) { - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; - NumVecs = 1; break; - case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; - NumVecs = 2; break; - case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD; - NumVecs = 3; break; - case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; - NumVecs = 4; break; - case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; - NumVecs = 1; isLoad = false; break; - case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; - NumVecs = 2; isLoad = false; break; - case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; - NumVecs = 3; isLoad = false; break; - case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; - NumVecs = 4; isLoad = false; break; - case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; - NumVecs = 2; break; - case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; - NumVecs = 3; break; - case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; - NumVecs = 4; break; - case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; - NumVecs = 2; isLoad = false; break; - case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; - NumVecs = 3; isLoad = false; break; - case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; - NumVecs = 4; isLoad = false; break; - case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD; - NumVecs = 2; isLaneOp = true; break; - case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD; - NumVecs = 3; isLaneOp = true; break; - case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD; - NumVecs = 4; isLaneOp = true; break; - case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD; - NumVecs = 2; isLoad = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD; - NumVecs = 3; isLoad = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD; - NumVecs = 4; isLoad = false; isLaneOp = true; break; - } - } else { - isLaneOp = true; - switch (N->getOpcode()) { - default: llvm_unreachable("unexpected opcode for Neon base update"); - case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD; - NumVecs = 2; break; - case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD; - NumVecs = 3; break; - case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD; - NumVecs = 4; break; - } } - // Find the size of memory referenced by the load/store. 
- EVT VecTy; - if (isLoad) - VecTy = N->getValueType(0); + if (M[i] == i) + ++NumLHSMatch; else - VecTy = N->getOperand(AddrOpIdx + 1).getValueType(); - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (isLaneOp) - NumBytes /= VecTy.getVectorNumElements(); - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - if (IncVal != NumBytes) - continue; - Inc = DAG.getTargetConstant(IncVal, MVT::i32); - } - - // Create the new updating load/store node. - EVT Tys[6]; - unsigned NumResultVecs = (isLoad ? NumVecs : 0); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; - Tys[n++] = MVT::i64; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - SmallVector Ops; - Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(AddrOpIdx)); - Ops.push_back(Inc); - for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { - Ops.push_back(N->getOperand(i)); - } - MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, - Ops, MemInt->getMemoryVT(), - MemInt->getMemOperand()); + LastLHSMismatch = i; - // Update the uses. - std::vector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { - NewResults.push_back(SDValue(UpdN.getNode(), i)); - } - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + if (M[i] == i + NumInputElements) + ++NumRHSMatch; + else + LastRHSMismatch = i; + } - break; + if (NumLHSMatch == NumInputElements - 1) { + DstIsLeft = true; + Anomaly = LastLHSMismatch; + return true; + } else if (NumRHSMatch == NumInputElements - 1) { + DstIsLeft = false; + Anomaly = LastRHSMismatch; + return true; } - return SDValue(); + + return false; } -/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) -/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs. -/// If so, combine them to a vldN-dup operation and return true. -static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); +static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { + if (VT.getSizeInBits() != 128) + return false; - // Check if the VDUPLANE operand is a vldN-dup intrinsic. - SDNode *VLD = N->getOperand(0).getNode(); - if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) - return SDValue(); - unsigned NumVecs = 0; - unsigned NewOpc = 0; - unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); - if (IntNo == Intrinsic::arm_neon_vld2lane) { - NumVecs = 2; - NewOpc = AArch64ISD::NEON_LD2DUP; - } else if (IntNo == Intrinsic::arm_neon_vld3lane) { - NumVecs = 3; - NewOpc = AArch64ISD::NEON_LD3DUP; - } else if (IntNo == Intrinsic::arm_neon_vld4lane) { - NumVecs = 4; - NewOpc = AArch64ISD::NEON_LD4DUP; - } else { - return SDValue(); - } + unsigned NumElts = VT.getVectorNumElements(); - // First check that all the vldN-lane uses are VDUPLANEs and that the lane - // numbers match the load. - unsigned VLDLaneNo = - cast(VLD->getOperand(NumVecs + 3))->getZExtValue(); - for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); - UI != UE; ++UI) { - // Ignore uses of the chain result. 
- if (UI.getUse().getResNo() == NumVecs) - continue; - SDNode *User = *UI; - if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE || - VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) - return SDValue(); + for (int I = 0, E = NumElts / 2; I != E; I++) { + if (Mask[I] != I) + return false; } - // Create the vldN-dup node. - EVT Tys[5]; - unsigned n; - for (n = 0; n < NumVecs; ++n) - Tys[n] = VT; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1)); - SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; - MemIntrinsicSDNode *VLDMemInt = cast(VLD); - SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, - VLDMemInt->getMemoryVT(), - VLDMemInt->getMemOperand()); - - // Update the uses. - for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); - UI != UE; ++UI) { - unsigned ResNo = UI.getUse().getResNo(); - // Ignore uses of the chain result. - if (ResNo == NumVecs) - continue; - SDNode *User = *UI; - DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + int Offset = NumElts / 2; + for (int I = NumElts / 2, E = NumElts; I != E; I++) { + if (Mask[I] != I + SplitLHS * Offset) + return false; } - // Now the vldN-lane intrinsic is dead except for its chain result. - // Update uses of the chain. - std::vector VLDDupResults; - for (unsigned n = 0; n < NumVecs; ++n) - VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); - VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); - DCI.CombineTo(VLD, VLDDupResults); - - return SDValue(N, 0); + return true; } -// vselect (v1i1 setcc) -> -// vselect (v1iXX setcc) (XX is the size of the compared operand type) -// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as -// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine -// such VSELECT. -static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT CCVT = N0.getValueType(); +static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + ArrayRef Mask = cast(Op)->getMask(); - if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || - CCVT.getVectorElementType() != MVT::i1) + if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || + VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); - EVT ResVT = N->getValueType(0); - EVT CmpVT = N0.getOperand(0).getValueType(); - // Only combine when the result type is of the same size as the compared - // operands. 
- if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) + bool SplitV0 = V0.getValueType().getSizeInBits() == 128; + + if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - N0.getOperand(0), N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, - IfTrue, IfFalse); + EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + if (SplitV0) { + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, + DAG.getConstant(0, MVT::i64)); + } + if (V1.getValueType().getSizeInBits() == 128) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, + DAG.getConstant(0, MVT::i64)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } -// sign_extend (extract_vector_elt (v1i1 setcc)) -> -// extract_vector_elt (v1iXX setcc) -// (XX is the size of the compared operand type) -static SDValue PerformSignExtendCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - SDValue Vec = N0.getOperand(0); - - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Vec.getOpcode() != ISD::SETCC) - return SDValue(); +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); + + enum { + OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1 * 9 + 2) * 9 + 3) + return LHS; + assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); + return RHS; + } - EVT ResVT = N->getValueType(0); - EVT CmpVT = Vec.getOperand(0).getValueType(); - // Only optimize when the result type is of the same size as the element - // type of the compared operand. - if (ResVT.getSizeInBits() != CmpVT.getVectorElementType().getSizeInBits()) - return SDValue(); + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); - SDValue Lane = N0.getOperand(1); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - Vec.getOperand(0), Vec.getOperand(1), - cast(Vec.getOperand(2))->get()); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ResVT, - SetCC, Lane); + switch (OpNum) { + default: + llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. 
+ if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); + // vrev <4 x i16> -> REV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); + // vrev <4 x i8> -> REV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: { + EVT EltTy = VT.getVectorElementType(); + unsigned Opcode; + if (EltTy == MVT::i8) + Opcode = AArch64ISD::DUPLANE8; + else if (EltTy == MVT::i16) + Opcode = AArch64ISD::DUPLANE16; + else if (EltTy == MVT::i32 || EltTy == MVT::f32) + Opcode = AArch64ISD::DUPLANE32; + else if (EltTy == MVT::i64 || EltTy == MVT::f64) + Opcode = AArch64ISD::DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); + + if (VT.getSizeInBits() == 64) + OpLHS = WidenVector(OpLHS, DAG); + SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); + return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); + } + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: { + unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); + return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, + DAG.getConstant(Imm, MVT::i32)); + } + case OP_VUZPL: + return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VUZPR: + return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPL: + return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPR: + return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNL: + return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNR: + return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + } } -SDValue -AArch64TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - switch (N->getOpcode()) { - default: break; - case ISD::AND: return PerformANDCombine(N, DCI); - case ISD::OR: return PerformORCombine(N, DCI, getSubtarget()); - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: - return PerformShiftCombine(N, DCI, getSubtarget()); - case ISD::VSELECT: return PerformVSelectCombine(N, DCI.DAG); - case ISD::SIGN_EXTEND: return PerformSignExtendCombine(N, DCI.DAG); - case ISD::INTRINSIC_WO_CHAIN: - return PerformIntrinsicCombine(N, DCI.DAG); - case AArch64ISD::NEON_VDUPLANE: - return CombineVLDDUP(N, DCI); - case AArch64ISD::NEON_LD2DUP: - case AArch64ISD::NEON_LD3DUP: - case AArch64ISD::NEON_LD4DUP: - return CombineBaseUpdate(N, DCI); - case ISD::INTRINSIC_VOID: - case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::arm_neon_vld1: - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: - case Intrinsic::aarch64_neon_vld1x2: - case Intrinsic::aarch64_neon_vld1x3: - case Intrinsic::aarch64_neon_vld1x4: - case Intrinsic::aarch64_neon_vst1x2: - case Intrinsic::aarch64_neon_vst1x3: - case Intrinsic::aarch64_neon_vst1x4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: - return CombineBaseUpdate(N, DCI); - default: - break; 
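GeneratePerfectShuffle above consumes 32-bit table entries whose layout can be read off the decode at the top of the function and the cost check in its caller: a cost in the top two bits, a 4-bit operation selector (the OP_* enum), and two 13-bit identifiers naming the sub-shuffles to build first. A standalone decoder for that layout (an illustration, not LLVM code):

#include <cstdint>

struct PerfectShuffleEntry {
  unsigned Cost;   // bits 31-30: the cost consulted by the caller (PFEntry >> 30)
  unsigned OpNum;  // bits 29-26: which OP_* to emit at this step
  unsigned LHSID;  // bits 25-13: table index describing the LHS sub-shuffle
  unsigned RHSID;  // bits 12-0:  table index describing the RHS sub-shuffle
};

static PerfectShuffleEntry decodePerfectShuffle(uint32_t PFEntry) {
  PerfectShuffleEntry E;
  E.Cost  = (PFEntry >> 30) & 0x3;
  E.OpNum = (PFEntry >> 26) & 0xF;
  E.LHSID = (PFEntry >> 13) & 0x1FFF;
  E.RHSID = PFEntry & 0x1FFF;
  return E;
}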
+static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the TBL instruction. + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc DL(Op); + + EVT EltVT = Op.getValueType().getVectorElementType(); + unsigned BytesPerElt = EltVT.getSizeInBits() / 8; + + SmallVector TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); } } - return SDValue(); -} -bool -AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); + MVT IndexVT = MVT::v8i8; + unsigned IndexLen = 8; + if (Op.getValueType().getSizeInBits() == 128) { + IndexVT = MVT::v16i8; + IndexLen = 16; + } - if (!VT.isSimple()) - return false; + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); + SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f16: - case MVT::f32: - case MVT::f64: - return true; - case MVT::f128: - return false; - default: - break; + SDValue Shuffle; + if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (IndexLen == 8) + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + if (IndexLen == 8) { + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + // FIXME: We cannot, for the moment, emit a TBL2 instruction because we + // cannot currently represent the register constraints on the input + // table registers. + // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, + // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + // &TBLMask[0], IndexLen)); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } } - - return false; + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } -bool AArch64TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - bool *Fast) const { - const AArch64Subtarget *Subtarget = getSubtarget(); - // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus - bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); - - switch (VT.getSimpleVT().SimpleTy) { - default: - return false; - // Scalar types - case MVT::i8: case MVT::i16: - case MVT::i32: case MVT::i64: - case MVT::f32: case MVT::f64: { - // Unaligned access can use (for example) LRDB, LRDH, LDRW - if (AllowsUnaligned) { - if (Fast) - *Fast = true; - return true; - } - return false; - } - // 64-bit vector types - case MVT::v8i8: case MVT::v4i16: - case MVT::v2i32: case MVT::v1i64: - case MVT::v2f32: case MVT::v1f64: - // 128-bit vector types - case MVT::v16i8: case MVT::v8i16: - case MVT::v4i32: case MVT::v2i64: - case MVT::v4f32: case MVT::v2f64: { - // For any little-endian targets with neon, we can support unaligned - // load/store of V registers using ld1/st1. 
- // A big-endian target may also explicitly support unaligned accesses - if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { - if (Fast) - *Fast = true; - return true; - } - return false; - } - } +static unsigned getDUPLANEOp(EVT EltType) { + if (EltType == MVT::i8) + return AArch64ISD::DUPLANE8; + if (EltType == MVT::i16) + return AArch64ISD::DUPLANE16; + if (EltType == MVT::i32 || EltType == MVT::f32) + return AArch64ISD::DUPLANE32; + if (EltType == MVT::i64 || EltType == MVT::f64) + return AArch64ISD::DUPLANE64; + + llvm_unreachable("Invalid vector element type?"); } -// Check whether a shuffle_vector could be presented as concat_vector. -bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG, - SDValue V0, SDValue V1, - const int *Mask, - SDValue &Res) const { - SDLoc DL(Op); +SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); EVT VT = Op.getValueType(); - if (VT.getSizeInBits() != 128) - return false; - if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || - VT.getVectorElementType() != V1.getValueType().getVectorElementType()) - return false; - unsigned NumElts = VT.getVectorNumElements(); - bool isContactVector = true; - bool splitV0 = false; - if (V0.getValueType().getSizeInBits() == 128) - splitV0 = true; + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); - for (int I = 0, E = NumElts / 2; I != E; I++) { - if (Mask[I] != I) { - isContactVector = false; - break; - } + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + ArrayRef<int> ShuffleMask = SVN->getMask(); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], + V1.getValueType().getSimpleVT())) { + int Lane = SVN->getSplatIndex(); + // If this is an undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) + Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), + V1.getOperand(0)); + // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- + // constant. If so, we can just reference the lane's definition directly. + if (V1.getOpcode() == ISD::BUILD_VECTOR && + !isa<ConstantSDNode>(V1.getOperand(Lane))) + return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); + + // Otherwise, duplicate from the lane of the input vector. + unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); + + // SelectionDAGBuilder may have "helpfully" already extracted or concatenated + // to make a vector of the same size as this SHUFFLE. We can ignore the + // extract entirely, and canonicalise the concat using WidenVector.
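When the splat source has already been widened or split by SelectionDAGBuilder, the lane number has to be re-based before the DUPLANE node is built; the CONCAT_VECTORS branch just below amounts to the following arithmetic (standalone sketch, not LLVM code):

#include <cassert>

// A lane into a two-operand concat maps to (operand index, lane within it).
static void remapConcatLane(int Lane, int NumConcatElts, int &WhichOp,
                            int &NewLane) {
  WhichOp = Lane >= NumConcatElts / 2 ? 1 : 0;   // 0 = low half, 1 = high half
  NewLane = Lane - WhichOp * (NumConcatElts / 2);
}

int main() {
  int Op, L;
  remapConcatLane(5, 8, Op, L);   // lane 5 of an 8-lane concat
  assert(Op == 1 && L == 1);      // comes from lane 1 of the second operand
  return 0;
}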
+ if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + Lane += cast(V1.getOperand(1))->getZExtValue(); + V1 = V1.getOperand(0); + } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { + unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; + Lane -= Idx * VT.getVectorNumElements() / 2; + V1 = WidenVector(V1.getOperand(Idx), DAG); + } else if (VT.getSizeInBits() == 64) + V1 = WidenVector(V1, DAG); + + return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); } - if (isContactVector) { - int offset = NumElts / 2; - for (int I = NumElts / 2, E = NumElts; I != E; I++) { - if (Mask[I] != I + splitV0 * offset) { - isContactVector = false; - break; - } + if (isREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); + if (isREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); + if (isREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); + + bool ReverseEXT = false; + unsigned Imm; + if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { + if (ReverseEXT) + std::swap(V1, V2); + Imm *= getExtFactor(V1); + return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } else if (V2->getOpcode() == ISD::UNDEF && + isSingletonEXTMask(ShuffleMask, VT, Imm)) { + Imm *= getExtFactor(V1); + return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, + DAG.getConstant(Imm, MVT::i32)); + } + + unsigned WhichResult; + if (isZIPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + if (isUZPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + if (isTRNMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + + SDValue Concat = tryFormConcatFromShuffle(Op, DAG); + if (Concat.getNode()) + return Concat; + + bool DstIsLeft; + int Anomaly; + int NumInputElements = V1.getValueType().getVectorNumElements(); + if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { + SDValue DstVec = DstIsLeft ? 
V1 : V2; + SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); + + SDValue SrcVec = V1; + int SrcLane = ShuffleMask[Anomaly]; + if (SrcLane >= NumInputElements) { + SrcVec = V2; + SrcLane -= VT.getVectorNumElements(); } + SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); + + EVT ScalarVT = VT.getVectorElementType(); + if (ScalarVT.getSizeInBits() < 32) + ScalarVT = MVT::i32; + + return DAG.getNode( + ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), + DstLaneV); } - if (isContactVector) { - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - NumElts / 2); - if (splitV0) { - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, - DAG.getConstant(0, MVT::i64)); + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; } - if (V1.getValueType().getSizeInBits() == 128) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, - DAG.getConstant(0, MVT::i64)); + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + return GenerateTBL(Op, ShuffleMask, DAG); +} + +static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, + APInt &UndefBits) { + EVT VT = BVN->getValueType(0); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; + + for (unsigned i = 0; i < NumSplats; ++i) { + CnstBits <<= SplatBitSize; + UndefBits <<= SplatBitSize; + CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); + UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); } - Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); + return true; } + return false; } -// Check whether a Build Vector could be presented as Shuffle Vector. -// This Shuffle Vector maybe not legalized, so the length of its operand and -// the length of result may not equal. -bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, - SDValue &V0, SDValue &V1, - int *Mask) const { - SDLoc DL(Op); +SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(1).getNode()); + SDValue LHS = Op.getOperand(0); + SDLoc dl(Op); EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned V0NumElts = 0; - // Check if all elements are extracted from less than 3 vectors. 
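The index into the perfect-shuffle table computed above treats each of the four lanes as a base-9 digit, with 8 standing for an undef lane. A standalone restatement plus a worked value (not LLVM code):

static unsigned perfectShuffleIndex(const int Mask[4]) {
  unsigned Index = 0;
  for (int i = 0; i < 4; ++i) {
    unsigned Digit = Mask[i] < 0 ? 8u : static_cast<unsigned>(Mask[i]);
    Index = Index * 9 + Digit;      // PFIndexes[0]*9^3 + ... + PFIndexes[3]
  }
  return Index;
}
// e.g. the zip1 mask {0, 4, 1, 5} gives ((0*9 + 4)*9 + 1)*9 + 5 = 338.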
- for (unsigned i = 0; i < NumElts; ++i) { - SDValue Elt = Op.getOperand(i); - if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Elt.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) - return false; + if (!BVN) + return Op; - if (!V0.getNode()) { - V0 = Elt.getOperand(0); - V0NumElts = V0.getValueType().getVectorNumElements(); - } - if (Elt.getOperand(0) == V0) { - Mask[i] = (cast(Elt->getOperand(1))->getZExtValue()); - continue; - } else if (!V1.getNode()) { - V1 = Elt.getOperand(0); - } - if (Elt.getOperand(0) == V1) { - unsigned Lane = cast(Elt->getOperand(1))->getZExtValue(); - Mask[i] = (Lane + V0NumElts); - continue; - } else { - return false; + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We only have BIC vector immediate instruction, which is and-not. + CnstBits = ~CnstBits; + + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = ~UndefBits; + goto AttemptModImm; } + +// We can always fall back to a non-immediate AND. +FailedModImm: + return Op; +} + +// Specialized code to quickly find if PotentialBVec is a BuildVector that +// consists of only the same constant int value, returned in reference arg +// ConstVal +static bool isAllConstantBuildVector(const SDValue &PotentialBVec, + uint64_t &ConstVal) { + BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); + if (!Bvec) + return false; + ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); + if (!FirstElt) + return false; + EVT VT = Bvec->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 1; i < NumElts; ++i) + if (dyn_cast(Bvec->getOperand(i)) != FirstElt) + return false; + ConstVal = FirstElt->getZExtValue(); return true; } -// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. -SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a quad-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; +static unsigned getIntrinsicID(const SDNode *N) { + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + default: + return Intrinsic::not_intrinsic; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast(N->getOperand(0))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return IID; + return Intrinsic::not_intrinsic; + } + } +} - assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Tmp3 = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); +// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), +// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a +// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. +// Also, logical shift right -> sri, with the same structure. +static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + if (!VT.isVector()) + return SDValue(); - SDValue A64cc; - SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt, - DAG.getConstant(0, MVT::i64), - ISD::SETGE, A64cc, - DAG, dl); + SDLoc DL(N); - SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - DAG.getConstant(0, Tmp3.getValueType()), Tmp3, - A64cc); - SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - TrueVal, FalseVal, A64cc); + // Is the first op an AND? 
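
// Illustrative sketch of the retry pattern behind the AttemptModImm/
// FailedModImm gotos used by the modified-immediate lowerings here: the
// constant bits are matched first, and if no immediate form fits, one more
// attempt is made with the undef lanes resolved the other way before the
// caller keeps the generic, non-immediate node. Exposition-only helper, not
// part of the backend.
#include <cstdint>

static bool tryModImmTwice(uint64_t CnstBits, uint64_t UndefToggledBits,
                           bool (*MatchesSomeForm)(uint64_t)) {
  for (int Attempt = 0; Attempt != 2; ++Attempt) {
    if (MatchesSomeForm(CnstBits))
      return true;                 // some immediate encoding fits
    CnstBits = UndefToggledBits;   // second try: undef lanes resolved differently
  }
  return false;                    // fall back to the plain vector operation
}
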
+ const SDValue And = N->getOperand(0); + if (And.getOpcode() != ISD::AND) + return SDValue(); - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); + // Is the second op an shl or lshr? + SDValue Shift = N->getOperand(1); + // This will have been turned into: AArch64ISD::VSHL vector, #shift + // or AArch64ISD::VLSHR vector, #shift + unsigned ShiftOpc = Shift.getOpcode(); + if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) + return SDValue(); + bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + + // Is the shift amount constant? + ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); + if (!C2node) + return SDValue(); + + // Is the and mask vector all constant? + uint64_t C1; + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + + // Is C1 == ~C2, taking into account how much one can shift elements of a + // particular size? + uint64_t C2 = C2node->getZExtValue(); + unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); + if (C2 > ElemSizeInBits) + return SDValue(); + unsigned ElemMask = (1 << ElemSizeInBits) - 1; + if ((C1 & ElemMask) != (~C2 & ElemMask)) + return SDValue(); + + SDValue X = And.getOperand(0); + SDValue Y = Shift.getOperand(0); + + unsigned Intrin = + IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; + SDValue ResultSLI = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); + + DEBUG(dbgs() << "aarch64-lower: transformed: \n"); + DEBUG(N->dump(&DAG)); + DEBUG(dbgs() << "into: \n"); + DEBUG(ResultSLI->dump(&DAG)); + + ++NumShiftInserts; + return ResultSLI; } -/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. -SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a quad-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); +SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, + SelectionDAG &DAG) const { + // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) + if (EnableAArch64SlrGeneration) { + SDValue Res = tryLowerToSLI(Op.getNode(), DAG); + if (Res.getNode()) + return Res; + } + + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(0).getNode()); + SDValue LHS = Op.getOperand(1); SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); + EVT VT = Op.getValueType(); - assert(Op.getOpcode() == ISD::SHL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue Tmp4 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + // OR commutes, so try swapping the operands. 
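
// Illustrative restatement of the element-wise guard tryLowerToSLI applies
// above, written with 64-bit mask arithmetic so it stays well defined for 32-
// and 64-bit elements. C1 is the AND-mask constant, C2 the constant shift
// amount; the rewrite is only attempted when C1 == ~C2 within the element
// width. Exposition-only helper, not part of the backend.
#include <cstdint>

static bool sliMaskGuard(uint64_t C1, uint64_t C2, unsigned ElemSizeInBits) {
  if (C2 > ElemSizeInBits)
    return false;
  uint64_t ElemMask =
      (ElemSizeInBits >= 64) ? ~0ULL : ((1ULL << ElemSizeInBits) - 1);
  return (C1 & ElemMask) == (~C2 & ElemMask);
}
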
+ if (!BVN) { + LHS = Op.getOperand(0); + BVN = dyn_cast(Op.getOperand(1).getNode()); + } + if (!BVN) + return Op; - SDValue A64cc; - SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt, - DAG.getConstant(0, MVT::i64), - ISD::SETGE, A64cc, - DAG, dl); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } - SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - DAG.getConstant(0, Tmp4.getValueType()), Tmp4, - A64cc); - SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - Tmp3, FalseVal, A64cc); + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); -} + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } -// If this is a case we can't handle, return null and let the default -// expansion code take care of it. -SDValue -AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, - const AArch64Subtarget *ST) const { + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate OR. +FailedModImm: + return Op; +} + +SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { BuildVectorSDNode *BVN = cast(Op.getNode()); - SDLoc DL(Op); + SDLoc dl(Op); EVT VT = Op.getValueType(); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - - unsigned UseNeonMov = VT.getSizeInBits() >= 64; - - // Note we favor lowering MOVI over MVNI. - // This has implications on the definition of patterns in TableGen to select - // BIC immediate instructions but not ORR immediate instructions. - // If this lowering order is changed, TableGen patterns for BIC immediate and - // ORR immediate instructions have to be updated. - if (UseNeonMov && - BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize <= 64) { - // First attempt to use vector immediate-form MOVI - EVT NeonMovVT; - unsigned Imm = 0; - unsigned OpCmode = 0; - - if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG, VT.is128BitVector(), - Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) { - SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); - SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); - - if (ImmVal.getNode() && OpCmodeVal.getNode()) { - SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT, - ImmVal, OpCmodeVal); - return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + // Certain magic vector constants (used to express things like NOT + // and NEG) are passed through unmodified. This allows codegen patterns + // for these operations to match. Special-purpose patterns will lower + // these immediates to MOVIs if it proves necessary. + if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) + return Op; + + // The many faces of MOVI... + if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); + if (VT.getSizeInBits() == 128) { + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } + + // Support the V64 version via subregister insertion. 
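
// Illustrative expansion of the shifted-immediate MOVI forms handled below,
// inferred from the operands being built (an 8-bit payload plus a shift of
// 0/8/16/24): the byte is shifted within each 32-bit lane and the lane is
// replicated across the register. Exposition-only helper, not part of the
// backend.
#include <cstdint>

static uint64_t expandMovi32ShiftedImm(uint8_t Value, unsigned ShiftInBits) {
  uint64_t Lane = (uint64_t)Value << ShiftInBits; // one 32-bit lane
  return Lane | (Lane << 32);                     // replicated 64-bit chunk
}
// e.g. expandMovi32ShiftedImm(0xAB, 8) == 0x0000AB000000AB00
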
+ SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } - // Then attempt to use vector immediate-form MVNI - uint64_t NegatedImm = (~SplatBits).getZExtValue(); - if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, - DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT, - Imm, OpCmode)) { - SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); - SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); - if (ImmVal.getNode() && OpCmodeVal.getNode()) { - SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT, - ImmVal, OpCmodeVal); - return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); - } + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } - // Attempt to use vector immediate-form FMOV - if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) || - (VT == MVT::v2f64 && SplatBitSize == 64)) { - APFloat RealVal( - SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble, - SplatBits); - uint32_t ImmVal; - if (A64Imms::isFPImm(RealVal, ImmVal)) { - SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); - return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val); - } + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; + SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The few faces of FMOV... + if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && + VT.getSizeInBits() == 128) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The many faces of MVNI... + CnstVal = ~CnstVal; + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } } - } + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } +FailedModImm: + + // Scan through the operands to find some interesting properties we can + // exploit: + // 1) If only one value is used, we can use a DUP, or + // 2) if only the low element is not undef, we can just insert that, or + // 3) if only one constant value is used (w/ some non-constant lanes), + // we can splat the constant value into the whole vector then fill + // in the non-constant lanes. + // 4) FIXME: If different constant values are used, but we can intelligently + // select the values we'll be overwriting for the non-constant + // lanes such that we can directly materialize the vector + // some other way (MOVI, e.g.), we can be sneaky. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; - bool hasDominantValue = false; + bool usesOnlyOneConstantValue = true; bool isConstant = true; - - // Map of the number of times a particular SDValue appears in the - // element list. - DenseMap ValueCounts; + unsigned NumConstantLanes = 0; SDValue Value; + SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -4738,143 +5431,90 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (!isa(V) && !isa(V)) isConstant = false; - ValueCounts.insert(std::make_pair(V, 0)); - unsigned &Count = ValueCounts[V]; + if (isa(V) || isa(V)) { + ++NumConstantLanes; + if (!ConstantValue.getNode()) + ConstantValue = V; + else if (ConstantValue != V) + usesOnlyOneConstantValue = false; + } - // Is this value dominant? 
(takes up more than half of the lanes) - if (++Count > (NumElts / 2)) { - hasDominantValue = true; + if (!Value.getNode()) Value = V; - } + else if (V != Value) + usesOnlyOneValue = false; } - if (ValueCounts.size() != 1) - usesOnlyOneValue = false; - if (!Value.getNode() && ValueCounts.size() > 0) - Value = ValueCounts.begin()->first; - if (ValueCounts.size() == 0) + if (!Value.getNode()) return DAG.getUNDEF(VT); if (isOnlyLowElement) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (hasDominantValue && EltSize <= 64) { - // Use VDUP for non-constant splats. + // Use DUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue) { if (!isConstant) { - SDValue N; - - // If we are DUPing a value that comes directly from a vector, we could - // just use DUPLANE. We can only do this if the lane being extracted - // is at a constant index, as the DUP from lane instructions only have - // constant-index forms. - // - // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can - // remove TRUNCATE for DUPLANE by apdating the source vector to - // appropriate vector type and lane index. - // - // FIXME: for now we have v1i8, v1i16, v1i32 legal vector types, if they - // are not legal any more, no need to check the type size in bits should - // be large than 64. - SDValue V = Value; - if (Value->getOpcode() == ISD::TRUNCATE) - V = Value->getOperand(0); - if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa(V->getOperand(1)) && - V->getOperand(0).getValueType().getSizeInBits() >= 64) { - - // If the element size of source vector is larger than DUPLANE - // element size, we can do transformation by, - // 1) bitcasting source register to smaller element vector - // 2) mutiplying the lane index by SrcEltSize/ResEltSize - // For example, we can lower - // "v8i16 vdup_lane(v4i32, 1)" - // to be - // "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)". 
- SDValue SrcVec = V->getOperand(0); - unsigned SrcEltSize = - SrcVec.getValueType().getVectorElementType().getSizeInBits(); - unsigned ResEltSize = VT.getVectorElementType().getSizeInBits(); - if (SrcEltSize > ResEltSize) { - assert((SrcEltSize % ResEltSize == 0) && "Invalid element size"); - SDValue BitCast; - unsigned SrcSize = SrcVec.getValueType().getSizeInBits(); - unsigned ResSize = VT.getSizeInBits(); - - if (SrcSize > ResSize) { - assert((SrcSize % ResSize == 0) && "Invalid vector size"); - EVT CastVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - SrcSize / ResEltSize); - BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec); - } else { - assert((SrcSize == ResSize) && "Invalid vector size of source vec"); - BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec); - } + if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Value.getValueType() != VT) + return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); - unsigned LaneIdx = V->getConstantOperandVal(1); - SDValue Lane = - DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64); - N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane); - } else { - assert((SrcEltSize == ResEltSize) && - "Invalid element size of source vec"); - N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0), - V->getOperand(1)); - } - } else - N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); - - if (!usesOnlyOneValue) { - // The dominant value was splatted as 'N', but we now have to insert - // all differing elements. - for (unsigned I = 0; I < NumElts; ++I) { - if (Op.getOperand(I) == Value) - continue; - SmallVector Ops; - Ops.push_back(N); - Ops.push_back(Op.getOperand(I)); - Ops.push_back(DAG.getConstant(I, MVT::i64)); - N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Ops); - } - } - return N; + // This is actually a DUPLANExx operation, which keeps everything vectory. + + // DUPLANE works on 128-bit vectors, widen it if necessary. + SDValue Lane = Value.getOperand(1); + Value = Value.getOperand(0); + if (Value.getValueType().getSizeInBits() == 64) + Value = WidenVector(Value, DAG); + + unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); + return DAG.getNode(Opcode, dl, VT, Value, Lane); + } + + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector Ops; + MVT NewType = + (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + Val = LowerBUILD_VECTOR(Val, DAG); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if (usesOnlyOneValue && isConstant) { - return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); + } + + // If there was only one constant value used and for more than one lane, + // start by splatting that value, then replace the non-constant lanes. This + // is better than the default, which will perform a separate initialization + // for each lane. + if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { + SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); + // Now insert the non-constant lanes. 
+ for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + if (!isa(V) && !isa(V)) { + // Note that type legalization likely mucked about with the VT of the + // source operand, so we may have to convert it here before inserting. + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); + } } + return Val; } + // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); - // Try to lower this in lowering ShuffleVector way. - SDValue V0, V1; - int Mask[16]; - if (isKnownShuffleVector(Op, DAG, V0, V1, Mask)) { - unsigned V0NumElts = V0.getValueType().getVectorNumElements(); - if (!V1.getNode() && V0NumElts == NumElts * 2) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, - DAG.getConstant(NumElts, MVT::i64)); - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, - DAG.getConstant(0, MVT::i64)); - V0NumElts = V0.getValueType().getVectorNumElements(); - } - - if (V1.getNode() && NumElts == V0NumElts && - V0NumElts == V1.getValueType().getVectorNumElements()) { - SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) - return Shuffle; - else - return LowerVECTOR_SHUFFLE(Shuffle, DAG); - } else { - SDValue Res; - if (isConcatVector(Op, DAG, V0, V1, Mask, Res)) - return Res; - } + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we @@ -4885,528 +5525,453 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); - for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue Op0 = Op.getOperand(0); + unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); + unsigned i = 0; + // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + // a) Avoid a RMW dependency on the full vector register, and + // b) Allow the register coalescer to fold away the copy if the + // value is already in an S or D register. + if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; + MachineSDNode *N = + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, + DAG.getTargetConstant(SubIdx, MVT::i32)); + Vec = SDValue(N, 0); + ++i; + } + for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } + + // Just use the default expansion. We failed to find a better alternative. return SDValue(); } -/// isREVMask - Check if a vector shuffle corresponds to a REV -/// instruction with the specified blocksize. (The order of the elements -/// within each block of the vector is reversed.) 
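
// Illustrative, self-contained version of the REV-mask test the shuffle
// lowering relies on: within every block of BlockSize bits the element order
// must be reversed, so for v8i8 and BlockSize == 32 the matching mask is
// <3,2,1,0, 7,6,5,4>. The helper name and std::vector parameter are for
// exposition only; the backend's isREVMask takes an ArrayRef<int>.
#include <vector>

static bool looksLikeREVMask(const std::vector<int> &M, unsigned EltSizeInBits,
                             unsigned BlockSizeInBits) {
  if (EltSizeInBits >= BlockSizeInBits)
    return false;
  unsigned BlockElts = BlockSizeInBits / EltSizeInBits;
  for (unsigned i = 0, e = M.size(); i != e; ++i) {
    if (M[i] < 0)
      continue; // undef lanes may match anything
    unsigned Expected = (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts);
    if ((unsigned)M[i] != Expected)
      return false;
  }
  return true;
}
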
-static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); +SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); - if (EltSz == 64) - return false; + // Check for non-constant lane. + if (!isa(Op.getOperand(2))) + return SDValue(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned BlockElts = M[0] + 1; - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSz; + EVT VT = Op.getOperand(0).getValueType(); - if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) - return false; + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; - for (unsigned i = 0; i < NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); - return true; -} + // For V64 types, we perform insertion by expanding the value + // to a V128 type and perform the insertion on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); -// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and -// TRN instruction. -static unsigned isPermuteMask(ArrayRef M, EVT VT, bool isV2undef) { - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts < 4) - return 0; + SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, + Op.getOperand(1), Op.getOperand(2)); + // Re-narrow the resultant vector. + return NarrowVector(Node, DAG); +} - bool ismatch = true; +SDValue +AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); - // Check UZP1 - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i * 2; - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_UZP1; + // Check for non-constant lane. + if (!isa(Op.getOperand(1))) + return SDValue(); - // Check UZP2 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i * 2 + 1; - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_UZP2; + EVT VT = Op.getOperand(0).getValueType(); - // Check ZIP1 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i / 2 + NumElts * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_ZIP1; + // Insertion/extraction are legal for V128 types. 
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; - // Check ZIP2 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_ZIP2; + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); - // Check TRN1 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i + (NumElts - 1) * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_TRN1; + // For V64 types, we perform extraction by expanding the value + // to a V128 type and perform the extraction on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); - // Check TRN2 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = 1 + i + (NumElts - 1) * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_TRN2; + EVT ExtrTy = WideTy.getVectorElementType(); + if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) + ExtrTy = MVT::i32; - return 0; + // For extractions, we just return the result directly. + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, + Op.getOperand(1)); } -SDValue -AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, - SelectionDAG &DAG) const { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); +SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getOperand(0).getValueType(); SDLoc dl(Op); - EVT VT = Op.getValueType(); - ShuffleVectorSDNode *SVN = cast(Op.getNode()); - - // Convert shuffles that are directly supported on NEON to target-specific - // DAG nodes, instead of keeping them as shuffles and matching them again - // during code selection. This is more efficient and avoids the possibility - // of inconsistencies between legalization and selection. - ArrayRef ShuffleMask = SVN->getMask(); - - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (EltSize > 64) + // Just in case... 
+ if (!VT.isVector()) return SDValue(); - if (isREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1); - if (isREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1); - if (isREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1); - - unsigned ISDNo; - if (V2.getOpcode() == ISD::UNDEF) - ISDNo = isPermuteMask(ShuffleMask, VT, true); - else - ISDNo = isPermuteMask(ShuffleMask, VT, false); - - if (ISDNo) { - if (V2.getOpcode() == ISD::UNDEF) - return DAG.getNode(ISDNo, dl, VT, V1, V1); - else - return DAG.getNode(ISDNo, dl, VT, V1, V2); - } - - SDValue Res; - if (isConcatVector(Op, DAG, V1, V2, &ShuffleMask[0], Res)) - return Res; - - // If the element of shuffle mask are all the same constant, we can - // transform it into either NEON_VDUP or NEON_VDUPLANE - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; - - // Test if V1 is a SCALAR_TO_VECTOR. - if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0)); - } - // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR. - if (V1.getOpcode() == ISD::BUILD_VECTOR) { - bool IsScalarToVector = true; - for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i) - if (V1.getOperand(i).getOpcode() != ISD::UNDEF && - i != (unsigned)Lane) { - IsScalarToVector = false; - break; - } - if (IsScalarToVector) - return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, - V1.getOperand(Lane)); - } - - // Test if V1 is a EXTRACT_SUBVECTOR. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - int ExtLane = cast(V1.getOperand(1))->getZExtValue(); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0), - DAG.getConstant(Lane + ExtLane, MVT::i64)); - } - // Test if V1 is a CONCAT_VECTORS. - if (V1.getOpcode() == ISD::CONCAT_VECTORS && - V1.getOperand(1).getOpcode() == ISD::UNDEF) { - SDValue Op0 = V1.getOperand(0); - assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() && - "Invalid vector lane access"); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0, - DAG.getConstant(Lane, MVT::i64)); + ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); + if (!Cst) + return SDValue(); + unsigned Val = Cst->getZExtValue(); + + unsigned Size = Op.getValueType().getSizeInBits(); + if (Val == 0) { + switch (Size) { + case 8: + return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 16: + return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 32: + return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), + Op.getOperand(0)); + case 64: + return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), + Op.getOperand(0)); + default: + llvm_unreachable("Unexpected vector type in extract_subvector!"); } - - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i64)); } + // If this is extracting the upper 64-bits of a 128-bit vector, we match + // that directly. 
+ if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) + return Op; - int Length = ShuffleMask.size(); - int V1EltNum = V1.getValueType().getVectorNumElements(); + return SDValue(); +} - // If the number of v1 elements is the same as the number of shuffle mask - // element and the shuffle masks are sequential values, we can transform - // it into NEON_VEXTRACT. - if (V1EltNum == Length) { - // Check if the shuffle mask is sequential. - int SkipUndef = 0; - while (ShuffleMask[SkipUndef] == -1) { - SkipUndef++; - } - int CurMask = ShuffleMask[SkipUndef]; - if (CurMask >= SkipUndef) { - bool IsSequential = true; - for (int I = SkipUndef; I < Length; ++I) { - if (ShuffleMask[I] != -1 && ShuffleMask[I] != CurMask) { - IsSequential = false; - break; - } - CurMask++; - } - if (IsSequential) { - assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect"); - unsigned VecSize = EltSize * V1EltNum; - unsigned Index = (EltSize / 8) * (ShuffleMask[SkipUndef] - SkipUndef); - if (VecSize == 64 || VecSize == 128) - return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2, - DAG.getConstant(Index, MVT::i64)); - } +bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; } - } - // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert - // by element from V2 to V1 . - // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a - // better choice to be inserted than V1 as less insert needed, so we count - // element to be inserted for both V1 and V2, and select less one as insert - // target. - - // Collect elements need to be inserted and their index. - SmallVector NV1Elt; - SmallVector N1Index; - SmallVector NV2Elt; - SmallVector N2Index; - for (int I = 0; I != Length; ++I) { - if (ShuffleMask[I] != I) { - NV1Elt.push_back(ShuffleMask[I]); - N1Index.push_back(I); - } - } - for (int I = 0; I != Length; ++I) { - if (ShuffleMask[I] != (I + V1EltNum)) { - NV2Elt.push_back(ShuffleMask[I]); - N2Index.push_back(I); - } - } + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); - // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2 - // will be inserted. - SDValue InsV = V1; - SmallVector InsMasks = NV1Elt; - SmallVector InsIndex = N1Index; - if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) { - if (NV1Elt.size() > NV2Elt.size()) { - InsV = V2; - InsMasks = NV2Elt; - InsIndex = N2Index; - } - } else { - InsV = DAG.getNode(ISD::UNDEF, dl, VT); + if (Cost <= 4) + return true; } - for (int I = 0, E = InsMasks.size(); I != E; ++I) { - SDValue ExtV = V1; - int Mask = InsMasks[I]; - if (Mask >= V1EltNum) { - ExtV = V2; - Mask -= V1EltNum; - } - // Any value type smaller than i32 is illegal in AArch64, and this lower - // function is called after legalize pass, so we need to legalize - // the result here. - EVT EltVT; - if (VT.getVectorElementType().isFloatingPoint()) - EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32; - else - EltVT = (EltSize == 64) ? 
MVT::i64 : MVT::i32; - - if (Mask >= 0) { - ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, - DAG.getConstant(Mask, MVT::i64)); - InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, - DAG.getConstant(InsIndex[I], MVT::i64)); - } - } - return InsV; + bool DummyBool; + int DummyInt; + unsigned DummyUnsigned; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || + isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || + isEXTMask(M, VT, DummyBool, DummyUnsigned) || + // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. + isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || + isZIPMask(M, VT, DummyUnsigned) || + isTRN_v_undef_Mask(M, VT, DummyUnsigned) || + isUZP_v_undef_Mask(M, VT, DummyUnsigned) || + isZIP_v_undef_Mask(M, VT, DummyUnsigned) || + isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || + isConcatMask(M, VT, VT.getSizeInBits() == 128)); } -AArch64TargetLowering::ConstraintType -AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: break; - case 'w': // An FP/SIMD vector register - return C_RegisterClass; - case 'I': // Constant that can be used with an ADD instruction - case 'J': // Constant that can be used with a SUB instruction - case 'K': // Constant that can be used with a 32-bit logical instruction - case 'L': // Constant that can be used with a 64-bit logical instruction - case 'M': // Constant that can be used as a 32-bit MOV immediate - case 'N': // Constant that can be used as a 64-bit MOV immediate - case 'Y': // Floating point constant zero - case 'Z': // Integer constant zero - return C_Other; - case 'Q': // A memory reference with base register and no offset - return C_Memory; - case 'S': // A symbolic address - return C_Other; - } - } +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} - // FIXME: Ump, Utf, Usa, Ush - // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes, - // whatever they may be - // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be - // Usa: An absolute symbolic address - // Ush: The high part (bits 32:12) of a pc-relative symbolic address - assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa" - && Constraint != "Ush" && "Unimplemented constraints"); +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. 
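
// Illustrative restatement of the two immediate ranges enforced by the shift
// helpers here: a left shift allows 0 <= Cnt < EltBits (a "long" left shift
// also allows Cnt == EltBits), and a right shift allows 1 <= Cnt <= EltBits
// (only up to EltBits/2 for the narrowing forms). Helper names are for
// exposition only.
#include <cstdint>

static bool vshiftLImmInRange(int64_t Cnt, unsigned EltBits, bool IsLong) {
  return Cnt >= 0 && (IsLong ? Cnt - 1 : Cnt) < (int64_t)EltBits;
}

static bool vshiftRImmInRange(int64_t Cnt, unsigned EltBits, bool IsNarrow) {
  return Cnt >= 1 && Cnt <= (int64_t)(IsNarrow ? EltBits / 2 : EltBits);
}
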
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} - return TargetLowering::getConstraintType(Constraint); +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } -TargetLowering::ConstraintWeight -AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info, - const char *Constraint) const { +SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + int64_t Cnt; - llvm_unreachable("Constraint weight unimplemented"); -} + if (!Op.getOperand(1).getValueType().isVector()) + return Op; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); -void -AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const { - SDValue Result; + switch (Op.getOpcode()) { + default: + llvm_unreachable("unexpected shift opcode"); - // Only length 1 constraints are C_Other. - if (Constraint.size() != 1) return; + case ISD::SHL: + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) + return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); + case ISD::SRA: + case ISD::SRL: + // Right shift immediate + if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && + Cnt < EltSize) { + unsigned Opc = + (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; + return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } - // Only C_Other constraints get lowered like this. That means constants for us - // so return early if there's no hope the constraint can be lowered. + // Right shift register. Note, there is not a shift right register + // instruction, but the shift left register instruction takes a signed + // value, where negative numbers specify a right shift. + unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::aarch64_neon_sshl + : Intrinsic::aarch64_neon_ushl; + // negate the shift amount + SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); + SDValue NegShiftLeft = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); + return NegShiftLeft; + } - switch(Constraint[0]) { - default: break; - case 'I': case 'J': case 'K': case 'L': - case 'M': case 'N': case 'Z': { - ConstantSDNode *C = dyn_cast(Op); - if (!C) - return; + return SDValue(); +} - uint64_t CVal = C->getZExtValue(); - uint32_t Bits; +static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, + AArch64CC::CondCode CC, bool NoNans, EVT VT, + SDLoc dl, SelectionDAG &DAG) { + EVT SrcVT = LHS.getValueType(); - switch (Constraint[0]) { + BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); + bool IsZero = IsCnst && (CnstBits == 0); + + if (SrcVT.getVectorElementType().isFloatingPoint()) { + switch (CC) { default: - // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J' - // is a peculiarly useless SUB constraint. - llvm_unreachable("Unimplemented C_Other constraint"); - case 'I': - if (CVal <= 0xfff) - break; - return; - case 'K': - if (A64Imms::isLogicalImm(32, CVal, Bits)) - break; - return; - case 'L': - if (A64Imms::isLogicalImm(64, CVal, Bits)) - break; - return; - case 'Z': - if (CVal == 0) - break; - return; + return SDValue(); + case AArch64CC::NE: { + SDValue Fcmeq; + if (IsZero) + Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + else + Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); } - - Result = DAG.getTargetConstant(CVal, Op.getValueType()); - break; - } - case 'S': { - // An absolute symbolic address or label reference. - if (const GlobalAddressSDNode *GA = dyn_cast(Op)) { - Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), - GA->getValueType(0)); - } else if (const BlockAddressSDNode *BA - = dyn_cast(Op)) { - Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), - BA->getValueType(0)); - } else if (const ExternalSymbolSDNode *ES - = dyn_cast(Op)) { - Result = DAG.getTargetExternalSymbol(ES->getSymbol(), - ES->getValueType(0)); - } else - return; - break; - } - case 'Y': - if (const ConstantFPSDNode *CFP = dyn_cast(Op)) { - if (CFP->isExactlyValue(0.0)) { - Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0)); - break; - } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); + case AArch64CC::LS: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (!NoNans) + return SDValue(); + // If we ignore NaNs then we can use to the MI implementation. + // Fallthrough. 
+ case AArch64CC::MI: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } - return; } - if (Result.getNode()) { - Ops.push_back(Result); - return; + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Cmeq; + if (IsZero) + Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + else + Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); + case AArch64CC::LE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); + case AArch64CC::LS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); + case AArch64CC::LO: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); + case AArch64CC::HI: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); + case AArch64CC::HS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); } - - // It's an unknown constraint for us. Let generic code have a go. - TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -std::pair -AArch64TargetLowering::getRegForInlineAsmConstraint( - const std::string &Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'r': - if (VT.getSizeInBits() <= 32) - return std::make_pair(0U, &AArch64::GPR32RegClass); - else if (VT == MVT::i64) - return std::make_pair(0U, &AArch64::GPR64RegClass); - break; - case 'w': - if (VT == MVT::f16) - return std::make_pair(0U, &AArch64::FPR16RegClass); - else if (VT == MVT::f32) - return std::make_pair(0U, &AArch64::FPR32RegClass); - else if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &AArch64::FPR64RegClass); - else if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &AArch64::FPR128RegClass); - break; - } +SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); + + if (LHS.getValueType().getVectorElementType().isInteger()) { + assert(LHS.getValueType() == RHS.getValueType()); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(), + dl, DAG); } - // Use the default implementation in TargetLowering to convert the register - // constraint into a member of a register class. - return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || + LHS.getValueType().getVectorElementType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. 
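
// Illustrative scalar analogy for why a second condition can be required (the
// concrete condition pairs come from changeVectorFPCCToAArch64CC and are not
// reproduced here): IEEE "ordered and not equal", for example, has no single
// AArch64 condition, but it is exactly (a > b) || (a < b), which is false
// whenever an operand is NaN. The code below mirrors this by emitting a second
// comparison and ORing the results whenever CC2 != AL.
static bool orderedNotEqual(double a, double b) {
  return (a > b) || (a < b); // false for NaN operands and for equal values
}
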
+ AArch64CC::CondCode CC1, CC2; + bool ShouldInvert; + changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); + + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; + SDValue Cmp = + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp.getNode()) + return SDValue(); + + if (CC2 != AArch64CC::AL) { + SDValue Cmp2 = + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp2.getNode()) + return SDValue(); + + Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); + } + + if (ShouldInvert) + return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); + + return Cmp; } -/// Represent NEON load and store intrinsics as MemIntrinsicNodes. -/// The associated MachineMemOperands record the alignment specified -/// in the intrinsic calls. +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { switch (Intrinsic) { - case Intrinsic::arm_neon_vld1: - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::aarch64_neon_vld1x2: - case Intrinsic::aarch64_neon_vld1x3: - case Intrinsic::aarch64_neon_vld1x4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(0); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast(AlignArg)->getZExtValue(); + Info.align = 0; Info.vol = false; // volatile loads with NEON intrinsics not supported Info.readMem = true; Info.writeMem = false; return true; } - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::aarch64_neon_vst1x2: - case Intrinsic::aarch64_neon_vst1x3: - case Intrinsic::aarch64_neon_vst1x4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: { + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. 
unsigned NumElts = 0; @@ -5417,41 +5982,85 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(0); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast(AlignArg)->getZExtValue(); + Info.align = 0; Info.vol = false; // volatile stores with NEON intrinsics not supported Info.readMem = false; Info.writeMem = true; return true; } - default: - break; + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: { + PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; } - - return false; -} - -// Truncations from 64-bit GPR to 32-bit GPR is free. -bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: { + PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +} + +// Truncations from 64-bit GPR to 32-bit GPR is free. 
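// Illustrative sketch (editorial; hypothetical C shown for clarity): on
// AArch64 such a truncation needs no instruction at all, because the low
// 32 bits of an X register are read through the corresponding W register:
//   uint32_t trunc64to32(uint64_t X) { return (uint32_t)X; }  // no code emitted
// which is why the hooks below report these truncations as free.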
+bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; + return NumBits1 > NumBits2; } - bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) + if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; + return NumBits1 > NumBits2; } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -5461,19 +6070,14 @@ bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; + return NumBits1 == 32 && NumBits2 == 64; } - bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) + if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; + return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { @@ -5486,14 +6090,77 @@ bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. - return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && - VT2.isInteger() && VT1.getSizeInBits() <= 32); + return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && + VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && + VT1.getSizeInBits() <= 32); +} + +bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType.isSimple() || + (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType.getSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} + +EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // Don't use AdvSIMD to implement 16-byte memset. It would have taken one + // instruction to materialize the v2i64 zero and one store (with restrictive + // addressing mode). Just do two i64 store of zero-registers. 
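// Illustrative sketch (editorial): for a zeroing 16-byte memset this means
// emitting two 64-bit stores of XZR, e.g.
//   str xzr, [x0] ; str xzr, [x0, #8]
// (which the load/store optimizer may later pair into a single stp), instead
// of first materializing a v2i64 zero in a SIMD register.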
+ bool Fast; + const Function *F = MF.getFunction(); + if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && + !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat) && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) + return MVT::f128; + + return Size >= 8 ? MVT::i64 : MVT::i32; +} + +// 12-bit optionally shifted immediates are legal for adds. +bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { + if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) + return true; + return false; +} + +// Integer comparisons are implemented with ADDS/SUBS, so the range of valid +// immediates is the same as for an add or a sub. +bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { + if (Immed < 0) + Immed *= -1; + return isLegalAddImmediate(Immed); } -// isLegalAddressingMode - Return true if the addressing mode represented +/// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { + Type *Ty) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset @@ -5534,6 +6201,9 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, return true; return false; } + + // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 + if (!AM.Scale || AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) return true; @@ -5541,7 +6211,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, } int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { + Type *Ty) const { // Scaling factors are not free at all. // Operands | Rt Latency // ------------------------------------------- @@ -5556,9 +6226,1711 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, return -1; } -/// getMaximalGlobalOffset - Returns the maximal possible offset which can -/// be used for loads / stores from the global. -unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { - return 4095; +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +const MCPhysReg * +AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. + static const MCPhysReg ScratchRegs[] = { + AArch64::X16, AArch64::X17, AArch64::LR, 0 + }; + return ScratchRegs; +} + +bool +AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { + EVT VT = N->getValueType(0); + // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine + // it with shift to let it be lowered to UBFX. 
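// Illustrative sketch (editorial; hypothetical C shown for clarity): the
// guarded pattern is an unsigned bit-field extract such as
//   uint32_t field(uint32_t X) { return (X >> 4) & 0xff; }
// which is expected to select to a single UBFX; commuting the AND with the
// shift would hide that pattern from instruction selection.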
+ if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && + isa(N->getOperand(1))) { + uint64_t TruncMask = N->getConstantOperandVal(1); + if (isMask_64(TruncMask) && + N->getOperand(0).getOpcode() == ISD::SRL && + isa(N->getOperand(0)->getOperand(1))) + return false; + } + return true; +} + +bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return false; + + int64_t Val = Imm.getSExtValue(); + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) + return true; + + if ((int64_t)Val < 0) + Val = ~Val; + if (BitSize == 32) + Val &= (1LL << 32) - 1; + + unsigned LZ = countLeadingZeros((uint64_t)Val); + unsigned Shift = (63 - LZ) / 16; + // MOVZ is free so return true for one or fewer MOVK. + return (Shift < 3) ? true : false; +} + +// Generate SUBS and CSEL for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CSEL. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + N0.getOperand(0)); + // Generate SUBS & CSEL. + SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), + N0.getOperand(0), DAG.getConstant(0, VT)); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, + DAG.getConstant(AArch64CC::PL, MVT::i32), + SDValue(Cmp.getNode(), 1)); + } + return SDValue(); +} + +// performXorCombine - Attempts to handle integer ABS. +static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + return performIntegerAbsCombine(N, DAG); +} + +static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Multiplication of a power of two plus/minus one can be done more + // cheaply as as shift+add/sub. For now, this is true unilaterally. If + // future CPUs have a cheaper MADD instruction, this may need to be + // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and + // 64-bit is 5 cycles, so this is always a win. + if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { + APInt Value = C->getAPIntValue(); + EVT VT = N->getValueType(0); + APInt VM1 = Value - 1; + if (VM1.isPowerOf2()) { + // Multiplying by one more than a power of two, replace with a shift + // and an add. + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + } + APInt VP1 = Value + 1; + if (VP1.isPowerOf2()) { + // Multiplying by one less than a power of two, replace with a shift + // and a subtract. 
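// Illustrative sketch (editorial): the two rewrites in this combine amount to
//   X * 9 == (X << 3) + X   // 2^3 + 1, handled above
//   X * 7 == (X << 3) - X   // 2^3 - 1, handled below
// trading the constant multiply for a shift plus an add or sub.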
+ SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + } + } + return SDValue(); +} + +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + // Only optimize when the source and destination types have the same width. + if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) + return SDValue(); + + // If the result of an integer load is only used by an integer-to-float + // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. + // This eliminates an "integer-to-vector-move UOP and improve throughput. + SDValue N0 = N->getOperand(0); + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + // Do not change the width of a volatile load. + !cast(N0)->isVolatile()) { + LoadSDNode *LN0 = cast(N0); + SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), + LN0->getPointerInfo(), LN0->isVolatile(), + LN0->isNonTemporal(), LN0->isInvariant(), + LN0->getAlignment()); + + // Make sure successors of the original load stay after it by updating them + // to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); + + unsigned Opcode = + (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; + return DAG.getNode(Opcode, SDLoc(N), VT, Load); + } + + return SDValue(); +} + +/// An EXTR instruction is made up of two shifts, ORed together. This helper +/// searches for and classifies those shifts. +static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, + bool &FromHi) { + if (N.getOpcode() == ISD::SHL) + FromHi = false; + else if (N.getOpcode() == ISD::SRL) + FromHi = true; + else + return false; + + if (!isa(N.getOperand(1))) + return false; + + ShiftAmount = N->getConstantOperandVal(1); + Src = N->getOperand(0); + return true; +} + +/// EXTR instruction extracts a contiguous chunk of bits from two existing +/// registers viewed as a high/low pair. This function looks for the pattern: +/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an +/// EXTR. Can't quite be done in TableGen because the two immediates aren't +/// independent. +static SDValue tryCombineToEXTR(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + EVT VT = N->getValueType(0); + + assert(N->getOpcode() == ISD::OR && "Unexpected root"); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue LHS; + uint32_t ShiftLHS = 0; + bool LHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) + return SDValue(); + + SDValue RHS; + uint32_t ShiftRHS = 0; + bool RHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) + return SDValue(); + + // If they're both trying to come from the high part of the register, they're + // not really an EXTR. 
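// Illustrative sketch (editorial; hypothetical C shown for clarity): the
// pattern being matched is an extract from a register pair viewed as high:low,
// e.g. for i32
//   uint32_t extr12(uint32_t Hi, uint32_t Lo) { return (Hi << 20) | (Lo >> 12); }
// which can become a single EXTR with shift amount 12, provided exactly one
// operand supplies the high half -- hence the check below.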
+ if (LHSFromHi == RHSFromHi) + return SDValue(); + + if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) + return SDValue(); + + if (LHSFromHi) { + std::swap(LHS, RHS); + std::swap(ShiftLHS, ShiftRHS); + } + + return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, + DAG.getConstant(ShiftRHS, MVT::i64)); +} + +static SDValue tryCombineToBSL(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getVectorElementType().getSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + +static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) + if (!EnableAArch64ExtrGeneration) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDValue Res = tryCombineToEXTR(N, DCI); + if (Res.getNode()) + return Res; + + Res = tryCombineToBSL(N, DCI); + if (Res.getNode()) + return Res; + + return SDValue(); +} + +static SDValue performBitcastCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Remove extraneous bitcasts around an extract_subvector. + // For example, + // (v4i16 (bitconvert + // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) + // becomes + // (extract_subvector ((v8i16 ...), (i64 4))) + + // Only interested in 64-bit vectors as the ultimate result. + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + if (VT.getSimpleVT().getSizeInBits() != 64) + return SDValue(); + // Is the operand an extract_subvector starting at the beginning or halfway + // point of the vector? A low half may also come through as an + // EXTRACT_SUBREG, so look for that, too. 
+ SDValue Op0 = N->getOperand(0); + if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && + !(Op0->isMachineOpcode() && + Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) + return SDValue(); + uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); + if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { + if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) + return SDValue(); + } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { + if (idx != AArch64::dsub) + return SDValue(); + // The dsub reference is equivalent to a lane zero subvector reference. + idx = 0; + } + // Look through the bitcast of the input to the extract. + if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue Source = Op0->getOperand(0)->getOperand(0); + // If the source type has twice the number of elements as our destination + // type, we know this is an extract of the high or low half of the vector. + EVT SVT = Source->getValueType(0); + if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); + + // Create the simplified form to just extract the low or high half of the + // vector directly rather than bothering with the bitcasts. + SDLoc dl(N); + unsigned NumElements = VT.getVectorNumElements(); + if (idx) { + SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); + } else { + SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, + Source, SubReg), + 0); + } } +static SDValue performConcatVectorsCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector + // splat. The indexed instructions are going to be expecting a DUPLANE64, so + // canonicalise to that. + if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { + assert(VT.getVectorElementType().getSizeInBits() == 64); + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, + WidenVector(N->getOperand(0), DAG), + DAG.getConstant(0, MVT::i64)); + } + + // Canonicalise concat_vectors so that the right-hand vector has as few + // bit-casts as possible before its real operation. The primary matching + // destination for these operations will be the narrowing "2" instructions, + // which depend on the operation being performed on this right-hand vector. + // For example, + // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) + // becomes + // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) + + SDValue Op1 = N->getOperand(1); + if (Op1->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue RHS = Op1->getOperand(0); + MVT RHSTy = RHS.getValueType().getSimpleVT(); + // If the RHS is not a vector, this is not the pattern we're looking for. 
+ if (!RHSTy.isVector()) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); + + MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), + RHSTy.getVectorNumElements() * 2); + return DAG.getNode( + ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); +} + +static SDValue tryCombineFixedPointConvert(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + // Transform a scalar conversion of a value from a lane extract into a + // lane extract of a vector conversion. E.g., from foo1 to foo2: + // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } + // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } + // + // The second form interacts better with instruction selection and the + // register allocator to avoid cross-class register copies that aren't + // coalescable due to a lane reference. + + // Check the operand and see if it originates from a lane extract. + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // Yep, no additional predication needed. Perform the transform. + SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); + + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). + assert(Vec.getValueType().getSizeInBits() == 128 && + "unexpected vector size on extract_vector_elt!"); + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + llvm_unreachable("unexpected vector type!"); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); + } + return SDValue(); +} + +// AArch64 high-vector "long" operations are formed by performing the non-high +// version on an extract_subvector of each operand which gets the high half: +// +// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) +// +// However, there are cases which don't have an extract_high explicitly, but +// have another operation that can be made compatible with one for free. For +// example: +// +// (dupv64 scalar) --> (extract_high (dup128 scalar)) +// +// This routine does the actual conversion of such DUPs, once outer routines +// have determined that everything else is in order. +static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + // We can handle most types of duplicate, but the lane ones have an extra + // operand saying *which* lane, so we need to know. 
+ bool IsDUPLANE; + switch (N.getOpcode()) { + case AArch64ISD::DUP: + IsDUPLANE = false; + break; + case AArch64ISD::DUPLANE8: + case AArch64ISD::DUPLANE16: + case AArch64ISD::DUPLANE32: + case AArch64ISD::DUPLANE64: + IsDUPLANE = true; + break; + default: + return SDValue(); + } + + MVT NarrowTy = N.getSimpleValueType(); + if (!NarrowTy.is64BitVector()) + return SDValue(); + + MVT ElementTy = NarrowTy.getVectorElementType(); + unsigned NumElems = NarrowTy.getVectorNumElements(); + MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); + + SDValue NewDUP; + if (IsDUPLANE) + NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), + N.getOperand(1)); + else + NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, + NewDUP, DAG.getConstant(NumElems, MVT::i64)); +} + +static bool isEssentiallyExtractSubvector(SDValue N) { + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) + return true; + + return N.getOpcode() == ISD::BITCAST && + N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; +} + +/// \brief Helper structure to keep track of ISD::SET_CC operands. +struct GenericSetCCInfo { + const SDValue *Opnd0; + const SDValue *Opnd1; + ISD::CondCode CC; +}; + +/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. +struct AArch64SetCCInfo { + const SDValue *Cmp; + AArch64CC::CondCode CC; +}; + +/// \brief Helper structure to keep track of SetCC information. +union SetCCInfo { + GenericSetCCInfo Generic; + AArch64SetCCInfo AArch64; +}; + +/// \brief Helper structure to be able to read SetCC information. If set to +/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a +/// GenericSetCCInfo. +struct SetCCInfoAndKind { + SetCCInfo Info; + bool IsAArch64; +}; + +/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or +/// an +/// AArch64 lowered one. +/// \p SetCCInfo is filled accordingly. +/// \post SetCCInfo is meanginfull only when this function returns true. +/// \return True when Op is a kind of SET_CC operation. +static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { + // If this is a setcc, this is straight forward. + if (Op.getOpcode() == ISD::SETCC) { + SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); + SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); + SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); + SetCCInfo.IsAArch64 = false; + return true; + } + // Otherwise, check if this is a matching csel instruction. + // In other words: + // - csel 1, 0, cc + // - csel 0, 1, !cc + if (Op.getOpcode() != AArch64ISD::CSEL) + return false; + // Set the information about the operands. + // TODO: we want the operands of the Cmp not the csel + SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); + SetCCInfo.IsAArch64 = true; + SetCCInfo.Info.AArch64.CC = static_cast( + cast(Op.getOperand(2))->getZExtValue()); + + // Check that the operands matches the constraints: + // (1) Both operands must be constants. + // (2) One must be 1 and the other must be 0. + ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); + ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); + + // Check (1). + if (!TValue || !FValue) + return false; + + // Check (2). + if (!TValue->isOne()) { + // Update the comparison when we are interested in !cc. 
+ std::swap(TValue, FValue); + SetCCInfo.Info.AArch64.CC = + AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); + } + return TValue->isOne() && FValue->isNullValue(); +} + +// Returns true if Op is setcc or zext of setcc. +static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { + if (isSetCC(Op, Info)) + return true; + return ((Op.getOpcode() == ISD::ZERO_EXTEND) && + isSetCC(Op->getOperand(0), Info)); +} + +// The folding we want to perform is: +// (add x, [zext] (setcc cc ...) ) +// --> +// (csel x, (add x, 1), !cc ...) +// +// The latter will get matched to a CSINC instruction. +static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { + assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); + SDValue LHS = Op->getOperand(0); + SDValue RHS = Op->getOperand(1); + SetCCInfoAndKind InfoAndKind; + + // If neither operand is a SET_CC, give up. + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { + std::swap(LHS, RHS); + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) + return SDValue(); + } + + // FIXME: This could be generatized to work for FP comparisons. + EVT CmpVT = InfoAndKind.IsAArch64 + ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() + : InfoAndKind.Info.Generic.Opnd0->getValueType(); + if (CmpVT != MVT::i32 && CmpVT != MVT::i64) + return SDValue(); + + SDValue CCVal; + SDValue Cmp; + SDLoc dl(Op); + if (InfoAndKind.IsAArch64) { + CCVal = DAG.getConstant( + AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32); + Cmp = *InfoAndKind.Info.AArch64.Cmp; + } else + Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, + *InfoAndKind.Info.Generic.Opnd1, + ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), + CCVal, DAG, dl); + + EVT VT = Op->getValueType(0); + LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); +} + +// The basic add/sub long vector instructions have variants with "2" on the end +// which act on the high-half of their inputs. They are normally matched by +// patterns like: +// +// (add (zeroext (extract_high LHS)), +// (zeroext (extract_high RHS))) +// -> uaddl2 vD, vN, vM +// +// However, if one of the extracts is something like a duplicate, this +// instruction can still be used profitably. This function puts the DAG into a +// more appropriate form for those patterns to trigger. +static SDValue performAddSubLongCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); + if (!VT.is128BitVector()) { + if (N->getOpcode() == ISD::ADD) + return performSetccAddFolding(N, DAG); + return SDValue(); + } + + // Make sure both branches are extended in the same way. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if ((LHS.getOpcode() != ISD::ZERO_EXTEND && + LHS.getOpcode() != ISD::SIGN_EXTEND) || + LHS.getOpcode() != RHS.getOpcode()) + return SDValue(); + + unsigned ExtType = LHS.getOpcode(); + + // It's not worth doing if at least one of the inputs isn't already an + // extract, but we don't know which it'll be so we have to try both. 
+ if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); + if (!RHS.getNode()) + return SDValue(); + + RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); + } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); + if (!LHS.getNode()) + return SDValue(); + + LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); + } + + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); +} + +// Massage DAGs which we can use the high-half "long" operations on into +// something isel will recognize better. E.g. +// +// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> +// (aarch64_neon_umull (extract_high (v2i64 vec))) +// (extract_high (v2i64 (dup128 scalar))))) +// +static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + assert(LHS.getValueType().is64BitVector() && + RHS.getValueType().is64BitVector() && + "unexpected shape for long operation"); + + // Either node could be a DUP, but it's not worth doing both of them (you'd + // just as well use the non-high version) so look for a corresponding extract + // operation on the other "wing". + if (isEssentiallyExtractSubvector(LHS)) { + RHS = tryExtendDUPToExtractHigh(RHS, DAG); + if (!RHS.getNode()) + return SDValue(); + } else if (isEssentiallyExtractSubvector(RHS)) { + LHS = tryExtendDUPToExtractHigh(LHS, DAG); + if (!LHS.getNode()) + return SDValue(); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), + N->getOperand(0), LHS, RHS); +} + +static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { + MVT ElemTy = N->getSimpleValueType(0).getScalarType(); + unsigned ElemBits = ElemTy.getSizeInBits(); + + int64_t ShiftAmount; + if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs, ElemBits) || + SplatBitSize != ElemBits) + return SDValue(); + + ShiftAmount = SplatValue.getSExtValue(); + } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { + ShiftAmount = CVN->getSExtValue(); + } else + return SDValue(); + + unsigned Opcode; + bool IsRightShift; + switch (IID) { + default: + llvm_unreachable("Unknown shift intrinsic"); + case Intrinsic::aarch64_neon_sqshl: + Opcode = AArch64ISD::SQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_uqshl: + Opcode = AArch64ISD::UQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_srshl: + Opcode = AArch64ISD::SRSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_urshl: + Opcode = AArch64ISD::URSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_sqshlu: + Opcode = AArch64ISD::SQSHLU_I; + IsRightShift = false; + break; + } + + if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(-ShiftAmount, MVT::i32)); + else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(ShiftAmount, MVT::i32)); + + return SDValue(); +} + +// The CRC32[BH] instructions ignore the high bits of their data operand. 
Since +// the intrinsics must be legal and take an i32, this means there's almost +// certainly going to be a zext in the DAG which we can eliminate. +static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { + SDValue AndN = N->getOperand(2); + if (AndN.getOpcode() != ISD::AND) + return SDValue(); + + ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); + if (!CMask || CMask->getZExtValue() != Mask) + return SDValue(); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, + N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); +} + +static SDValue performIntrinsicCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + unsigned IID = getIntrinsicID(N); + switch (IID) { + default: + break; + case Intrinsic::aarch64_neon_vcvtfxs2fp: + case Intrinsic::aarch64_neon_vcvtfxu2fp: + return tryCombineFixedPointConvert(N, DCI, DAG); + break; + case Intrinsic::aarch64_neon_fmax: + return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmin: + return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_sqdmull: + return tryCombineLongOpWithDup(IID, N, DCI, DAG); + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_urshl: + return tryCombineShiftImm(IID, N, DAG); + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + return tryCombineCRC32(0xff, N, DAG); + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + return tryCombineCRC32(0xffff, N, DAG); + } + return SDValue(); +} + +static SDValue performExtendCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then + // we can convert that DUP into another extract_high (of a bigger DUP), which + // helps the backend to decide that an sabdl2 would be useful, saving a real + // extract_high operation. + if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + SDNode *ABDNode = N->getOperand(0).getNode(); + unsigned IID = getIntrinsicID(ABDNode); + if (IID == Intrinsic::aarch64_neon_sabd || + IID == Intrinsic::aarch64_neon_uabd) { + SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); + if (!NewABD.getNode()) + return SDValue(); + + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), + NewABD); + } + } + + // This is effectively a custom type legalization for AArch64. + // + // Type legalization will split an extend of a small, legal, type to a larger + // illegal type by first splitting the destination type, often creating + // illegal source types, which then get legalized in isel-confusing ways, + // leading to really terrible codegen. E.g., + // %result = v8i32 sext v8i8 %value + // becomes + // %losrc = extract_subreg %value, ... + // %hisrc = extract_subreg %value, ... + // %lo = v4i32 sext v4i8 %losrc + // %hi = v4i32 sext v4i8 %hisrc + // Things go rapidly downhill from there. 
+ // + // For AArch64, the [sz]ext vector instructions can only go up one element + // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 + // take two instructions. + // + // This implies that the most efficient way to do the extend from v8i8 + // to two v4i32 values is to first extend the v8i8 to v8i16, then do + // the normal splitting to happen for the v8i16->v8i32. + + // This is pre-legalization to catch some cases where the default + // type legalization will create ill-tempered code. + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + // We're only interested in cleaning things up for non-legal vector types + // here. If both the source and destination are legal, things will just + // work naturally without any fiddling. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) + return SDValue(); + // If the vector type isn't a simple VT, it's beyond the scope of what + // we're worried about here. Let legalization do its thing and hope for + // the best. + if (!ResVT.isSimple()) + return SDValue(); + + SDValue Src = N->getOperand(0); + MVT SrcVT = Src->getValueType(0).getSimpleVT(); + // If the source VT is a 64-bit vector, we can play games and get the + // better results we want. + if (SrcVT.getSizeInBits() != 64) + return SDValue(); + + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned ElementCount = SrcVT.getVectorNumElements(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + SDLoc DL(N); + Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); + + // Now split the rest of the operation into two halves, each with a 64 + // bit source. + EVT LoVT, HiVT; + SDValue Lo, Hi; + unsigned NumElements = ResVT.getVectorNumElements(); + assert(!(NumElements & 1) && "Splitting vector, but not in half!"); + LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), + ResVT.getVectorElementType(), NumElements / 2); + + EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), + LoVT.getVectorNumElements()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(0)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); + + // Now combine the parts back together so we still have a single result + // like the combiner expects. + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); +} + +/// Replace a splat of a scalar to a vector store by scalar stores of the scalar +/// value. The load store optimizer pass will merge them to store pair stores. +/// This has better performance than a splat of the scalar followed by a split +/// vector store. Even if the stores are not merged it is four stores vs a dup, +/// followed by an ext.b and two stores. +static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + // Don't replace floating point stores, they possibly won't be transformed to + // stp because of the store pair suppress pass. + if (VT.isFloatingPoint()) + return SDValue(); + + // Check for insert vector elements. + if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + // We can express a splat as store pair(s) for 2 or 4 elements. 
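// Illustrative sketch (editorial): e.g. storing a splatted v4i32 becomes four
// scalar 32-bit stores of the same value, which the load/store optimizer is
// then expected to merge into two stp instructions; even unmerged, the four
// scalar stores beat the dup + split-store sequence described above.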
+ unsigned NumVecElts = VT.getVectorNumElements(); + if (NumVecElts != 4 && NumVecElts != 2) + return SDValue(); + SDValue SplatVal = StVal.getOperand(1); + unsigned RemainInsertElts = NumVecElts - 1; + + // Check that this is a splat. + while (--RemainInsertElts) { + SDValue NextInsertElt = StVal.getOperand(0); + if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + if (NextInsertElt.getOperand(1) != SplatVal) + return SDValue(); + StVal = NextInsertElt; + } + unsigned OrigAlignment = St->getAlignment(); + unsigned EltOffset = NumVecElts == 4 ? 4 : 8; + unsigned Alignment = std::min(OrigAlignment, EltOffset); + + // Create scalar stores. This is at least as good as the code sequence for a + // split unaligned store wich is a dup.s, ext.b, and two stores. + // Most of the time the three stores should be replaced by store pair + // instructions (stp). + SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = + DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + + unsigned Offset = EltOffset; + while (--NumVecElts) { + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(Offset, MVT::i64)); + NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + Offset += EltOffset; + } + return NewST1; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *S = cast(N); + if (S->isVolatile()) + return SDValue(); + + // Cyclone has bad performance on unaligned 16B stores when crossing line and + // page boundries. We want to split such stores. + if (!Subtarget->isCyclone()) + return SDValue(); + + // Don't split at Oz. + MachineFunction &MF = DAG.getMachineFunction(); + bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); + if (IsMinSize) + return SDValue(); + + SDValue StVal = S->getValue(); + EVT VT = StVal.getValueType(); + + // Don't split v2i64 vectors. Memcpy lowering produces those and splitting + // those up regresses performance on micro-benchmarks and olden/bh. + if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) + return SDValue(); + + // Split unaligned 16B stores. They are terrible for performance. + // Don't split stores with alignment of 1 or 2. Code that uses clang vector + // extensions can use this to mark that it does not want splitting to happen + // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of + // eliminating alignment hazards is only 1 in 8 for alignment of 2. + if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || + S->getAlignment() <= 2) + return SDValue(); + + // If we get a splat of a scalar convert this vector store to a store of + // scalars. They will be merged into store pairs thereby removing two + // instructions. + SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); + if (ReplacedSplat != SDValue()) + return ReplacedSplat; + + SDLoc DL(S); + unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. 
+ EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getIntPtrConstant(0)); + SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getIntPtrConstant(NumElts)); + SDValue BasePtr = S->getBasePtr(); + SDValue NewST1 = + DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), + S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(8, MVT::i64)); + return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, + S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), + S->getAlignment()); +} + +/// Target-specific DAG combine function for post-increment LD1 (lane) and +/// post-increment LD1R. +static SDValue performPostLD1Combine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + bool IsLaneOp) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + unsigned LoadIdx = IsLaneOp ? 1 : 0; + SDNode *LD = N->getOperand(LoadIdx).getNode(); + // If it is not LOAD, can not do such combine. + if (LD->getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LoadSDN = cast(LD); + EVT MemVT = LoadSDN->getMemoryVT(); + // Check if memory operand is the same type as the vector element. + if (MemVT != VT.getVectorElementType()) + return SDValue(); + + // Check if there are other uses. If so, do not combine as it will introduce + // an extra load. + for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; + ++UI) { + if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. + continue; + if (*UI != N) + return SDValue(); + } + + SDValue Addr = LD->getOperand(1); + SDValue Vector = N->getOperand(0); + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = + Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD + || UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load. Otherwise, folding it + // would create a cycle. + if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) + continue; + // Also check that add is not used in the vector operand. This would also + // create a cycle. + if (User->isPredecessorOf(Vector.getNode())) + continue; + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = VT.getScalarSizeInBits() / 8; + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + + SmallVector Ops; + Ops.push_back(LD->getOperand(0)); // Chain + if (IsLaneOp) { + Ops.push_back(Vector); // The vector to be inserted + Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector + } + Ops.push_back(Addr); + Ops.push_back(Inc); + + EVT Tys[3] = { VT, MVT::i64, MVT::Other }; + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, 3)); + unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, + MemVT, + LoadSDN->getMemOperand()); + + // Update the uses. 
+ std::vector NewResults; + NewResults.push_back(SDValue(LD, 0)); // The result of load + NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + DCI.CombineTo(LD, NewResults); + DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result + DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register + + break; + } + return SDValue(); +} + +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. +static SDValue performNEONPostLDSTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + unsigned AddrOpIdx = N->getNumOperands() - 1; + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool IsStore = false; + bool IsLaneOp = false; + bool IsDupOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; + NumVecs = 2; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; + NumVecs = 3; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; + NumVecs = 4; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; + NumVecs = 2; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; + NumVecs = 3; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; + NumVecs = 4; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; + NumVecs = 2; IsStore = true; IsLaneOp = true; break; + case 
Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; + NumVecs = 3; IsStore = true; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; + NumVecs = 4; IsStore = true; IsLaneOp = true; break; + } + + EVT VecTy; + if (IsStore) + VecTy = N->getOperand(2).getValueType(); + else + VecTy = N->getValueType(0); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (IsLaneOp || IsDupOp) + NumBytes /= VecTy.getVectorNumElements(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // Incoming chain + // Load lane and store have vector list as input. + if (IsLaneOp || IsStore) + for (unsigned i = 2; i < AddrOpIdx; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Addr); // Base register + Ops.push_back(Inc); + + // Return Types. + EVT Tys[6]; + unsigned NumResultVecs = (IsStore ? 0 : NumVecs); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; // Type of write back register + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); + + MemIntrinsicSDNode *MemInt = cast(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +// Optimize compare with zero and branch. +static SDValue performBRCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue CCVal = N->getOperand(2); + SDValue Cmp = N->getOperand(3); + + assert(isa(CCVal) && "Expected a ConstantSDNode here!"); + unsigned CC = cast(CCVal)->getZExtValue(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return SDValue(); + + unsigned CmpOpc = Cmp.getOpcode(); + if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) + return SDValue(); + + // Only attempt folding if there is only one use of the flag and no use of the + // value. + if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) + return SDValue(); + + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); + + assert(LHS.getValueType() == RHS.getValueType() && + "Expected the value type to be the same for both operands!"); + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return SDValue(); + + if (isa(LHS) && cast(LHS)->isNullValue()) + std::swap(LHS, RHS); + + if (!isa(RHS) || !cast(RHS)->isNullValue()) + return SDValue(); + + if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || + LHS.getOpcode() == ISD::SRL) + return SDValue(); + + // Fold the compare into the branch instruction. 
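// Illustrative sketch (editorial): this rewrites, e.g.,
//   subs xzr, x0, #0   followed by   b.eq label   ==>   cbz x0, label
// (and the NE case to cbnz), which is why only EQ/NE comparisons against
// zero are accepted above.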
+ SDValue BR; + if (CC == AArch64CC::EQ) + BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + else + BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, BR, false); + + return SDValue(); +} + +// vselect (v1i1 setcc) -> +// vselect (v1iXX setcc) (XX is the size of the compared operand type) +// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as +// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine +// such VSELECT. +static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT CCVT = N0.getValueType(); + + if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || + CCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ResVT = N->getValueType(0); + EVT CmpVT = N0.getOperand(0).getValueType(); + // Only combine when the result type is of the same size as the compared + // operands. + if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) + return SDValue(); + + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + SDValue SetCC = + DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), + N0.getOperand(0), N0.getOperand(1), + cast(N0.getOperand(2))->get()); + return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, + IfTrue, IfFalse); +} + +/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with +/// the compare-mask instructions rather than going via NZCV, even if LHS and +/// RHS are really scalar. This replaces any scalar setcc in the above pattern +/// with a vector one followed by a DUP shuffle on the result. +static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT ResVT = N->getValueType(0); + + if (!N->getOperand(1).getValueType().isVector()) + return SDValue(); + + if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + return SDValue(); + + SDLoc DL(N0); + + EVT SrcVT = N0.getOperand(0).getValueType(); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, + ResVT.getSizeInBits() / SrcVT.getSizeInBits()); + EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + + // First perform a vector comparison, where lane 0 is the one we're interested + // in. + SDValue LHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); + SDValue RHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); + SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); + + // Now duplicate the comparison mask we want across all other lanes. 
+ SmallVector DUPMask(CCVT.getVectorNumElements(), 0); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), + Mask); + + return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); +} + +SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: + break; + case ISD::ADD: + case ISD::SUB: + return performAddSubLongCombine(N, DCI, DAG); + case ISD::XOR: + return performXorCombine(N, DAG, DCI, Subtarget); + case ISD::MUL: + return performMulCombine(N, DAG, DCI, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return performIntToFpCombine(N, DAG); + case ISD::OR: + return performORCombine(N, DCI, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return performExtendCombine(N, DCI, DAG); + case ISD::BITCAST: + return performBitcastCombine(N, DCI, DAG); + case ISD::CONCAT_VECTORS: + return performConcatVectorsCombine(N, DCI, DAG); + case ISD::SELECT: + return performSelectCombine(N, DAG); + case ISD::VSELECT: + return performVSelectCombine(N, DCI.DAG); + case ISD::STORE: + return performSTORECombine(N, DCI, DAG, Subtarget); + case AArch64ISD::BRCOND: + return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::DUP: + return performPostLD1Combine(N, DCI, false); + case ISD::INSERT_VECTOR_ELT: + return performPostLD1Combine(N, DCI, true); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: + return performNEONPostLDSTCombine(N, DCI, DAG); + default: + break; + } + } + return SDValue(); +} + +// Check if the return value is used as only a return value, as otherwise +// we can't perform a tail-call. In particular, we need to check for +// target ISD nodes that are returns and any other "odd" constructs +// that the generic analysis code won't necessarily catch. +bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, + SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. 
+    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
+        MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
+
+  bool HasRet = false;
+  for (SDNode *Node : Copy->uses()) {
+    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+// Return whether an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction.
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+
+  return true;
+}
+
+bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   bool &IsInc,
+                                                   SelectionDAG &DAG) const {
+  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Op->getOperand(0);
+  // All of the indexed addressing mode instructions take a signed
+  // 9 bit immediate offset.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    int64_t RHSC = (int64_t)RHS->getZExtValue();
+    if (RHSC >= 256 || RHSC <= -256)
+      return false;
+    IsInc = (Op->getOpcode() == ISD::ADD);
+    Offset = Op->getOperand(1);
+    return true;
+  }
+  return false;
+}
+
+bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+    return false;
+  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+bool AArch64TargetLowering::getPostIndexedAddressParts(
+    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+    return false;
+  // Post-indexing updates the base, so it's not a valid transform
+  // if that's not the same as the load's pointer.
+  if (Ptr != Base)
+    return false;
+  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+void AArch64TargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this");
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+    // Let normal code take care of it by not adding anything to Results.
+    return;
+  }
+}
+
+bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+  // Loads and stores less than 128-bits are already atomic; ones above that
+  // are doomed anyway, so defer to the default libcall and blame the OS when
+  // things go wrong:
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
+  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+    return LI->getType()->getPrimitiveSizeInBits() == 128;
+
+  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
+  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+}
+
+Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                             AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire =
+      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i64, i64} and we have to recombine them into a
+  // single i128 here.
+  if (ValTy->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
+    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+  }
+
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int =
+      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
+  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldxr, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
+}
+
+Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+                                                   Value *Val, Value *Addr,
+                                                   AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease =
+      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since the intrinsics must have legal type, the i128 intrinsics take two
+  // parameters: "i64, i64". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
+    Function *Stxr = Intrinsic::getDeclaration(M, Int);
+    Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
+  }
+
+  Intrinsic::ID Int =
+      IsRelease ?
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; + Type *Tys[] = { Addr->getType() }; + Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall2( + Stxr, Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 070db94808f0..139217d34402 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -12,274 +12,286 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_ISELLOWERING_H -#define LLVM_TARGET_AARCH64_ISELLOWERING_H +#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H +#define LLVM_TARGET_AArch64_ISELLOWERING_H -#include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetLowering.h" namespace llvm { + namespace AArch64ISD { - enum NodeType { - // Start the numbering from where ISD NodeType finishes. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // This is a conditional branch which also notes the flag needed - // (eq/sgt/...). A64 puts this information on the branches rather than - // compares as LLVM does. - BR_CC, - - // A node to be selected to an actual call operation: either BL or BLR in - // the absence of tail calls. - Call, - - // Indicates a floating-point immediate which fits into the format required - // by the FMOV instructions. First (and only) operand is the 8-bit encoded - // value of that immediate. - FPMOV, - - // Corresponds directly to an EXTR instruction. Operands are an LHS an RHS - // and an LSB. - EXTR, - - // Wraps a load from the GOT, which should always be performed with a 64-bit - // load instruction. This prevents the DAG combiner folding a truncate to - // form a smaller memory access. - GOTLoad, - - // Performs a bitfield insert. Arguments are: the value being inserted into; - // the value being inserted; least significant bit changed; width of the - // field. - BFI, - - // Simply a convenient node inserted during ISelLowering to represent - // procedure return. Will almost certainly be selected to "RET". - Ret, - - /// Extracts a field of contiguous bits from the source and sign extends - /// them into a single register. Arguments are: source; immr; imms. Note - /// these are pre-encoded since DAG matching can't cope with combining LSB - /// and Width into these values itself. - SBFX, - - /// This is an A64-ification of the standard LLVM SELECT_CC operation. The - /// main difference is that it only has the values and an A64 condition, - /// which will be produced by a setcc instruction. - SELECT_CC, - - /// This serves most of the functions of the LLVM SETCC instruction, for two - /// purposes. First, it prevents optimisations from fiddling with the - /// compare after we've moved the CondCode information onto the SELECT_CC or - /// BR_CC instructions. Second, it gives a legal instruction for the actual - /// comparison. - /// - /// It keeps a record of the condition flags asked for because certain - /// instructions are only valid for a subset of condition codes. - SETCC, - - // Designates a node which is a tail call: both a call and a return - // instruction as far as selction is concerned. It should be selected to an - // unconditional branch. 
Has the usual plethora of call operands, but: 1st - // is callee, 2nd is stack adjustment required immediately before branch. - TC_RETURN, - - // Designates a call used to support the TLS descriptor ABI. The call itself - // will be indirect ("BLR xN") but a relocation-specifier (".tlsdesccall - // var") must be attached somehow during code generation. It takes two - // operands: the callee and the symbol to be relocated against. - TLSDESCCALL, - - // Leaf node which will be lowered to an appropriate MRS to obtain the - // thread pointer: TPIDR_EL0. - THREAD_POINTER, - - /// Extracts a field of contiguous bits from the source and zero extends - /// them into a single register. Arguments are: source; immr; imms. Note - /// these are pre-encoded since DAG matching can't cope with combining LSB - /// and Width into these values itself. - UBFX, - - // Wraps an address which the ISelLowering phase has decided should be - // created using the large memory model style: i.e. a sequence of four - // movz/movk instructions. - WrapperLarge, - - // Wraps an address which the ISelLowering phase has decided should be - // created using the small memory model style: i.e. adrp/add or - // adrp/mem-op. This exists to prevent bare TargetAddresses which may never - // get selected. - WrapperSmall, - - // Vector move immediate - NEON_MOVIMM, - - // Vector Move Inverted Immediate - NEON_MVNIMM, - - // Vector FP move immediate - NEON_FMOVIMM, - - // Vector permute - NEON_UZP1, - NEON_UZP2, - NEON_ZIP1, - NEON_ZIP2, - NEON_TRN1, - NEON_TRN2, - - // Vector Element reverse - NEON_REV64, - NEON_REV32, - NEON_REV16, - - // Vector compare - NEON_CMP, - - // Vector compare zero - NEON_CMPZ, - - // Vector compare bitwise test - NEON_TST, - - // Vector saturating shift - NEON_QSHLs, - NEON_QSHLu, - - // Vector dup - NEON_VDUP, - - // Vector dup by lane - NEON_VDUPLANE, - - // Vector extract - NEON_VEXTRACT, - - // NEON duplicate lane loads - NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, - NEON_LD3DUP, - NEON_LD4DUP, - - // NEON loads with post-increment base updates: - NEON_LD1_UPD, - NEON_LD2_UPD, - NEON_LD3_UPD, - NEON_LD4_UPD, - NEON_LD1x2_UPD, - NEON_LD1x3_UPD, - NEON_LD1x4_UPD, - - // NEON stores with post-increment base updates: - NEON_ST1_UPD, - NEON_ST2_UPD, - NEON_ST3_UPD, - NEON_ST4_UPD, - NEON_ST1x2_UPD, - NEON_ST1x3_UPD, - NEON_ST1x4_UPD, - - // NEON duplicate lane loads with post-increment base updates: - NEON_LD2DUP_UPD, - NEON_LD3DUP_UPD, - NEON_LD4DUP_UPD, - - // NEON lane loads with post-increment base updates: - NEON_LD2LN_UPD, - NEON_LD3LN_UPD, - NEON_LD4LN_UPD, - - // NEON lane store with post-increment base updates: - NEON_ST2LN_UPD, - NEON_ST3LN_UPD, - NEON_ST4LN_UPD - }; -} +enum { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. + CALL, // Function call. + + // Almost the same as a normal call node, except that a TLSDesc relocation is + // needed so the linker can relax it correctly if possible. + TLSDESC_CALL, + ADRP, // Page address of a TargetGlobalAddress operand. + ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. + LOADgot, // Load from automatically generated descriptor (e.g. Global + // Offset Table, TLS record). + RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. + BRCOND, // Conditional branch instruction; "b.cond". + CSEL, + FCSEL, // Conditional move instruction. + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. 
+ + // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on + // ELF. + THREAD_POINTER, + ADC, + SBC, // adc, sbc instructions + + // Arithmetic instructions which write flags. + ADDS, + SUBS, + ADCS, + SBCS, + ANDS, + + // Floating point comparison + FCMP, + + // Floating point max and min instructions. + FMAX, + FMIN, + + // Scalar extract + EXTR, + + // Scalar-to-vector duplication + DUP, + DUPLANE8, + DUPLANE16, + DUPLANE32, + DUPLANE64, + + // Vector immedate moves + MOVI, + MOVIshift, + MOVIedit, + MOVImsl, + FMOV, + MVNIshift, + MVNImsl, + + // Vector immediate ops + BICi, + ORRi, + + // Vector bit select: similar to ISD::VSELECT but not all bits within an + // element must be identical. + BSL, + + // Vector arithmetic negation + NEG, + + // Vector shuffles + ZIP1, + ZIP2, + UZP1, + UZP2, + TRN1, + TRN2, + REV16, + REV32, + REV64, + EXT, + + // Vector shift by scalar + VSHL, + VLSHR, + VASHR, + + // Vector shift by scalar (again) + SQSHL_I, + UQSHL_I, + SQSHLU_I, + SRSHR_I, + URSHR_I, + + // Vector comparisons + CMEQ, + CMGE, + CMGT, + CMHI, + CMHS, + FCMEQ, + FCMGE, + FCMGT, + + // Vector zero comparisons + CMEQz, + CMGEz, + CMGTz, + CMLEz, + CMLTz, + FCMEQz, + FCMGEz, + FCMGTz, + FCMLEz, + FCMLTz, + + // Vector bitwise negation + NOT, + + // Vector bitwise selection + BIT, + + // Compare-and-branch + CBZ, + CBNZ, + TBZ, + TBNZ, + + // Tail calls + TC_RETURN, + + // Custom prefetch handling + PREFETCH, + + // {s|u}int to FP within a FP register. + SITOF, + UITOF, + + // NEON Load/Store with post-increment base updates + LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD1DUPpost, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD1LANEpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost +}; + +} // end namespace AArch64ISD class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { -public: - explicit AArch64TargetLowering(AArch64TargetMachine &TM); + bool RequireStrictAlign; - const char *getTargetNodeName(unsigned Opcode) const override; +public: + explicit AArch64TargetLowering(TargetMachine &TM); - CCAssignFn *CCAssignFnForNode(CallingConv::ID CC) const; + /// Selects the correct CCAssignFn for a the given CallingConvention + /// value. + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - SDValue LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; + /// computeKnownBitsForTargetNode - Determine which of the bits specified in + /// Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, const SelectionDAG &DAG, + unsigned Depth = 0) const override; - SDValue LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const override; + MVT getScalarShiftAmountTy(EVT LHSTy) const override; - unsigned getByValTypeAlignment(Type *Ty) const override; + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. 
+ bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, + bool *Fast = nullptr) const override { + if (RequireStrictAlign) + return false; + // FIXME: True for Cyclone, but not necessary others. + if (Fast) + *Fast = true; + return true; + } - SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const override; + /// LowerOperation - Provide custom lowering hooks for some operations. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool IsVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + const char *getTargetNodeName(unsigned Opcode) const override; - SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - bool isConcatVector(SDValue Op, SelectionDAG &DAG, SDValue V0, SDValue V1, - const int *Mask, SDValue &Res) const; + /// getFunctionAlignment - Return the Log2 alignment of this function. + unsigned getFunctionAlignment(const Function *F) const; - bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &V0, - SDValue &V1, int *Mask) const; + /// getMaximalGlobalOffset - Returns the maximal possible offset which can + /// be used for loads / stores from the global. + unsigned getMaximalGlobalOffset() const override; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, - const AArch64Subtarget *ST) const; + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; - void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - /// IsEligibleForTailCallOptimization - Check whether the call is eligible - /// for tail call optimization. Targets which want to do tail call - /// optimization should implement this function. - bool IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool IsVarArg, - bool IsCalleeStructRet, - bool IsCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, - SelectionDAG& DAG) const; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo *MFI, int ClobberedFI) const; + /// isShuffleMaskLegal - Return true if the given shuffle mask can be + /// codegen'd directly, or if it should be stack expanded. 
+ bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; + /// getSetCCResultType - Return the ISD::SETCC ValueType EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - bool IsTailCallConvention(CallingConv::ID CallCC) const; + MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *BB) const; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; - bool isLegalICmpImmediate(int64_t Val) const override; + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; - /// \brief Return true if the addressing mode represented by AM is legal for - /// this target, for a load/store of the specified type. + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; + + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; + + bool isLegalAddImmediate(int64_t) const override; + bool isLegalICmpImmediate(int64_t) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + MachineFunction &MF) const override; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; /// \brief Return the cost of the scaling factor used in the addressing @@ -289,122 +301,164 @@ class AArch64TargetLowering : public TargetLowering { /// If the AM is not supported, it returns a negative value. int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; - bool isTruncateFree(Type *Ty1, Type *Ty2) const override; - bool isTruncateFree(EVT VT1, EVT VT2) const override; + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - bool isZExtFree(Type *Ty1, Type *Ty2) const override; - bool isZExtFree(EVT VT1, EVT VT2) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const; + /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. + bool isDesirableToCommuteWithShift(const SDNode *N) const override; - MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const override; + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. 
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; - MachineBasicBlock * - emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB, - unsigned Size, unsigned Opcode) const; + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; - MachineBasicBlock * - emitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size, unsigned CmpOp, - A64CC::CondCodes Cond) const; - MachineBasicBlock * - emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size) const; + bool shouldExpandAtomicInIR(Instruction *Inst) const override; - MachineBasicBlock * - EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const; +private: + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; - SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128ToCall(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); - SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCall(CallLoweringInfo & /*CLI*/, + SmallVectorImpl &InVals) const override; - SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, SmallVectorImpl &InVals, + bool isThisReturn, SDValue ThisVal) const; + + bool isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, 
SelectionDAG &DAG) const; - SDValue PerformDAGCombine(SDNode *N,DAGCombinerInfo &DCI) const override; + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo *MFI, int ClobberedFI) const; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + bool IsTailCallConvention(CallingConv::ID CallCC) const; - /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses of the specified type. Returns whether it - /// is "fast" by reference in the second argument. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, - bool *Fast) const override; + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, SDLoc DL, + SelectionDAG &DAG) const override; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + 
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; ConstraintType getConstraintType(const std::string &Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT) const override; + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. ConstraintWeight - getSingleConstraintMatchWeight(AsmOperandInfo &Info, - const char *Constraint) const override; - void LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const override; + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; - std::pair + std::pair getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const override; - - /// getMaximalGlobalOffset - Returns the maximal possible offset which can - /// be used for loads / stores from the global. 
- unsigned getMaximalGlobalOffset() const override; - -protected: - std::pair - findRepresentativeClass(MVT VT) const override; - -private: - const InstrItineraryData *Itins; - - const AArch64Subtarget *getSubtarget() const { - return &getTargetMachine().getSubtarget(); - } -}; -enum NeonModImmType { - Neon_Mov_Imm, - Neon_Mvn_Imm + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, bool &IsInc, + SelectionDAG &DAG) const; + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, + SDValue &Offset, ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; }; -extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement, - bool &usesOnlyOneValue, bool &hasDominantValue, - bool &isConstant, bool &isUNDEF); -} // namespace llvm +namespace AArch64 { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace AArch64 + +} // end namespace llvm -#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H +#endif // LLVM_TARGET_AArch64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td new file mode 100644 index 000000000000..3b9e3c630596 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -0,0 +1,364 @@ +//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Atomic operand code-gen constructs. +// +//===----------------------------------------------------------------------===// + +//===---------------------------------- +// Atomic fences +//===---------------------------------- +def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + +//===---------------------------------- +// Atomic loads +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A atomic load operation that actually needs acquire semantics. +class acquiring_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected load ordering"); + return Ordering == Acquire || Ordering == SequentiallyConsistent; +}]>; + +// An atomic load operation that does not need either acquire or release +// semantics. 
+class relaxed_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$offset)), + (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; +def : Pat<(relaxed_load (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), + (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; +def : Pat<(relaxed_load (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_load + (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_load (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_load + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend)), + (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend)), + (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_load (am_indexed32 GPR64sp:$Rn, + uimm12s4:$offset)), + (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_load + (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (LDURWi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend)), + (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend)), + (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_load (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset)), + (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_load + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (LDURXi GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Atomic stores +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A store operation that actually needs release semantics. +class releasing_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected store ordering"); + return Ordering == Release || Ordering == SequentiallyConsistent; +}]>; + +// An atomic store operation that doesn't actually need to be atomic on AArch64. 
+class relaxed_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRB GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store + (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + GPR32:$val), + (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; +def : Pat<(relaxed_store + (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + GPR32:$val), + (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; +def : Pat<(relaxed_store + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), + (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_store + (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRH GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR32:$val), + (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR32:$val), + (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_store + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), + (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_store + (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRW GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + GPR32:$val), + (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + GPR32:$val), + (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), + (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), + (STLRX GPR64:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR64:$val), + (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR64:$val), + (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_store + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), + (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_store + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), + (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Low-level exclusive operations +//===---------------------------------- + +// Load-exclusives. 
+ +def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; + +def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; + +// Load-exclusives. + +def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldaxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; + +def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; + +// Store-exclusives. 
+ +def stxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), + (STXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + +// Store-release-exclusives. + +def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), + (STLXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STLXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STLXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STLXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + + +// And clear exclusive. 
+ +def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 4cc3813203ce..446149b4fb0d 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -1,4 +1,4 @@ -//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tablegen -*-=// +//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// // // The LLVM Compiler Infrastructure // @@ -6,1482 +6,8579 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// This file describes AArch64 instruction formats, down to the level of the -// instruction's overall class. -//===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// -// A64 Instruction Format Definitions. -//===----------------------------------------------------------------------===// +// Describe AArch64 instructions format here +// -// A64 is currently the only instruction set supported by the AArch64 -// architecture. -class A64Inst patterns, - InstrItinClass itin> - : Instruction { - // All A64 instructions are 32-bit. This field will be filled in - // gradually going down the hierarchy. - field bits<32> Inst; +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format val> { + bits<2> Value = val; +} + +def PseudoFrm : Format<0>; +def NormalFrm : Format<1>; // Do we need any others? +// AArch64 Instruction Format +class AArch64Inst : Instruction { + field bits<32> Inst; // Instruction encoding. + // Mask of bits that cause an encoding to be UNPREDICTABLE. + // If a bit is set, then if the corresponding bit in the + // target encoding differs from its value in the "Inst" field, + // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). field bits<32> Unpredictable = 0; // SoftFail is the generic name for this field, but we alias it so // as to make it more obvious what it means in ARM-land. field bits<32> SoftFail = Unpredictable; - - // LLVM-level model of the AArch64/A64 distinction. - let Namespace = "AArch64"; - let DecoderNamespace = "A64"; - let Size = 4; - - // Set the templated fields - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asmstr; - let Pattern = patterns; - let Itinerary = itin; + let Namespace = "AArch64"; + Format F = f; + bits<2> Form = F.Value; + let Pattern = []; + let Constraints = cstr; } -class PseudoInst patterns> : Instruction { - let Namespace = "AArch64"; - - let OutOperandList = outs; - let InOperandList= ins; - let Pattern = patterns; - let isCodeGenOnly = 1; - let isPseudo = 1; +// Pseudo instructions (don't have encoding information) +class Pseudo pattern, string cstr = ""> + : AArch64Inst { + dag OutOperandList = oops; + dag InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; } -// Represents a pseudo-instruction that represents a single A64 instruction for -// whatever reason, the eventual result will be a 32-bit real instruction. -class A64PseudoInst patterns> - : PseudoInst { +// Real instructions (have encoding information) +class EncodedI pattern> : AArch64Inst { + let Pattern = pattern; let Size = 4; } -// As above, this will be a single A64 instruction, but we can actually give the -// expansion in TableGen. 
-class A64PseudoExpand patterns, dag Result> - : A64PseudoInst, - PseudoInstExpansion; +// Normal instructions +class I pattern> + : EncodedI { + dag OutOperandList = oops; + dag InOperandList = iops; + let AsmString = !strconcat(asm, operands); +} +class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; +class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag : PatFrag<(ops node:$LHS), res>; -// First, some common cross-hierarchy register formats. +// Helper fragment for an extract of the high portion of a 128-bit vector. +def extract_high_v16i8 : + UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; +def extract_high_v8i16 : + UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; +def extract_high_v4i32 : + UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; +def extract_high_v2i64 : + UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; -class A64InstRd patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rd; +//===----------------------------------------------------------------------===// +// Asm Operand Classes. +// - let Inst{4-0} = Rd; +// Shifter operand for arithmetic shifted encodings. +def ShifterOperand : AsmOperandClass { + let Name = "Shifter"; } -class A64InstRt patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rt; - - let Inst{4-0} = Rt; +// Shifter operand for mov immediate encodings. +def MovImm32ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm32Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm32Shift"; +} +def MovImm64ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm64Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm64Shift"; } +// Shifter operand for arithmetic register shifted encodings. +class ArithmeticShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "ArithmeticShifter" # width; + let PredicateMethod = "isArithmeticShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} -class A64InstRdn patterns, InstrItinClass itin> - : A64InstRd { - // Inherit rdt - bits<5> Rn; +def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; +def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; - let Inst{9-5} = Rn; +// Shifter operand for logical register shifted encodings. +class LogicalShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalShifter" # width; + let PredicateMethod = "isLogicalShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; } -class A64InstRtn patterns, InstrItinClass itin> - : A64InstRt { - // Inherit rdt - bits<5> Rn; +def LogicalShifterOperand32 : LogicalShifterOperand<32>; +def LogicalShifterOperand64 : LogicalShifterOperand<64>; - let Inst{9-5} = Rn; +// Shifter operand for logical vector 128/64-bit shifted encodings. 
+def LogicalVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalVecShifter"; + let RenderMethod = "addShifterOperands"; +} +def LogicalVecHalfWordShifterOperand : AsmOperandClass { + let SuperClasses = [LogicalVecShifterOperand]; + let Name = "LogicalVecHalfWordShifter"; + let RenderMethod = "addShifterOperands"; } -// Instructions taking Rt,Rt2,Rn -class A64InstRtt2n patterns, InstrItinClass itin> - : A64InstRtn { - bits<5> Rt2; - - let Inst{14-10} = Rt2; +// The "MSL" shifter on the vector MOVI instruction. +def MoveVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MoveVecShifter"; + let RenderMethod = "addShifterOperands"; } -class A64InstRdnm patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Rm; +// Extend operand for arithmetic encodings. +def ExtendOperand : AsmOperandClass { + let Name = "Extend"; + let DiagnosticType = "AddSubRegExtendLarge"; +} +def ExtendOperand64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "Extend64"; + let DiagnosticType = "AddSubRegExtendSmall"; +} +// 'extend' that's a lsl of a 64-bit register. +def ExtendOperandLSL64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "ExtendLSL64"; + let RenderMethod = "addExtend64Operands"; + let DiagnosticType = "AddSubRegExtendLarge"; +} - let Inst{20-16} = Rm; +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "tryParseFPImm"; + let DiagnosticType = "InvalidFPImm"; } -class A64InstRtnm patterns, InstrItinClass itin> - : A64InstRtn { - bits<5> Rm; +def CondCode : AsmOperandClass { + let Name = "CondCode"; + let DiagnosticType = "InvalidCondCode"; +} - let Inst{20-16} = Rm; +// A 32-bit register pasrsed as 64-bit +def GPR32as64Operand : AsmOperandClass { + let Name = "GPR32as64"; +} +def GPR32as64 : RegisterOperand { + let ParserMatchClass = GPR32as64Operand; } -//===----------------------------------------------------------------------===// -// -// Actual A64 Instruction Formats -// +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } -// Format for Add-subtract (extended register) instructions. -class A64I_addsubext opt, bits<3> option, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm { - bits<3> Imm3; - - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-24} = 0b01011; - let Inst{23-22} = opt; - let Inst{21} = 0b1; - // Rm inherited in 20-16 - let Inst{15-13} = option; - let Inst{12-10} = Imm3; - // Rn inherited in 9-5 - // Rd inherited in 4-0 -} - -// Format for Add-subtract (immediate) instructions. -class A64I_addsubimm shift, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<12> Imm12; - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-24} = 0b10001; - let Inst{23-22} = shift; - let Inst{21-10} = Imm12; -} - -// Format for Add-subtract (shifted register) instructions. 
-class A64I_addsubshift shift, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm { - bits<6> Imm6; - - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-24} = 0b01011; - let Inst{23-22} = shift; - let Inst{21} = 0b0; - // Rm inherited in 20-16 - let Inst{15-10} = Imm6; - // Rn inherited in 9-5 - // Rd inherited in 4-0 -} - -// Format for Add-subtract (with carry) instructions. -class A64I_addsubcarry opcode2, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-21} = 0b11010000; - // Rm inherited in 20-16 - let Inst{15-10} = opcode2; - // Rn inherited in 9-5 - // Rd inherited in 4-0 -} - - -// Format for Bitfield instructions -class A64I_bitfield opc, bit n, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<6> ImmR; - bits<6> ImmS; +//===----------------------------------------------------------------------===// +// Operand Definitions. +// - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{22} = n; - let Inst{21-16} = ImmR; - let Inst{15-10} = ImmS; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +// ADR[P] instruction labels. +def AdrpOperand : AsmOperandClass { + let Name = "AdrpLabel"; + let ParserMethod = "tryParseAdrpLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrplabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let PrintMethod = "printAdrpLabel"; + let ParserMatchClass = AdrpOperand; } -// Format for compare and branch (immediate) instructions. -class A64I_cmpbr patterns, InstrItinClass itin> - : A64InstRt { - bits<19> Label; - - let Inst{31} = sf; - let Inst{30-25} = 0b011010; - let Inst{24} = op; - let Inst{23-5} = Label; - // Inherit Rt in 4-0 +def AdrOperand : AsmOperandClass { + let Name = "AdrLabel"; + let ParserMethod = "tryParseAdrLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrlabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let ParserMatchClass = AdrOperand; } -// Format for conditional branch (immediate) instructions. -class A64I_condbr patterns, InstrItinClass itin> - : A64Inst { - bits<19> Label; - bits<4> Cond; +// simm9 predicate - True if the immediate is in the range [-256, 255]. +def SImm9Operand : AsmOperandClass { + let Name = "SImm9"; + let DiagnosticType = "InvalidMemoryIndexedSImm9"; +} +def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { + let ParserMatchClass = SImm9Operand; +} - let Inst{31-25} = 0b0101010; - let Inst{24} = o1; - let Inst{23-5} = Label; - let Inst{4} = o0; - let Inst{3-0} = Cond; +// simm7sN predicate - True if the immediate is a multiple of N in the range +// [-64 * N, 63 * N]. +class SImm7Scaled : AsmOperandClass { + let Name = "SImm7s" # Scale; + let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; } -// Format for conditional compare (immediate) instructions. 
-class A64I_condcmpimm patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rn; - bits<5> UImm5; - bits<4> NZCVImm; - bits<4> Cond; +def SImm7s4Operand : SImm7Scaled<4>; +def SImm7s8Operand : SImm7Scaled<8>; +def SImm7s16Operand : SImm7Scaled<16>; - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = s; - let Inst{28-21} = 0b11010010; - let Inst{20-16} = UImm5; - let Inst{15-12} = Cond; - let Inst{11} = 0b1; - let Inst{10} = o2; - let Inst{9-5} = Rn; - let Inst{4} = o3; - let Inst{3-0} = NZCVImm; +def simm7s4 : Operand { + let ParserMatchClass = SImm7s4Operand; + let PrintMethod = "printImmScale<4>"; } -// Format for conditional compare (register) instructions. -class A64I_condcmpreg patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rn; - bits<5> Rm; - bits<4> NZCVImm; - bits<4> Cond; +def simm7s8 : Operand { + let ParserMatchClass = SImm7s8Operand; + let PrintMethod = "printImmScale<8>"; +} +def simm7s16 : Operand { + let ParserMatchClass = SImm7s16Operand; + let PrintMethod = "printImmScale<16>"; +} - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = s; - let Inst{28-21} = 0b11010010; - let Inst{20-16} = Rm; - let Inst{15-12} = Cond; - let Inst{11} = 0b0; - let Inst{10} = o2; - let Inst{9-5} = Rn; - let Inst{4} = o3; - let Inst{3-0} = NZCVImm; +class AsmImmRange : AsmOperandClass { + let Name = "Imm" # Low # "_" # High; + let DiagnosticType = "InvalidImm" # Low # "_" # High; } -// Format for conditional select instructions. -class A64I_condsel op2, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<4> Cond; +def Imm1_8Operand : AsmImmRange<1, 8>; +def Imm1_16Operand : AsmImmRange<1, 16>; +def Imm1_32Operand : AsmImmRange<1, 32>; +def Imm1_64Operand : AsmImmRange<1, 64>; - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = s; - let Inst{28-21} = 0b11010100; - // Inherit Rm in 20-16 - let Inst{15-12} = Cond; - let Inst{11-10} = op2; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def MovZSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG3"; + let RenderMethod = "addImmOperands"; } -// Format for data processing (1 source) instructions -class A64I_dp_1src opcode2, bits<6> opcode, - string asmstr, dag outs, dag ins, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = sf; - let Inst{30} = 0b1; - let Inst{29} = S; - let Inst{28-21} = 0b11010110; - let Inst{20-16} = opcode2; - let Inst{15-10} = opcode; -} - -// Format for data processing (2 source) instructions -class A64I_dp_2src opcode, bit S, - string asmstr, dag outs, dag ins, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = sf; - let Inst{30} = 0b0; - let Inst{29} = S; - let Inst{28-21} = 0b11010110; - let Inst{15-10} = opcode; +def movz_symbol_g3 : Operand { + let ParserMatchClass = MovZSymbolG3AsmOperand; } -// Format for data-processing (3 source) instructions +def MovZSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG2"; + let RenderMethod = "addImmOperands"; +} -class A64I_dp3 opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = sf; - let Inst{30-29} = opcode{5-4}; - let Inst{28-24} = 0b11011; - let Inst{23-21} = opcode{3-1}; - // Inherits Rm in 20-16 - let Inst{15} = opcode{0}; - // {14-10} mostly Ra, but unspecified for SMULH/UMULH - // Inherits Rn in 9-5 - // Inherits Rd in 4-0 -} - -// Format for exception generation instructions -class A64I_exception opc, bits<3> op2, bits<2> ll, - dag outs, dag ins, string asmstr, - list 
patterns, InstrItinClass itin> - : A64Inst { - bits<16> UImm16; +def movz_symbol_g2 : Operand { + let ParserMatchClass = MovZSymbolG2AsmOperand; +} - let Inst{31-24} = 0b11010100; - let Inst{23-21} = opc; - let Inst{20-5} = UImm16; - let Inst{4-2} = op2; - let Inst{1-0} = ll; +def MovZSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG1"; + let RenderMethod = "addImmOperands"; } -// Format for extract (immediate) instructions -class A64I_extract op, bit n, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<6> LSB; +def movz_symbol_g1 : Operand { + let ParserMatchClass = MovZSymbolG1AsmOperand; +} - let Inst{31} = sf; - let Inst{30-29} = op{2-1}; - let Inst{28-23} = 0b100111; - let Inst{22} = n; - let Inst{21} = op{0}; - // Inherits Rm in bits 20-16 - let Inst{15-10} = LSB; - // Inherits Rn in 9-5 - // Inherits Rd in 4-0 +def MovZSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG0"; + let RenderMethod = "addImmOperands"; } -let Predicates = [HasFPARMv8] in { +def movz_symbol_g0 : Operand { + let ParserMatchClass = MovZSymbolG0AsmOperand; +} -// Format for floating-point compare instructions. -class A64I_fpcmp type, bits<2> op, bits<5> opcode2, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rn; - bits<5> Rm; +def MovKSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG3"; + let RenderMethod = "addImmOperands"; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-16} = Rm; - let Inst{15-14} = op; - let Inst{13-10} = 0b1000; - let Inst{9-5} = Rn; - let Inst{4-0} = opcode2; +def movk_symbol_g3 : Operand { + let ParserMatchClass = MovKSymbolG3AsmOperand; } -// Format for floating-point conditional compare instructions. -class A64I_fpccmp type, bit op, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Rn; - bits<5> Rm; - bits<4> NZCVImm; - bits<4> Cond; +def MovKSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG2"; + let RenderMethod = "addImmOperands"; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-16} = Rm; - let Inst{15-12} = Cond; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4} = op; - let Inst{3-0} = NZCVImm; +def movk_symbol_g2 : Operand { + let ParserMatchClass = MovKSymbolG2AsmOperand; } -// Format for floating-point conditional select instructions. -class A64I_fpcondsel type, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<4> Cond; +def MovKSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG1"; + let RenderMethod = "addImmOperands"; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = Cond; - let Inst{11-10} = 0b11; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def movk_symbol_g1 : Operand { + let ParserMatchClass = MovKSymbolG1AsmOperand; } +def MovKSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG0"; + let RenderMethod = "addImmOperands"; +} -// Format for floating-point data-processing (1 source) instructions. 
-class A64I_fpdp1 type, bits<6> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-15} = opcode; - let Inst{14-10} = 0b10000; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format for floating-point data-processing (2 sources) instructions. -class A64I_fpdp2 type, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def movk_symbol_g0 : Operand { + let ParserMatchClass = MovKSymbolG0AsmOperand; } -// Format for floating-point data-processing (3 sources) instructions. -class A64I_fpdp3 type, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<5> Ra; +class fixedpoint_i32 + : Operand, + ComplexPattern", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm32"; + let ParserMatchClass = Imm1_32Operand; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11111; - let Inst{23-22} = type; - let Inst{21} = o1; - // Inherit Rm in 20-16 - let Inst{15} = o0; - let Inst{14-10} = Ra; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +class fixedpoint_i64 + : Operand, + ComplexPattern", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm64"; + let ParserMatchClass = Imm1_64Operand; } -// Format for floating-point <-> fixed-point conversion instructions. 
-class A64I_fpfixed type, bits<2> mode, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<6> Scale; +def fixedpoint_f32_i32 : fixedpoint_i32; +def fixedpoint_f64_i32 : fixedpoint_i32; - let Inst{31} = sf; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b0; - let Inst{20-19} = mode; - let Inst{18-16} = opcode; - let Inst{15-10} = Scale; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def fixedpoint_f32_i64 : fixedpoint_i64; +def fixedpoint_f64_i64 : fixedpoint_i64; + +def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} +def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} +def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; + let ParserMatchClass = Imm1_32Operand; } -// Format for floating-point <-> integer conversion instructions. -class A64I_fpint type, bits<2> rmode, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = sf; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def Imm0_7Operand : AsmImmRange<0, 7>; +def Imm0_15Operand : AsmImmRange<0, 15>; +def Imm0_31Operand : AsmImmRange<0, 31>; +def Imm0_63Operand : AsmImmRange<0, 63>; + +def vecshiftL8 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def vecshiftL16 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def vecshiftL32 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def vecshiftL64 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; } -// Format for floating-point immediate instructions. 
-class A64I_fpimm type, bits<5> imm5, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRd { - bits<8> Imm8; +// Crazy immediate formats used by 32-bit and 64-bit logical immediate +// instructions for splatting repeating bit patterns across the immediate. +def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; +def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; + +def LogicalImm32Operand : AsmOperandClass { + let Name = "LogicalImm32"; + let DiagnosticType = "LogicalSecondSource"; +} +def LogicalImm64Operand : AsmOperandClass { + let Name = "LogicalImm64"; + let DiagnosticType = "LogicalSecondSource"; +} +def logical_imm32 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); +}], logical_imm32_XFORM> { + let PrintMethod = "printLogicalImm32"; + let ParserMatchClass = LogicalImm32Operand; +} +def logical_imm64 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64); +}], logical_imm64_XFORM> { + let PrintMethod = "printLogicalImm64"; + let ParserMatchClass = LogicalImm64Operand; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-13} = Imm8; - let Inst{12-10} = 0b100; - let Inst{9-5} = imm5; - // Inherit Rd in 4-0 +// imm0_65535 predicate - True if the immediate is in the range [0,65535]. +def Imm0_65535Operand : AsmImmRange<0, 65535>; +def imm0_65535 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_65535Operand; + let PrintMethod = "printHexImm"; } +// imm0_255 predicate - True if the immediate is in the range [0,255]. +def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def imm0_255 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_255Operand; + let PrintMethod = "printHexImm"; } -// Format for load-register (literal) instructions. -class A64I_LDRlit opc, bit v, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRt { - bits<19> Imm19; +// imm0_127 predicate - True if the immediate is in the range [0,127] +def Imm0_127Operand : AsmImmRange<0, 127>; +def imm0_127 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printHexImm"; +} - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-5} = Imm19; - // Inherit Rt in 4-0 +// NOTE: These imm0_N operands have to be of type i64 because i64 is the size +// for all shift-amounts. + +// imm0_63 predicate - True if the immediate is in the range [0,63] +def imm0_63 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_63Operand; } -// Format for load-store exclusive instructions. 
-class A64I_LDSTex_tn size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - let Inst{31-30} = size; - let Inst{29-24} = 0b001000; - let Inst{23} = o2; - let Inst{22} = L; - let Inst{21} = o1; - let Inst{15} = o0; +// imm0_31 predicate - True if the immediate is in the range [0,31] +def imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_31Operand; } -class A64I_LDSTex_tt2n size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin>: - A64I_LDSTex_tn{ - bits<5> Rt2; - let Inst{14-10} = Rt2; +// imm0_15 predicate - True if the immediate is in the range [0,15] +def imm0_15 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_15Operand; } -class A64I_LDSTex_stn size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin>: - A64I_LDSTex_tn{ - bits<5> Rs; - let Inst{20-16} = Rs; +// imm0_7 predicate - True if the immediate is in the range [0,7] +def imm0_7 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_7Operand; } -class A64I_LDSTex_stt2n size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin>: - A64I_LDSTex_stn{ - bits<5> Rt2; - let Inst{14-10} = Rt2; +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr +// {5-0} - imm6 +class arith_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "ArithmeticShifterOperand" # width); } -// Format for load-store register (immediate post-indexed) instructions -class A64I_LSpostind size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; +def arith_shift32 : arith_shift; +def arith_shift64 : arith_shift; - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = SImm9; - let Inst{11-10} = 0b01; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +class arith_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); } -// Format for load-store register (immediate pre-indexed) instructions -class A64I_LSpreind size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; - +def arith_shifted_reg32 : arith_shifted_reg; +def arith_shifted_reg64 : arith_shifted_reg; - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = SImm9; - let Inst{11-10} = 0b11; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror +// {5-0} - imm6 +class logical_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "LogicalShifterOperand" # width); } -// Format for load-store register (unprivileged) instructions -class A64I_LSunpriv size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; +def logical_shift32 : logical_shift<32>; +def logical_shift64 : logical_shift<64>; - - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = 
SImm9; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +class logical_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, shiftop); } -// Format for load-store (unscaled immediate) instructions. -class A64I_LSunalimm size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; +def logical_shifted_reg32 : logical_shifted_reg; +def logical_shifted_reg64 : logical_shifted_reg; - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = SImm9; - let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +// A logical vector shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0, #8, #16, or #24 +def logical_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecShifterOperand; } +// A logical vector half-word shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #8 +def logical_vec_hw_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecHalfWordShifterOperand; +} -// Format for load-store (unsigned immediate) instructions. -class A64I_LSunsigimm size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<12> UImm12; +// A vector move shifter operand: +// {0} - imm1: #8 or #16 +def move_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getMoveVecShifterOpValue"; + let ParserMatchClass = MoveVecShifterOperand; +} - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b01; - let Inst{23-22} = opc; - let Inst{21-10} = UImm12; +def AddSubImmOperand : AsmOperandClass { + let Name = "AddSubImm"; + let ParserMethod = "tryParseAddSubImm"; + let DiagnosticType = "AddSubSecondSource"; +} +// An ADD/SUB immediate shifter operand: +// second operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #12 +class addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); } -// Format for load-store register (register offset) instructions. -class A64I_LSregoff size, bit v, bits<2> opc, bit optionlo, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<5> Rm; +def addsub_shifted_imm32 : addsub_shifted_imm; +def addsub_shifted_imm64 : addsub_shifted_imm; - // Complex operand selection needed for these instructions, so they - // need an "addr" field for encoding/decoding to be generated. 
- bits<3> Ext; - // OptionHi = Ext{2-1} - // S = Ext{0} +class neg_addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b1; - let Inst{20-16} = Rm; - let Inst{15-14} = Ext{2-1}; - let Inst{13} = optionlo; - let Inst{12} = Ext{0}; - let Inst{11-10} = 0b10; - // Inherits Rn in 9-5 - // Inherits Rt in 4-0 +def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; +def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; - let AddedComplexity = 50; +// An extend operand: +// {5-3} - extend type +// {2-0} - imm3 +def arith_extend : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand; +} +def arith_extend64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand64; } -// Format for Load-store register pair (offset) instructions -class A64I_LSPoffset opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; - - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b010; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +// 'extend' that's a lsl of a 64-bit register. +def arith_extendlsl64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperandLSL64; } -// Format for Load-store register pair (post-indexed) instructions -class A64I_LSPpostind opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; +class arith_extended_reg32 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend); +} - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b001; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +class arith_extended_reg32to64 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend64); } -// Format for Load-store register pair (pre-indexed) instructions -class A64I_LSPpreind opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; +// Floating-point immediate. 
+def fpimm32 : Operand, + PatLeaf<(f32 fpimm), [{ + return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} +def fpimm64 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b011; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +def fpimm8 : Operand { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; } -// Format for Load-store non-temporal register pair (offset) instructions -class A64I_LSPnontemp opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b000; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format for Logical (immediate) instructions -class A64I_logicalimm opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bit N; - bits<6> ImmR; - bits<6> ImmS; - - // N, ImmR and ImmS have no separate existence in any assembly syntax (or for - // selection), so we'll combine them into a single field here. 
- bits<13> Imm; - // N = Imm{12}; - // ImmR = Imm{11-6}; - // ImmS = Imm{5-0}; +// Vector lane operands +class AsmVectorIndex : AsmOperandClass { + let Name = "VectorIndex" # Suffix; + let DiagnosticType = "InvalidIndex" # Suffix; +} +def VectorIndex1Operand : AsmVectorIndex<"1">; +def VectorIndexBOperand : AsmVectorIndex<"B">; +def VectorIndexHOperand : AsmVectorIndex<"H">; +def VectorIndexSOperand : AsmVectorIndex<"S">; +def VectorIndexDOperand : AsmVectorIndex<"D">; + +def VectorIndex1 : Operand, ImmLeaf { + let ParserMatchClass = VectorIndex1Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexB : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexBOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexH : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexHOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexS : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexSOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexD : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexDOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100100; - let Inst{22} = Imm{12}; - let Inst{21-16} = Imm{11-6}; - let Inst{15-10} = Imm{5-0}; - // Rn inherited in 9-5 - // Rd inherited in 4-0 +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def simdimmtype10 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = SIMDImmType10Operand; + let PrintMethod = "printSIMDType10Operand"; } -// Format for Logical (shifted register) instructions -class A64I_logicalshift opc, bits<2> shift, bit N, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<6> Imm6; - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-24} = 0b01010; - let Inst{23-22} = shift; - let Inst{21} = N; - // Rm inherited - let Inst{15-10} = Imm6; - // Rn inherited - // Rd inherited -} - -// Format for Move wide (immediate) -class A64I_movw opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRd { - bits<16> UImm16; - bits<2> Shift; // Called "hw" officially +//--- +// System management +//--- - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = Shift; - let Inst{20-5} = UImm16; - // Inherits Rd in 4-0 +// Base encoding for system instruction operands. +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BaseSystemI + : I { + let Inst{31-22} = 0b1101010100; + let Inst{21} = L; } -// Format for PC-relative addressing instructions, ADR and ADRP. -class A64I_PCADR patterns, InstrItinClass itin> - : A64InstRd { - bits<21> Label; - - let Inst{31} = op; - let Inst{30-29} = Label{1-0}; - let Inst{28-24} = 0b10000; - let Inst{23-5} = Label{20-2}; +// System instructions which do not have an Rt register. 
+class SimpleSystemI + : BaseSystemI { + let Inst{4-0} = 0b11111; } -// Format for system instructions -class A64I_system patterns, InstrItinClass itin> - : A64Inst { - bits<2> Op0; - bits<3> Op1; - bits<4> CRn; - bits<4> CRm; - bits<3> Op2; +// System instructions which have an Rt register. +class RtSystemI + : BaseSystemI, + Sched<[WriteSys]> { bits<5> Rt; + let Inst{4-0} = Rt; +} - let Inst{31-22} = 0b1101010100; - let Inst{21} = l; - let Inst{20-19} = Op0; - let Inst{18-16} = Op1; - let Inst{15-12} = CRn; +// Hint instructions that take both a CRm and a 3-bit immediate. +class HintI + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, + Sched<[WriteHint]> { + bits <7> imm; + let Inst{20-12} = 0b000110010; + let Inst{11-5} = imm; +} + +// System instructions taking a single literal operand which encodes into +// CRm. op2 differentiates the opcodes. +def BarrierAsmOperand : AsmOperandClass { + let Name = "Barrier"; + let ParserMethod = "tryParseBarrierOperand"; +} +def barrier_op : Operand { + let PrintMethod = "printBarrierOption"; + let ParserMatchClass = BarrierAsmOperand; +} +class CRmSystemI opc, string asm> + : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, + Sched<[WriteBarrier]> { + bits<4> CRm; + let Inst{20-12} = 0b000110011; let Inst{11-8} = CRm; - let Inst{7-5} = Op2; - let Inst{4-0} = Rt; + let Inst{7-5} = opc; +} - // These instructions can do horrible things. - let hasSideEffects = 1; +// MRS/MSR system instructions. These have different operand classes because +// a different subset of registers can be accessed through each instruction. +def MRSSystemRegisterOperand : AsmOperandClass { + let Name = "MRSSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MRS"; +} +// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. +def mrs_sysreg_op : Operand { + let ParserMatchClass = MRSSystemRegisterOperand; + let DecoderMethod = "DecodeMRSSystemRegister"; + let PrintMethod = "printMRSSystemRegister"; } -// Format for unconditional branch (immediate) instructions -class A64I_Bimm patterns, InstrItinClass itin> - : A64Inst { - // Doubly special in not even sharing register fields with other - // instructions, so we create our own Rn here. - bits<26> Label; +def MSRSystemRegisterOperand : AsmOperandClass { + let Name = "MSRSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MSR"; +} +def msr_sysreg_op : Operand { + let ParserMatchClass = MSRSystemRegisterOperand; + let DecoderMethod = "DecodeMSRSystemRegister"; + let PrintMethod = "printMSRSystemRegister"; +} - let Inst{31} = op; - let Inst{30-26} = 0b00101; - let Inst{25-0} = Label; +class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), + "mrs", "\t$Rt, $systemreg"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; } -// Format for Test & branch (immediate) instructions -class A64I_TBimm patterns, InstrItinClass itin> - : A64InstRt { - // Doubly special in not even sharing register fields with other - // instructions, so we create our own Rn here. - bits<6> Imm; - bits<14> Label; +// FIXME: Some of these def NZCV, others don't. Best way to model that? +// Explicitly modeling each of the system register as a register class +// would do it, but feels like overkill at this point. 
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), + "msr", "\t$systemreg, $Rt"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; +} - let Inst{31} = Imm{5}; - let Inst{30-25} = 0b011011; - let Inst{24} = op; - let Inst{23-19} = Imm{4-0}; - let Inst{18-5} = Label; - // Inherit Rt in 4-0 +def SystemPStateFieldOperand : AsmOperandClass { + let Name = "SystemPStateField"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield_op : Operand { + let ParserMatchClass = SystemPStateFieldOperand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateI + : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), + "msr", "\t$pstate_field, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bits<4> imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-12} = 0b0100; + let Inst{11-8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; +} + +// SYS and SYSL generic system instructions. +def SysCRAsmOperand : AsmOperandClass { + let Name = "SysCR"; + let ParserMethod = "tryParseSysCROperand"; +} + +def sys_cr_op : Operand { + let PrintMethod = "printSysCROperand"; + let ParserMatchClass = SysCRAsmOperand; +} + +class SystemXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + +class SystemLXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; } -// Format for Unconditional branch (register) instructions, including -// RET. Shares no fields with instructions further up the hierarchy -// so top-level. -class A64I_Breg opc, bits<5> op2, bits<6> op3, bits<5> op4, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64Inst { - // Doubly special in not even sharing register fields with other - // instructions, so we create our own Rn here. - bits<5> Rn; +// Branch (register) instructions: +// +// case opc of +// 0001 blr +// 0000 br +// 0101 dret +// 0100 eret +// 0010 ret +// otherwise UNDEFINED +class BaseBranchReg opc, dag oops, dag iops, string asm, + string operands, list pattern> + : I, Sched<[WriteBrReg]> { let Inst{31-25} = 0b1101011; let Inst{24-21} = opc; - let Inst{20-16} = op2; - let Inst{15-10} = op3; - let Inst{9-5} = Rn; - let Inst{4-0} = op4; + let Inst{20-16} = 0b11111; + let Inst{15-10} = 0b000000; + let Inst{4-0} = 0b00000; } +class BranchReg opc, string asm, list pattern> + : BaseBranchReg { + bits<5> Rn; + let Inst{9-5} = Rn; +} -//===----------------------------------------------------------------------===// -// -// Neon Instruction Format Definitions. -// +let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in +class SpecialReturn opc, string asm> + : BaseBranchReg { + let Inst{9-5} = 0b11111; +} -let Predicates = [HasNEON] in { +//--- +// Conditional branch instruction. +//--- -class NeonInstAlias - : InstAlias { +// Condition code. +// 4-bit immediate. 
Pretty-printed as +def ccode : Operand { + let PrintMethod = "printCondCode"; + let ParserMatchClass = CondCode; +} +def inv_ccode : Operand { + // AL and NV are invalid in the aliases which use inv_ccode + let PrintMethod = "printInverseCondCode"; + let ParserMatchClass = CondCode; + let MCOperandPredicate = [{ + return MCOp.isImm() && + MCOp.getImm() != AArch64CC::AL && + MCOp.getImm() != AArch64CC::NV; + }]; } -// Format AdvSIMD bitwise extract -class NeonI_BitExtract op2, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-24} = 0b101110; - let Inst{23-22} = op2; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - // imm4 in 14-11 - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD perm -class NeonI_Perm size, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-24} = 0b001110; - let Inst{23-22} = size; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - let Inst{14-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +// Conditional branch target. 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. +def PCRelLabel19Operand : AsmOperandClass { + let Name = "PCRelLabel19"; + let DiagnosticType = "InvalidLabel"; +} +def am_brcond : Operand { + let EncoderMethod = "getCondBranchTargetOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; } -// Format AdvSIMD table lookup -class NeonI_TBL op2, bits<2> len, bit op, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-24} = 0b001110; - let Inst{23-22} = op2; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD 3 vector registers with same vector type -class NeonI_3VSame size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; +class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), + "b", ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + let Uses = [NZCV]; + + bits<4> cond; + bits<19> target; + let Inst{31-24} = 0b01010100; + let Inst{23-5} = target; + let Inst{4} = 0; + let Inst{3-0} = cond; +} + +//--- +// Compare-and-branch instructions. +//--- +class BaseCmpBranch + : I<(outs), (ins regtype:$Rt, am_brcond:$target), + asm, "\t$Rt, $target", "", + [(node regtype:$Rt, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<19> target; + let Inst{30-25} = 0b011010; + let Inst{24} = op; + let Inst{23-5} = target; + let Inst{4-0} = Rt; +} + +multiclass CmpBranch { + def W : BaseCmpBranch { + let Inst{31} = 0; + } + def X : BaseCmpBranch { + let Inst{31} = 1; + } +} + +//--- +// Test-bit-and-branch instructions. +//--- +// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of +// the target offset are implied zero and so are not part of the immediate. 
+def BranchTarget14Operand : AsmOperandClass { + let Name = "BranchTarget14"; +} +def am_tbrcond : Operand { + let EncoderMethod = "getTestBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget14Operand; +} + +// AsmOperand classes to emit (or not) special diagnostics +def TBZImm0_31Operand : AsmOperandClass { + let Name = "TBZImm0_31"; + let PredicateMethod = "isImm0_31"; + let RenderMethod = "addImm0_31Operands"; +} +def TBZImm32_63Operand : AsmOperandClass { + let Name = "Imm32_63"; + let DiagnosticType = "InvalidImm0_63"; +} + +class tbz_imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = matcher; +} + +def tbz_imm0_31_diag : tbz_imm0_31; +def tbz_imm0_31_nodiag : tbz_imm0_31; + +def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); +}]> { + let ParserMatchClass = TBZImm32_63Operand; +} + +class BaseTestBranch + : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), + asm, "\t$Rt, $bit_off, $target", "", + [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<6> bit_off; + bits<14> target; + + let Inst{30-25} = 0b011011; + let Inst{24} = op; + let Inst{23-19} = bit_off{4-0}; + let Inst{18-5} = target; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeTestAndBranch"; +} + +multiclass TestBranch { + def W : BaseTestBranch { + let Inst{31} = 0; + } + + def X : BaseTestBranch { + let Inst{31} = 1; + } + + // Alias X-reg with 0-31 imm to W-Reg. + def : InstAlias(NAME#"W") GPR32as64:$Rd, + tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; + def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), + (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), + tbz_imm0_31_diag:$imm, bb:$target)>; +} + +//--- +// Unconditional branch (immediate) instructions. +//--- +def BranchTarget26Operand : AsmOperandClass { + let Name = "BranchTarget26"; + let DiagnosticType = "InvalidLabel"; +} +def am_b_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} +def am_bl_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} + +class BImm pattern> + : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { + bits<26> addr; + let Inst{31} = op; + let Inst{30-26} = 0b00101; + let Inst{25-0} = addr; + + let DecoderMethod = "DecodeUnconditionalBranch"; +} + +class BranchImm pattern> + : BImm; +class CallImm pattern> + : BImm; + +//--- +// Basic one-operand data processing instructions. 
+//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandData opc, RegisterClass regtype, string asm, + SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set regtype:$Rd, (node regtype:$Rn))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + + let Inst{30-13} = 0b101101011000000000; + let Inst{12-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass OneOperandData opc, string asm, + SDPatternOperator node = null_frag> { + def Wr : BaseOneOperandData { + let Inst{31} = 0; + } + + def Xr : BaseOneOperandData { + let Inst{31} = 1; + } +} + +class OneWRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 0; +} + +class OneXRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 1; +} + +//--- +// Basic two-operand data processing instructions. +//--- +class BaseBaseAddSubCarry pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30} = isSub; + let Inst{28-21} = 0b11010000; + let Inst{20-16} = Rm; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseAddSubCarry + : BaseBaseAddSubCarry; + +class BaseAddSubCarrySetFlags + : BaseBaseAddSubCarry { + let Defs = [NZCV]; +} + +multiclass AddSubCarry { + def Wr : BaseAddSubCarry { + let Inst{31} = 0; + let Inst{29} = 0; + } + def Xr : BaseAddSubCarry { + let Inst{31} = 1; + let Inst{29} = 0; + } + + // Sets flags. + def SWr : BaseAddSubCarrySetFlags { + let Inst{31} = 0; + let Inst{29} = 1; + } + def SXr : BaseAddSubCarrySetFlags { + let Inst{31} = 1; + let Inst{29} = 1; + } +} + +class BaseTwoOperand opc, RegisterClass regtype, string asm, + SDPatternOperator OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseDiv + : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { + let Inst{10} = isSigned; +} + +multiclass Div { + def Wr : BaseDiv, + Sched<[WriteID32, ReadID, ReadID]> { + let Inst{31} = 0; + } + def Xr : BaseDiv, + Sched<[WriteID64, ReadID, ReadID]> { + let Inst{31} = 1; + } +} + +class BaseShift shift_type, RegisterClass regtype, string asm, + SDPatternOperator OpNode = null_frag> + : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, + Sched<[WriteIS, ReadI]> { + let Inst{11-10} = shift_type; +} + +multiclass Shift shift_type, string asm, SDNode OpNode> { + def Wr : BaseShift { + let Inst{31} = 0; + } + + def Xr : BaseShift { + let Inst{31} = 1; + } + + def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), + (!cast(NAME # "Wr") GPR32:$Rn, + (EXTRACT_SUBREG i64:$Rm, sub_32))>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; +} + +class ShiftAlias + : InstAlias; + +class BaseMulAccum opc, RegisterClass multype, + RegisterClass addtype, string asm, + list 
pattern> + : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{30-24} = 0b0011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass MulAccum { + def Wrrr : BaseMulAccum, + Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { + let Inst{31} = 0; + } + + def Xrrr : BaseMulAccum, + Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { + let Inst{31} = 1; + } +} + +class WideMulAccum opc, string asm, + SDNode AccNode, SDNode ExtNode> + : BaseMulAccum, + Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { + let Inst{31} = 1; +} + +class MulHi opc, string asm, SDNode OpNode> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, + Sched<[WriteIM64, ReadIM, ReadIM]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-24} = 0b10011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + let PostEncoderMethod = "fixMulHigh"; +} + +class MulAccumWAlias + : InstAlias; +class MulAccumXAlias + : InstAlias; +class WideMulAccumAlias + : InstAlias; + +class BaseCRC32 sz, bit C, RegisterClass StreamReg, + SDPatternOperator OpNode, string asm> + : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = sf; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b010; + let Inst{12} = C; + let Inst{11-10} = sz; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let Predicates = [HasCRC]; +} + +//--- +// Address generation. +//--- + +class ADRI pattern> + : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", + pattern>, + Sched<[WriteI]> { + bits<5> Xd; + bits<21> label; + let Inst{31} = page; + let Inst{30-29} = label{1-0}; + let Inst{28-24} = 0b10000; + let Inst{23-5} = label{20-2}; + let Inst{4-0} = Xd; + + let DecoderMethod = "DecodeAdrInstruction"; +} + +//--- +// Move immediate. 
+//--- + +def movimm32_imm : Operand { + let ParserMatchClass = Imm0_65535Operand; + let EncoderMethod = "getMoveWideImmOpValue"; + let PrintMethod = "printHexImm"; +} +def movimm32_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm32ShifterOperand; +} +def movimm64_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm64ShifterOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "", []>, + Sched<[WriteImm]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass MoveImmediate opc, string asm> { + def Wi : BaseMoveImmediate { + let Inst{31} = 0; + } + + def Xi : BaseMoveImmediate { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), + (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass InsertImmediate opc, string asm> { + def Wi : BaseInsertImmediate { + let Inst{31} = 0; + } + + def Xi : BaseInsertImmediate { + let Inst{31} = 1; + } +} + +//--- +// Add/Subtract +//--- + +class BaseAddSubImm + : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", + [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<14> imm; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b10001; + let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 + let Inst{21-10} = imm{11-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let DecoderMethod = "DecodeBaseAddSubImm"; +} + +class BaseAddSubRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +class BaseAddSubSReg + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. 
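+  // For example, $Rd is encoded through the field named 'dst' below and $Rm
+  // through 'src2'; the deliberate mismatch makes TableGen fall back to
+  // positional (by-order) operand-to-field matching.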
+ bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-22} = shift{7-6}; + let Inst{21} = 0; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +class BaseAddSubEReg + : I<(outs dstRegtype:$R1), + (ins src1Regtype:$R2, src2Regtype:$R3), + asm, "\t$R1, $R2, $R3", "", + [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15-13} = ext{5-3}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseAddSubEReg64 + : I<(outs dstRegtype:$Rd), + (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), + asm, "\t$Rd, $Rn, $Rm$ext", "", []>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15} = ext{5}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +// Aliases for register+register add/subtract. +class AddSubRegAlias + : InstAlias; + +multiclass AddSub { + let hasSideEffects = 0 in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register - Only used for CodeGen + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + } + + // Add/Subtract extended register + let AddedComplexity = 1, hasSideEffects = 0 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when either the destination or + // first source register is SP. 
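+  // For example, "add sp, x0, x1" must use the extended-register form (UXTX #0
+  // here), because register number 31 means XZR, not SP, in the shifted form.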
+ def : AddSubRegAlias(NAME#"Wrx"), + GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Wrx"), + GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +multiclass AddSubS { + let isCompare = 1, Defs = [NZCV] in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + + // Add/Subtract extended register + let AddedComplexity = 1 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + } // Defs = [NZCV] + + // Compare aliases + def : InstAlias(NAME#"Wri") + WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; + def : InstAlias(NAME#"Xri") + XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; + def : InstAlias(NAME#"Wrx") + WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx") + XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx64") + XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; + + // Compare shorthands + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, 0), 5>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + def : InstAlias(NAME#"Wrx") + WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; + def : InstAlias(NAME#"Xrx64") + XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when the first source register + // is SP. + def : AddSubRegAlias(NAME#"Wrx"), + GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +//--- +// Extract +//--- +def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisPtrTy<3>]>; +def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; + +class BaseExtractImm patterns> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), + asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, + Sched<[WriteExtr, ReadExtrHi]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> imm; + + let Inst{30-23} = 0b00100111; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-10} = imm; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ExtractImm { + def Wrri : BaseExtractImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imm<5> must be zero. 
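+    // (For the 32-bit variant the lsb operand is limited to 0-31.)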
+ let imm{5} = 0; + } + def Xrri : BaseExtractImm { + + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Bitfield +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImm opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImm opc, string asm> { + def Wri : BaseBitfieldImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImm { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImmWith2RegArgs opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, + imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImmWith2RegArgs opc, string asm> { + def Wri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Logical +//--- + +// Logical (immediate) +class BaseLogicalImm opc, RegisterClass dregtype, + RegisterClass sregtype, Operand imm_type, string asm, + list pattern> + : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<13> imm; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100100; + let Inst{22} = imm{12}; + let Inst{21-16} = imm{11-6}; + let Inst{15-10} = imm{5-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeLogicalImmInstruction"; +} + +// Logical (shifted register) +class BaseLogicalSReg opc, bit N, RegisterClass regtype, + logical_shifted_reg shifted_regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30-29} = opc; + let Inst{28-24} = 0b01010; + let Inst{23-22} = shift{7-6}; + let Inst{21} = N; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +// Aliases for register+register logical instructions. 
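+// For example, "orr w0, w1, w2" is accepted for "orr w0, w1, w2, lsl #0".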
+class LogicalRegAlias + : InstAlias; + +let AddedComplexity = 6 in +multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } +} + +multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { + let isCompare = 1, Defs = [NZCV] in { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } + } // end Defs = [NZCV] +} + +class BaseLogicalRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +// Split from LogicalImm as not all instructions have both. +multiclass LogicalReg opc, bit N, string mnemonic, + SDPatternOperator OpNode> { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +// Split from LogicalReg to allow setting NZCV Defs +multiclass LogicalRegS opc, bit N, string mnemonic, + SDPatternOperator OpNode = null_frag> { + let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + } // Defs = [NZCV] + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +//--- +// Conditionally set flags +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsImm + : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> imm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = imm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsImm { + def Wi : BaseCondSetFlagsImm { + let Inst{31} = 0; + } + def Xi : BaseCondSetFlagsImm { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsReg + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsReg { + def Wr : BaseCondSetFlagsReg { + let Inst{31} = 0; + } + def Xr : BaseCondSetFlagsReg { + let Inst{31} = 1; + } +} + +//--- +// Conditional select +//--- + +class BaseCondSelect op2, RegisterClass regtype, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + 
bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass CondSelect op2, string asm> { + def Wr : BaseCondSelect { + let Inst{31} = 0; + } + def Xr : BaseCondSelect { + let Inst{31} = 1; + } +} + +class BaseCondSelectOp op2, RegisterClass regtype, string asm, + PatFrag frag> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, (frag regtype:$Rm), + (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); + return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32); +}]>; + +multiclass CondSelectOp op2, string asm, PatFrag frag> { + def Wr : BaseCondSelectOp { + let Inst{31} = 0; + } + def Xr : BaseCondSelectOp { + let Inst{31} = 1; + } + + def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), + (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, + (inv_cond_XFORM imm:$cond))>; + + def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), + (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, + (inv_cond_XFORM imm:$cond))>; +} + +//--- +// Special Mask Value +//--- +def maski8_or_more : Operand, + ImmLeaf { +} +def maski16_or_more : Operand, + ImmLeaf { +} + + +//--- +// Load/store +//--- + +// (unsigned immediate) +// Indexed for 8-bit registers. offset is in range [0,4095]. 
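+// The offset is scaled by the access size: byte accesses reach [0,4095], while
+// e.g. "ldr x0, [x1, #32]" encodes 32/8 = 4 in the 12-bit field, so 64-bit
+// accesses reach [0,32760] in steps of 8.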
+def am_indexed8 : ComplexPattern; +def am_indexed16 : ComplexPattern; +def am_indexed32 : ComplexPattern; +def am_indexed64 : ComplexPattern; +def am_indexed128 : ComplexPattern; + +class UImm12OffsetOperand : AsmOperandClass { + let Name = "UImm12Offset" # Scale; + let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; + let PredicateMethod = "isUImm12Offset<" # Scale # ">"; + let DiagnosticType = "InvalidMemoryIndexed" # Scale; +} + +def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; +def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; +def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; +def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; +def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; + +class uimm12_scaled : Operand { + let ParserMatchClass + = !cast("UImm12OffsetScale" # Scale # "Operand"); + let EncoderMethod + = "getLdStUImm12OpValue"; + let PrintMethod = "printUImm12Offset<" # Scale # ">"; +} + +def uimm12s1 : uimm12_scaled<1>; +def uimm12s2 : uimm12_scaled<2>; +def uimm12s4 : uimm12_scaled<4>; +def uimm12s8 : uimm12_scaled<8>; +def uimm12s16 : uimm12_scaled<16>; + +class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + + bits<5> Rn; + bits<12> offset; + + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b01; + let Inst{23-22} = opc; + let Inst{21-10} = offset; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeUnsignedLdStInstruction"; +} + +multiclass LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteST]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +def PrefetchOperand : AsmOperandClass { + let Name = "Prefetch"; + let ParserMethod = "tryParsePrefetch"; +} +def prfop : Operand { + let PrintMethod = "printPrefetchOp"; + let ParserMatchClass = PrefetchOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> + : BaseLoadStoreUI, + Sched<[WriteLD]>; + +//--- +// Load literal +//--- + +// Load literal address: 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. 
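+// This gives a +/-1MiB range from the instruction; e.g. a literal 1024 bytes
+// ahead is encoded as 1024 >> 2 = 256.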
+def am_ldrlit : Operand { + let EncoderMethod = "getLoadLiteralOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class LoadLiteral opc, bit V, RegisterClass regtype, string asm> + : I<(outs regtype:$Rt), (ins am_ldrlit:$label), + asm, "\t$Rt, $label", "", []>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchLiteral opc, bit V, string asm, list pat> + : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), + asm, "\t$Rt, $label", "", pat>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +//--- +// Load/store register offset +//--- + +def ro_Xindexed8 : ComplexPattern", []>; +def ro_Xindexed16 : ComplexPattern", []>; +def ro_Xindexed32 : ComplexPattern", []>; +def ro_Xindexed64 : ComplexPattern", []>; +def ro_Xindexed128 : ComplexPattern", []>; + +def ro_Windexed8 : ComplexPattern", []>; +def ro_Windexed16 : ComplexPattern", []>; +def ro_Windexed32 : ComplexPattern", []>; +def ro_Windexed64 : ComplexPattern", []>; +def ro_Windexed128 : ComplexPattern", []>; + +class MemExtendOperand : AsmOperandClass { + let Name = "Mem" # Reg # "Extend" # Width; + let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; + let RenderMethod = "addMemExtendOperands"; + let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; +} + +def MemWExtend8Operand : MemExtendOperand<"W", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemWExtend16Operand : MemExtendOperand<"W", 16>; +def MemWExtend32Operand : MemExtendOperand<"W", 32>; +def MemWExtend64Operand : MemExtendOperand<"W", 64>; +def MemWExtend128Operand : MemExtendOperand<"W", 128>; + +def MemXExtend8Operand : MemExtendOperand<"X", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemXExtend16Operand : MemExtendOperand<"X", 16>; +def MemXExtend32Operand : MemExtendOperand<"X", 32>; +def MemXExtend64Operand : MemExtendOperand<"X", 64>; +def MemXExtend128Operand : MemExtendOperand<"X", 128>; + +class ro_extend + : Operand { + let ParserMatchClass = ParserClass; + let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; + let DecoderMethod = "DecodeMemExtend"; + let EncoderMethod = "getMemExtendOpValue"; + let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); +} + +def ro_Wextend8 : ro_extend; +def ro_Wextend16 : ro_extend; +def ro_Wextend32 : ro_extend; +def ro_Wextend64 : ro_extend; +def ro_Wextend128 : ro_extend; + +def ro_Xextend8 : ro_extend; +def ro_Xextend16 : ro_extend; +def ro_Xextend32 : ro_extend; +def ro_Xextend64 : ro_extend; +def ro_Xextend128 : ro_extend; + +class ROAddrMode { + // CodeGen-level pattern covering the entire addressing mode. + ComplexPattern Wpat = windex; + ComplexPattern Xpat = xindex; + + // Asm-level Operand covering the valid "uxtw #3" style syntax. 
+ Operand Wext = wextend; + Operand Xext = xextend; +} + +def ro8 : ROAddrMode; +def ro16 : ROAddrMode; +def ro32 : ROAddrMode; +def ro64 : ROAddrMode; +def ro128 : ROAddrMode; + +class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class ROInstAlias + : InstAlias; + +multiclass Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, + string asm, list pat> + : I, + Sched<[WriteLD]> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { + def roW : BasePrefetchRO { + let Inst{13} = 0b0; + } + + def roX : BasePrefetchRO { + let Inst{13} = 0b1; + } + + def : InstAlias<"prfm $Rt, [$Rn, $Rm]", + (!cast(NAME # "roX") prfop:$Rt, + GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; +} + +//--- +// Load/store unscaled immediate +//--- + +def am_unscaled8 : ComplexPattern; +def am_unscaled16 : ComplexPattern; +def am_unscaled32 : ComplexPattern; +def am_unscaled64 : ComplexPattern; +def am_unscaled128 :ComplexPattern; + +class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before LoadUI + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before StoreUI + def i : BaseLoadStoreUnscale, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, + list pat> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store unscaled immediate, unprivileged +//--- + +class BaseLoadStoreUnprivileged 
sz, bit V, bits<2> opc, + dag oops, dag iops, string asm> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store pre-indexed +//--- + +class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePreIdx, + Sched<[WriteLD, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePreIdx, + Sched<[WriteAdr, WriteST]>; +} // hasSideEffects = 0 + +//--- +// Load/store post-indexed +//--- + +// (pre-index) load/stores. 
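+// Post-indexed forms access memory at the unmodified base and then write
+// base+offset back, e.g. "ldr x0, [x1], #8" loads from [x1] and adds 8 to x1.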
+class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePostIdx, + Sched<[WriteLD, WriteI]>; + +let mayStore = 1, mayLoad = 0 in +class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePostIdx, + Sched<[WriteAdr, WriteST, ReadAdrBase]>; +} // hasSideEffects = 0 + + +//--- +// Load/store pair +//--- + +// (indexed, offset) + +class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b010; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteLD, WriteLDHi]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + + +multiclass StorePairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +// (pre-indexed) +class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b011; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteLD, WriteLDHi, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (post-indexed) + +class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b001; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteLD, WriteLDHi, 
WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (no-allocate) + +class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b000; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteLD, WriteLDHi]>; + + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store exclusive +//--- + +// True exclusive operations write to and/or read from the system's exclusive +// monitors, which as far as a compiler is concerned can be modelled as a +// random shared memory address. Hence LoadExclusive mayStore. +// +// Since these instructions have the undefined register bits set to 1 in +// their canonical form, we need a post encoder method to set those bits +// to 1 when encoding these instructions. We do this using the +// fixLoadStoreExclusive function. This function has template parameters: +// +// fixLoadStoreExclusive +// +// hasRs indicates that the instruction uses the Rs field, so we won't set +// it to 1 (and the same for Rt2). We don't need template parameters for +// the other register fields since Rt and Rn are always used. +// +let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in +class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : I { + let Inst{31-30} = sz; + let Inst{29-24} = 0b001000; + let Inst{23} = o2; + let Inst{22} = L; + let Inst{21} = o1; + let Inst{15} = o0; + + let DecoderMethod = "DecodeExclusiveLdStInstruction"; +} + +// Neither Rs nor Rt2 operands. +class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : BaseLoadStoreExclusive { + bits<5> Rt; + bits<5> Rn; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; +} + +// Simple load acquires don't set the exclusive monitor +let mayLoad = 1, mayStore = 0 in +class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteLD, WriteLDHi]> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; +} + +// Simple store release operations do not check the exclusive monitor. 
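+// (Store-exclusives do, and additionally report a status: a typical sequence is
+// "1: ldxr x0, [x2]; ...; stxr w1, x0, [x2]; cbnz w1, 1b", retrying on failure.)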
+let mayLoad = 0, mayStore = 1 in +class StoreRelease sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteST]>; + +let mayLoad = 1, mayStore = 1 in +class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; + let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; +} + +class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; +} + +//--- +// Exception generation +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class ExceptionGeneration op1, bits<2> ll, string asm> + : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-2} = 0b000; + let Inst{1-0} = ll; +} + +let Predicates = [HasFPARMv8] in { + +//--- +// Floating point to integer conversion +//--- + +class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + Operand immType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Unscaled single-precision to 32-bit + def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled single-precision to 64-bit + def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Unscaled double-precision to 32-bit + def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, + [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled double-precision to 64-bit + def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, + [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Scaled single-precision to 
32-bit + def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, + fixedpoint_f32_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled single-precision to 64-bit + def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, + fixedpoint_f32_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Scaled double-precision to 32-bit + def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, + fixedpoint_f64_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled double-precision to 64-bit + def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, + fixedpoint_f64_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +//--- +// Integer to floating point conversion +//--- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseIntegerToFP pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b00001; + let Inst{16} = isUnsigned; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseIntegerToFPUnscaled + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b10001; + let Inst{16} = isUnsigned; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass IntegerToFP { + // Unscaled + def UWSri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UWDri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def UXSri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UXDri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + // Scaled + def SWSri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + let scale{5} = 1; + } + + def SWDri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + let scale{5} = 1; + } + + def SXSri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def SXDri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } +} + +//--- +// Unscaled integer <-> floating point conversion (i.e. FMOV) +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversion rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", + // We use COPY_TO_REGCLASS for these bitconvert operations. + // copyPhysReg() expands the resultant COPY instructions after + // regalloc is done. This gives greater freedom for the allocator + // and related passes (coalescing, copy propagation, et. al.) to + // be more effective. 
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111100; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionToHigh rmode, bits<3> opcode, + RegisterClass srcType, RegisterOperand dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, + RegisterOperand srcType, RegisterClass dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + + + +multiclass UnscaledConversion { + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } + + def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } +} + +//--- +// Floating point conversion +//--- + +class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, + RegisterClass srcType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{23-22} = type; + let Inst{21-17} = 0b10001; + let Inst{16-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPConversion { + // Double-precision to Half-precision + def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, + [(set FPR16:$Rd, (fround FPR64:$Rn))]>; + + // Double-precision to Single-precision + def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, + [(set FPR32:$Rd, (fround FPR64:$Rn))]>; + + // Half-precision to Double-precision + def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, + [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; + + // Half-precision to Single-precision + def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, + [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; + + // 
Single-precision to Double-precision + def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, + [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; + + // Single-precision to Half-precision + def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, + [(set FPR16:$Rd, (fround FPR32:$Rn))]>; +} + +//--- +// Single operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSingleOperandFPData opcode, RegisterClass regtype, + ValueType vt, string asm, SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21-19} = 0b100; + let Inst{18-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SingleOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Sr : BaseSingleOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Dr : BaseSingleOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Two operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPData opcode, RegisterClass regtype, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pat>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass TwoOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + + +//--- +// Three operand floating point data processing +//--- + +class BaseThreeOperandFPData pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, + Sched<[WriteFMul]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{31-23} = 0b000111110; + let Inst{21} = isNegated; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ThreeOperandFPData { + def Srrr : BaseThreeOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drrr : BaseThreeOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Floating point data comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b1000; + + // Rm should be 0b00000 canonically, but we need to accept any value. 
+ let PostEncoderMethod = "fixOneOperandFPComparison"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rm; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b0000; +} + +multiclass FPComparison { + let Defs = [NZCV] in { + def Srr : BaseTwoOperandFPComparison { + let Inst{22} = 0; + } + + def Sri : BaseOneOperandFPComparison { + let Inst{22} = 0; + } + + def Drr : BaseTwoOperandFPComparison { + let Inst{22} = 1; + } + + def Dri : BaseOneOperandFPComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV] +} + +//--- +// Floating point conditional comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPCondComparison + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteFCmp]> { + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = nzcv; +} + +multiclass FPCondComparison { + let Defs = [NZCV], Uses = [NZCV] in { + def Srr : BaseFPCondComparison { + let Inst{22} = 0; + } + + def Drr : BaseFPCondComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV], Uses = [NZCV] +} + +//--- +// Floating point conditional select +//--- + +class BaseFPCondSelect + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel (vt regtype:$Rn), regtype:$Rm, + (i32 imm:$cond), NZCV))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPCondSelect { + let Uses = [NZCV] in { + def Srrr : BaseFPCondSelect { + let Inst{22} = 0; + } + + def Drrr : BaseFPCondSelect { + let Inst{22} = 1; + } + } // Uses = [NZCV] +} + +//--- +// Floating move immediate +//--- + +class BaseFPMoveImmediate + : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", + [(set regtype:$Rd, fpimmtype:$imm)]>, + Sched<[WriteFImm]> { + bits<5> Rd; + bits<8> imm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-13} = imm; + let Inst{12-5} = 0b10000000; + let Inst{4-0} = Rd; +} + +multiclass FPMoveImmediate { + def Si : BaseFPMoveImmediate { + let Inst{22} = 0; + } + + def Di : BaseFPMoveImmediate { + let Inst{22} = 1; + } +} +} // end of 'let Predicates = [HasFPARMv8]' + +//---------------------------------------------------------------------------- +// AdvSIMD +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON] in { + +//---------------------------------------------------------------------------- +// AdvSIMD three register vector instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), 
asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// All operand sizes distinguished in the encoding. +multiclass SIMDThreeSameVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; +} + +// As above, but D sized elements unsupported. 
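// Illustrative sketch only, not part of this patch: instantiations of these
// multiclasses live in AArch64InstrInfo.td, and the opcode bits shown here are
// assumed rather than quoted from it. A plain vector add over every
// arrangement, using the full-width multiclass above (the D-excluded variant
// follows below), would be hooked up along the lines of:
defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
// which expands to one record per arrangement (ADDv8i8, ADDv16i8, ...,
// ADDv2i64), all sharing the encoding skeleton of BaseSIMDThreeSameVector.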
+multiclass SIMDThreeSameVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; +} + +multiclass SIMDThreeSameVectorBHSTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// As above, but only B sized elements supported. +multiclass SIMDThreeSameVectorB opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), + (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; +} + +// As above, but only S and D sized floating point elements supported. 
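// Sketch (assumed opcode, simplified pattern): the *Tied multiclasses model
// destructive operations such as multiply-accumulate, where the accumulator
// arrives in $Rd and the "$Rd = $dst" constraint keeps input and output in
// the same register. A real definition would supply an add-of-multiply
// pattern fragment where null_frag is written here:
defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;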
+multiclass SIMDThreeSameVectorFP opc, + string asm, SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPCmp opc, + string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPTied opc, + string asm, SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2f32 V64:$dst), + (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4f32 V128:$dst), + (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2f64 V128:$dst), + (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +// As above, but D and B sized elements unsupported. +multiclass SIMDThreeSameVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// Logical three vector ops share opcode bits, and only use B sized elements. 
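// Sketch (assumed opcode): in the floating point variants the size operand is
// spelled {S,0} or {S,1}, so bit 22 selects single vs. double precision while
// bit 23 carries the extra opcode bit S. A basic vector FP add would look
// roughly like:
defm FADD : SIMDThreeSameVectorFP<0, 0, 0b11010, "fadd", fadd>;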
+multiclass SIMDLogicalThreeVector size, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; + + def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; +} + +multiclass SIMDLogicalThreeVectorTied size, + string asm, SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]>; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD two register vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Supports B, H, and S element sizes. +multiclass SIMDTwoVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +class BaseSIMDVectorLShiftLongBySize size, + RegisterOperand regtype, string asm, string dstkind, + string srckind, string amount> + : I<(outs V128:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # + "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = size; + let Inst{21-10} = 0b100001001110; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorLShiftLongBySizeBHS { + let neverHasSideEffects = 1 in { + def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, + "shll", ".8h", ".8b", "8">; + def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, + "shll2", ".8h", ".16b", "8">; + def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, + "shll", ".4s", ".4h", "16">; + def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, + "shll2", ".4s", ".8h", "16">; + def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, + "shll", ".2d", ".2s", "32">; + def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, + "shll2", ".2d", ".4s", "32">; + } +} + +// Supports all element sizes. 
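// For reference, the SHLL family above is self-contained: the multiclass takes
// no parameters and hard-codes the mnemonics and shift amounts, so a single
// instantiation (presumably in AArch64InstrInfo.td) covers all forms:
defm SHLL : SIMDVectorLShiftLongBySizeBHS;
// e.g. "shll2 v0.4s, v1.8h, #16" widens the high four halfwords of v1,
// shifting each left by 16, i.e. by its own source element width.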
+multiclass SIMDLongTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +multiclass SIMDLongTwoVectorTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), + (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), + (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), + (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), + (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), + (v4i32 V128:$Rn)))]>; +} + +// Supports all element sizes, except 1xD. 
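// Sketch (opcode and intrinsic name assumed): the long two-vector forms halve
// the element count while doubling the element width, e.g. a signed pairwise
// add-long takes a .8b source and produces a .4h result; the Tied flavour
// above is for the accumulating version, which also reads the destination:
defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;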
+multiclass SIMDTwoVectorBHSDTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +// Supports only B element sizes. +multiclass SIMDTwoVectorB size, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + +} + +// Supports only B and H element sizes. +multiclass SIMDTwoVectorBH opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; +} + +// Supports only S and D element sizes, uses high bit of the size field +// as an extra opcode bit. 
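// Sketch (assumed size/opcode bits): the B-only variant above is the shape a
// population count would use, since CNT is only defined for byte elements:
defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;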
+multiclass SIMDTwoVectorFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +// Supports only S element size. +multiclass SIMDTwoVectorS opc, string asm, + SDPatternOperator OpNode> { + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + + +multiclass SIMDTwoVectorFPToInt opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorIntToFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +class BaseSIMDMixedTwoVector size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDMixedTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, + asm#"2", ".16b", ".8h", []>; + def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; + def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), + (!cast(NAME # "v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), + (!cast(NAME # "v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), + (!cast(NAME # "v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +class BaseSIMDCmpTwoVector size, bits<5> opcode, + RegisterOperand regtype, + string asm, string kind, string zero, + ValueType dty, ValueType sty, SDNode OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # + "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", + [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Comparisons support all element sizes, except 1xD. +multiclass SIMDCmpTwoVector opc, string asm, + SDNode OpNode> { + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + asm, ".8b", "0", + v8i8, v8i8, OpNode>; + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + asm, ".16b", "0", + v16i8, v16i8, OpNode>; + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + asm, ".4h", "0", + v4i16, v4i16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + asm, ".8h", "0", + v8i16, v8i16, OpNode>; + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + asm, ".2s", "0", + v2i32, v2i32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + asm, ".4s", "0", + v4i32, v4i32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + asm, ".2d", "0", + v2i64, v2i64, OpNode>; +} + +// FP Comparisons support only S and D element sizes. 
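// Sketch (assumed opcode): the mixed two-vector multiclass above is the shape
// of the narrowing moves. The low-half form carries a plain trunc pattern,
// while the "2" form is tied and pattern-less, matched instead through the
// concat_vectors Pats so that it only overwrites the high half of the
// destination register:
defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;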
+multiclass SIMDFPCmpTwoVector opc, + string asm, SDNode OpNode> { + + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + asm, ".2s", "0.0", + v2i32, v2f32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + asm, ".4s", "0.0", + v4i32, v4f32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + asm, ".2d", "0.0", + v2i64, v2f64, OpNode>; + + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDFPCvtTwoVector size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$Rd), (ins intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPWidenTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, + asm, ".4s", ".4h", []>; + def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, + asm#"2", ".4s", ".8h", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, + asm, ".2d", ".2s", []>; + def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, + asm#"2", ".2d", ".4s", []>; +} + +multiclass SIMDFPNarrowTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, + asm, ".4h", ".4s", []>; + def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", []>; + def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; +} + +multiclass SIMDFPInexactCvtTwoVector opc, string asm, + Intrinsic OpNode> { + def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", + [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), + (!cast(NAME # "v4f32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register different-size vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVector size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// FIXME: TableGen doesn't know how to deal with expanded types that also +// change the element count (in this case, placing the results in +// the high elements of the result register rather than the low +// elements). Until that's fixed, we can't code-gen those. +multiclass SIMDNarrowThreeVectorBHS opc, string asm, + Intrinsic IntOp> { + def v8i16_v8i8 : BaseSIMDDifferentThreeVector; + def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v4i16 : BaseSIMDDifferentThreeVector; + def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v2i64_v2i32 : BaseSIMDDifferentThreeVector; + def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; + + + // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in + // a version attached to an instruction. 
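  // Concretely, each Pat below rebuilds the full 128-bit destination by
  // INSERT_SUBREGing the existing low half (a V64 value) into an IMPLICIT_DEF
  // V128 at dsub, then feeds that as the tied operand of the "2" instruction,
  // which overwrites only the high 64 bits.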
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), + (v8i16 V128:$Rm))), + (!cast(NAME # "v8i16_v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), + (v4i32 V128:$Rm))), + (!cast(NAME # "v4i32_v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), + (v2i64 V128:$Rm))), + (!cast(NAME # "v2i64_v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDDifferentThreeVectorBD opc, string asm, + Intrinsic IntOp> { + def v8i8 : BaseSIMDDifferentThreeVector; + def v16i8 : BaseSIMDDifferentThreeVector; + let Predicates = [HasCrypto] in { + def v1i64 : BaseSIMDDifferentThreeVector; + def v2i64 : BaseSIMDDifferentThreeVector; + } + + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), + (v8i8 (extract_high_v16i8 V128:$Rm)))), + (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDLongThreeVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorBHSabdl opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHSabal opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHS opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, + SDPatternOperator Accum> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDWideThreeVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector +//---------------------------------------------------------------------------- + +class BaseSIMDBitwiseExtract + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # + "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", + [(set (vty regtype:$Rd), + (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> imm; + let Inst{31} = 0; + let Inst{30} = size; + let Inst{29-21} = 0b101110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-11} = imm; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDBitwiseExtract { + def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { + let imm{3} = 0; + } + def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; +} + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, + string asm, string kind, SDNode OpNode, ValueType valty> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "", + [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDZipVectoropc, string asm, + SDNode OpNode> { + def v8i8 : BaseSIMDZipVector<0b000, opc, V64, + asm, ".8b", OpNode, v8i8>; + def v16i8 : BaseSIMDZipVector<0b001, opc, V128, + asm, ".16b", OpNode, v16i8>; + def v4i16 : BaseSIMDZipVector<0b010, opc, V64, + asm, ".4h", OpNode, v4i16>; + def v8i16 : BaseSIMDZipVector<0b011, opc, V128, + asm, ".8h", OpNode, v8i16>; + def v2i32 : BaseSIMDZipVector<0b100, opc, V64, + asm, ".2s", OpNode, v2i32>; + def v4i32 : BaseSIMDZipVector<0b101, opc, V128, + asm, ".4s", OpNode, v4i32>; + def v2i64 : BaseSIMDZipVector<0b111, opc, V128, + asm, ".2d", OpNode, v2i64>; + + def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), + (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; + def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; + def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register scalar instructions +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalar size, bits<5> opcode, + RegisterClass regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit 
Rm in 20-16 + let Inst{21} = 1; + let Inst{20-16} = Rm; let Inst{15-11} = opcode; - let Inst{10} = 0b1; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD 3 vector registers with different vector type -class NeonI_3VDiff size, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01110; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarBHSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; + def v1i8 : BaseSIMDThreeScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; + def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), + (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass SIMDThreeScalarHS opc, string asm, + SDPatternOperator OpNode> { + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass SIMDThreeScalarFPCmp opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +class BaseSIMDThreeScalarMixed size, bits<5> opcode, + dag oops, dag iops, string asm, string cstr, list pat> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = opcode; - let Inst{11} = 0b0; - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD two registers and an element -class NeonI_2VElem size, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01111; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedTiedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +//---------------------------------------------------------------------------- +// AdvSIMD two register scalar instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalar size, bits<5> opcode, + 
RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, + "\t$Rd, $Rn", "", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; let Inst{23-22} = size; - // l in Inst{21} - // m in Inst{20} - // Inherit Rm in 19-16 - let Inst{15-12} = opcode; - // h in Inst{11} - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD 1 vector register with modified immediate -class NeonI_1VModImm patterns, InstrItinClass itin> - : A64InstRd { - bits<8> Imm; - bits<4> cmode; - let Inst{31} = 0b0; - let Inst{30} = q; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalarTied size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, + "\t$Rd, $Rn", "$Rd = $dst", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDCmpTwoScalar size, bits<5> opcode, + RegisterClass regtype, string asm, string zero> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "\t$Rd, $Rn, #" # zero, "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDInexactCvtTwoScalar opcode, string asm> + : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", + [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-17} = 0b011111100110000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDCmpTwoScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + + def : Pat<(v1i64 (OpNode FPR64:$Rn)), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDCmpTwoScalarSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i32rz : BaseSIMDCmpTwoScalar; + + def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; + def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i64 : BaseSIMDTwoScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), + (!cast(NAME # "v1i64") FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarSD opc, string asm> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarCVTSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : 
BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarBHSDTied opc, string asm, + Intrinsic OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalarTied; + def v1i32 : BaseSIMDTwoScalarTied; + def v1i16 : BaseSIMDTwoScalarTied; + def v1i8 : BaseSIMDTwoScalarTied; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; +} + + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDTwoScalarMixedBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDPairwiseScalar size, bits<5> opcode, + RegisterOperand regtype, RegisterOperand vectype, + string asm, string kind> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDPairwiseScalarD opc, string asm> { + def v2i64p : BaseSIMDPairwiseScalar; +} + +multiclass SIMDPairwiseScalarSD opc, string asm> { + def v2i32p : BaseSIMDPairwiseScalar; + def v2i64p : BaseSIMDPairwiseScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDAcrossLanes size, bits<5> opcode, + RegisterClass regtype, RegisterOperand vectype, + string asm, string kind, list pattern> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDAcrossLanesBHS opcode, + string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesHSD opcode, string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 
0b10, opcode, FPR64, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, + Intrinsic intOp> { + def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, + asm, ".4s", + [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +// FIXME: There has got to be a better way to factor these. ugh. + +class BaseSIMDInsDup pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{15} = 0; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDDupFromMain imm5, string size, ValueType vectype, + RegisterOperand vecreg, RegisterClass regtype> + : BaseSIMDInsDup { + let Inst{20-16} = imm5; + let Inst{14-11} = 0b0001; +} + +class SIMDDupFromElement + : BaseSIMDInsDup { + let Inst{14-11} = 0b0000; +} + +class SIMDDup64FromElement + : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, + VectorIndexD, i64, AArch64duplane64> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; +} + +class SIMDDup32FromElement + : SIMDDupFromElement { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; +} + +class SIMDDup16FromElement + : SIMDDupFromElement { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; +} + +class SIMDDup8FromElement + : SIMDDupFromElement { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; +} + +class BaseSIMDMov imm4, RegisterClass regtype, + Operand idxtype, string asm, list pattern> + : BaseSIMDInsDup { + let Inst{14-11} = imm4; +} + +class SIMDSMov + : BaseSIMDMov; +class SIMDUMov + : BaseSIMDMov; + +class SIMDMovAlias + : InstAlias; + +multiclass SMov { + def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } +} + +multiclass UMov { + def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + def : SIMDMovAlias<"mov", ".s", + !cast(NAME#"vi32"), + GPR32, VectorIndexS>; + def : SIMDMovAlias<"mov", ".d", + !cast(NAME#"vi64"), + GPR64, VectorIndexD>; +} + +class SIMDInsFromMain + : BaseSIMDInsDup<1, 0, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", + "{\t$Rd" # size # "$idx, $Rn" # + "|" # size # "\t$Rd$idx, $Rn}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { + let Inst{14-11} = 0b0011; +} 
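// Sketch (instantiations assumed to live in AArch64InstrInfo.td): the SMov and
// UMov multiclasses above take no parameters, so hooking up the lane-to-GPR
// moves amounts to:
defm SMOV : SMov;
defm UMOV : UMov;
// SMOV sign-extends a lane into a GPR (e.g. "smov w0, v1.b[3]") and UMOV
// zero-extends it; for the .s and .d forms of UMOV the plain "mov" spelling
// provided by SIMDMovAlias is the preferred one.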
+ +class SIMDInsFromElement + : BaseSIMDInsDup<1, 1, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", + "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # + "|" # size # "\t$Rd$idx, $Rn$idx2}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert + (vectype V128:$Rd), + (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), + idxtype:$idx))]>; + +class SIMDInsMainMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # + "|" # size #"\t$dst$idx, $src}", + (inst V128:$dst, idxtype:$idx, regtype:$src)>; +class SIMDInsElementMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + # "|" # size #" $dst$idx, $src$idx2}", + (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; + + +multiclass SIMDIns { + def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { + bits<4> idx; + bits<4> idx2; + let Inst{20-17} = idx; + let Inst{16} = 1; + let Inst{14-11} = idx2; + } + def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { + bits<3> idx; + bits<3> idx2; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + let Inst{14-12} = idx2; + let Inst{11} = 0; + } + def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { + bits<2> idx; + bits<2> idx2; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + let Inst{14-13} = idx2; + let Inst{12-11} = 0; + } + def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { + bits<1> idx; + bits<1> idx2; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + let Inst{14} = idx2; + let Inst{13-11} = 0; + } + + // For all forms of the INS instruction, the "mov" mnemonic is the + // preferred alias. Why they didn't just call the instruction "mov" in + // the first place is a very good question indeed... 
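  // So, for example, "mov v0.s[1], w1" is accepted and printed for the GPR
  // form "ins v0.s[1], w1", and "mov v0.b[0], v1.b[7]" likewise stands in for
  // the element-to-element form.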
+ def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), + GPR32, VectorIndexB>; + def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), + GPR32, VectorIndexH>; + def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), + GPR32, VectorIndexS>; + def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), + GPR64, VectorIndexD>; + + def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), + VectorIndexB>; + def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), + VectorIndexH>; + def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), + VectorIndexS>; + def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), + VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +class SIMDTableLookupAlias + : InstAlias; + +multiclass SIMDTableLookup { + def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v16i8Four"), + V128, VecListFour128>; +} + +multiclass SIMDTableLookupTied { + def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v16i8Four"), + V128, VecListFour128>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY +//---------------------------------------------------------------------------- +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDScalarCPY + : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", + "{\t$dst, $src" # kind # "$idx" # + "|\t$dst, $src$idx}", "", []>, + Sched<[WriteV]> { + bits<5> dst; + bits<5> src; + let Inst{31-21} = 0b01011110000; + let Inst{15-10} = 0b000001; + let Inst{9-5} = src; + let Inst{4-0} = dst; +} + +class SIMDScalarCPYAlias + : InstAlias; + + +multiclass SIMDScalarCPY { + def i8 : BaseSIMDScalarCPY { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def i16 : BaseSIMDScalarCPY { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def i32 : BaseSIMDScalarCPY { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def i64 : BaseSIMDScalarCPY { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), + VectorIndexD:$idx)))), + (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; + + // 'DUP' mnemonic aliases. 
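+  // E.g. "dup b0, v1.b[3]" is accepted and maps to the same encoding as
+  // "mov b0, v1.b[3]"; only the assembly spelling differs.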
+ def : SIMDScalarCPYAlias<"dup", ".b", + !cast(NAME#"i8"), + FPR8, V128, VectorIndexB>; + def : SIMDScalarCPYAlias<"dup", ".h", + !cast(NAME#"i16"), + FPR16, V128, VectorIndexH>; + def : SIMDScalarCPYAlias<"dup", ".s", + !cast(NAME#"i32"), + FPR32, V128, VectorIndexS>; + def : SIMDScalarCPYAlias<"dup", ".d", + !cast(NAME#"i64"), + FPR64, V128, VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD modified immediate instructions +//---------------------------------------------------------------------------- + +class BaseSIMDModifiedImm pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<8> imm8; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; let Inst{28-19} = 0b0111100000; + let Inst{18-16} = imm8{7-5}; + let Inst{11-10} = 0b01; + let Inst{9-5} = imm8{4-0}; + let Inst{4-0} = Rd; +} + +class BaseSIMDModifiedImmVector pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmInstruction"; +} + +class BaseSIMDModifiedImmVectorTied pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmTiedInstruction"; +} + +class BaseSIMDModifiedImmVectorShift b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + + +class BaseSIMDModifiedImmVectorShiftHalf b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, + string asm> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, + asm, ".4h", []>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, + asm, ".8h", []>; + + def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, + asm, ".2s", []>; + def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, + asm, ".4s", []>; +} + +multiclass SIMDModifiedImmVectorShiftTied hw_cmode, + bits<2> w_cmode, string asm, + SDNode OpNode> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + + def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; +} + +class SIMDModifiedImmMoveMSL cmode, + RegisterOperand vectype, string asm, + string kind, list pattern> 
+ : BaseSIMDModifiedImmVector { + bits<1> shift; + let Inst{15-13} = cmode{3-1}; + let Inst{12} = shift; +} + +class SIMDModifiedImmVectorNoShift cmode, + RegisterOperand vectype, + Operand imm_type, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { let Inst{15-12} = cmode; - let Inst{11} = 0b0; // o2 - let Inst{10} = 1; - // Inherit Rd in 4-0 - let Inst{18-16} = Imm{7-5}; // imm a:b:c - let Inst{9-5} = Imm{4-0}; // imm d:e:f:g:h } -// Format AdvSIMD 3 scalar registers with same type +class SIMDModifiedImmScalarNoShift cmode, string asm, + list pattern> + : BaseSIMDModifiedImm { + let Inst{15-12} = cmode; + let DecoderMethod = "DecodeModImmInstruction"; +} -class NeonI_Scalar3Same size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11110; +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexed size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), + asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-11} = opcode; - let Inst{10} = 0b1; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexedTied size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$dst), + (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; -// Format AdvSIMD 2 vector registers miscellaneous -class NeonI_2VMisc size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01110; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. 
+ let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPIndexedSD opc, string asm, + SDPatternOperator OpNode> { + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2f32 V64:$Rd), + (OpNode (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4f32 V128:$Rd), + (OpNode (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", + [(set (v2f64 V128:$Rd), + (OpNode (v2f64 V128:$Rn), + (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (f32 FPR32Op:$Rd), + (OpNode (f32 FPR32Op:$Rn), + (f32 (vector_extract (v4f32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", + [(set (f64 FPR64Op:$Rd), + (OpNode (f64 FPR64Op:$Rn), + (f64 (vector_extract (v2f64 V128:$Rm), + VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDFPIndexedSDTiedPatterns { + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # v2i32_indexed) + V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + + // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # "v4i32_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
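+  // In the DUP-scalar variants the FP register operand is placed in lane 0
+  // of a Q register with SUBREG_TO_REG, and the indexed instruction then
+  // reads lane 0.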
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 V128:$Rm), + VectorIndexD:$idx))), + (!cast(INST # "v2i64_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 FPR64Op:$Rm)))), + (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), + (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), + (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), + (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexD:$idx)>; +} + +multiclass SIMDFPIndexedSDTied opc, string asm> { + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + 
(OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$Rd), + (OpNode FPR32Op:$Rn, + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + (OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHSTied opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 
(v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an + // intermediate EXTRACT_SUBREG would be untyped. 
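+  // Instead, the i32 accumulating form is matched with the vector (v4i16)
+  // instruction: the scalar accumulator is moved into lane 0 via
+  // SUBREG_TO_REG and the i32 result is pulled back out of the ssub
+  // subregister with EXTRACT_SUBREG.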
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract (v4i32 + (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (!cast(NAME # v4i16_indexed) + (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, + V128_lo:$Rm, VectorIndexH:$idx), + ssub)>; + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 + (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i64 FPR64Op:$dst), + (Accum (i64 FPR64Op:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", 
".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +multiclass SIMDVectorIndexedLongSDTied opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift by immediate +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShift opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDScalarRShiftSD opc, string asm> { + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +multiclass SIMDScalarRShiftD opc, string asm, + SDPatternOperator OpNode> { + 
def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; +} + +multiclass SIMDScalarRShiftDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, + vecshiftR64:$imm)>; +} + +multiclass SIMDScalarLShiftD opc, string asm, + SDPatternOperator OpNode> { + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarLShiftDTied opc, string asm> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarRShiftBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } +} + +multiclass SIMDScalarLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; +} - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDScalarRShiftBHSD opc, string asm> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } } -// Format AdvSIMD 2 vector 1 immediate shift -class NeonI_2VShiftImm opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<7> Imm; - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; +//---------------------------------------------------------------------------- +// AdvSIMD vector x indexed element +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShift opc, bits<7> fixed_imm, + RegisterOperand dst_reg, RegisterOperand src_reg, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; let Inst{28-23} = 0b011110; - let Inst{22-16} = Imm; - let Inst{15-11} = opcode; - let Inst{10} = 0b1; - - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD duplicate and insert -class NeonI_copy imm4, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Imm5; - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{20-16} = Imm5; - let Inst{15} = 0b0; - let Inst{14-11} = imm4; - let Inst{10} = 0b1; - - // Inherit 
Rn in 9-5 - // Inherit Rd in 4-0 -} -// Format AdvSIMD insert from element to vector -class NeonI_insert patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Imm5; - bits<4> Imm4; - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{20-16} = Imm5; - let Inst{15} = 0b0; - let Inst{14-11} = Imm4; - let Inst{10} = 0b1; - - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD scalar pairwise -class NeonI_ScalarPair size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, + RegisterOperand vectype1, RegisterOperand vectype2, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } -// Format AdvSIMD 2 vector across lanes -class NeonI_2VAcross size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn -{ - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; +multiclass SIMDVectorRShiftSD opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDVectorRShiftSDToFP opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2f64 
V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } } -// Format AdvSIMD scalar two registers miscellaneous -class NeonI_Scalar2SameMisc size, bits<5> opcode, dag outs, dag ins, - string asmstr, list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD vector load/store multiple N-element structure -class NeonI_LdStMult opcode, bits<2> size, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn -{ - let Inst{31} = 0b0; - let Inst{30} = q; +multiclass SIMDVectorRShiftNarrowBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V128, vecshiftR16Narrow, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR16Narrow, + asm#"2", ".16b", ".8h", []> { + bits<3> imm; + let Inst{18-16} = imm; + let hasSideEffects = 0; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V128, vecshiftR32Narrow, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR32Narrow, + asm#"2", ".8h", ".4s", []> { + bits<4> imm; + let Inst{19-16} = imm; + let hasSideEffects = 0; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V128, vecshiftR64Narrow, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR64Narrow, + asm#"2", ".4s", ".2d", []> { + bits<5> imm; + let Inst{20-16} = imm; + let hasSideEffects = 0; + } + + // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions + // themselves, so put them here instead. + + // Patterns involving what's effectively an insert high and a normal + // intrinsic, represented by CONCAT_VECTORS. 
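+  // E.g. (concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), imm))
+  // selects the "2" (write-high) variant, with $Rd inserted into the low
+  // half of the destination via INSERT_SUBREG ... dsub.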
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)), + (!cast(NAME # "v16i8_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)), + (!cast(NAME # "v8i16_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)), + (!cast(NAME # "v4i32_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR64Narrow:$imm)>; +} + +multiclass SIMDVectorLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + 
V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDVectorRShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = 
imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftLongBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V128, V64, vecshiftL8, asm, ".8h", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm#"2", ".8h", ".16b", + [(set (v8i16 V128:$Rd), + (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V128, V64, vecshiftL16, asm, ".4s", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm#"2", ".4s", ".8h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V128, V64, vecshiftL32, asm, ".2d", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm#"2", ".2d", ".4s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } +} + + +//--- +// Vector load/store +//--- +// SIMD ldX/stX no-index memory references don't allow the optional +// ", #0" constant and handle post-indexing explicitly, so we use +// a more specialized parse method for them. Otherwise, it's the same as +// the general GPR64sp handling. 
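+// E.g. "ld1 { v0.16b }, [x0]"        no writeback
+//      "ld1 { v0.16b }, [x0], #16"   post-indexed by an immediate
+//      "ld1 { v0.16b }, [x0], x2"    post-indexed by a register
+// The immediate form is only accepted when it matches the number of bytes
+// transferred by the register list (the Offset64/Offset128 parameters below).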
+ +class BaseSIMDLdSt opcode, bits<2> size, + string asm, dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; let Inst{29-23} = 0b0011000; - let Inst{22} = l; + let Inst{22} = L; let Inst{21-16} = 0b000000; let Inst{15-12} = opcode; let Inst{11-10} = size; - - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD vector load/store multiple N-element structure (post-index) -class NeonI_LdStMult_Post opcode, bits<2> size, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtnm -{ - let Inst{31} = 0b0; - let Inst{30} = q; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : I { + bits<5> Vt; + bits<5> Rn; + bits<5> Xm; + let Inst{31} = 0; + let Inst{30} = Q; let Inst{29-23} = 0b0011001; - let Inst{22} = l; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 + let Inst{22} = L; + let Inst{21} = 0; + let Inst{20-16} = Xm; let Inst{15-12} = opcode; let Inst{11-10} = size; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD vector load Single N-element structure to all lanes -class NeonI_LdOne_Dup opcode, bits<2> size, dag outs, - dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRtn -{ - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-23} = 0b0011010; - let Inst{22} = 0b1; - let Inst{21} = r; - let Inst{20-16} = 0b00000; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +// The immediate form of AdvSIMD post-indexed addressing is encoded with +// register post-index addressing from the zero register. +multiclass SIMDLdStAliases { + // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" + // "ld1\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1.8b { v0, v1 }, [x1], #16" + // "ld1.8b\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1]" + // "ld1\t$Vt, [$Rn]" + // may get mapped to + // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Count # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. 
"ld1.8b { v0, v1 }, [x1], x2" + // "ld1\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass BaseSIMDLdN opcode> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, + (outs !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, + (outs !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, + (outs !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, + (outs !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, + (outs !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, + (outs !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, + (outs !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn), []>; + + + def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +// Only ld1/st1 has a v1d version. 
+multiclass BaseSIMDStN opcode> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { + def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn), []>; + + def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDLd1 opcode> + : BaseSIMDLdN { + + // LD1 instructions have extra "1d" variants. + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, + (outs !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDSt1 opcode> + : BaseSIMDStN { + + // ST1 instructions have extra "1d" variants. 
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass SIMDLd1Multiple { + defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDSt1Multiple { + defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDLd2Multiple { + defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDSt2Multiple { + defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDLd3Multiple { + defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDSt3Multiple { + defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDLd4Multiple { + defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +multiclass SIMDSt4Multiple { + defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +//--- +// AdvSIMD Load/store single-element +//--- + +class BaseSIMDLdStSingle opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStSingleTied opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; let Inst{15-13} = opcode; - let Inst{12} = 0b0; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, + Operand listtype> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", + (outs listtype:$Vt), (ins GPR64sp:$Rn), + []> { + let Inst{30} = Q; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = S; let Inst{11-10} = size; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdRPost opcode, bit S, bits<2> size, + string asm, Operand listtype, Operand GPR64pi> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", + "$Rn = $wback", + (outs GPR64sp:$wback, listtype:$Vt), + (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { + bits<5> Xm; + let Inst{30} = Q; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = S; + let Inst{11-10} = size; +} + +multiclass SIMDLdrAliases { + // E.g. "ld1r { v0.8b }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. 
"ld1r.8b { v0 }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1r.8b { v0 }, [x1]" + // "ld1r.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1r.8b { v0 }, [x1], x2" + // "ld1r.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdR opcode, bit S, string asm, string Count, + int Offset1, int Offset2, int Offset4, int Offset8> { + def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b")>; + def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count #"16b")>; + def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"4h")>; + def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"8h")>; + def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"2s")>; + def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"4s")>; + def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"1d")>; + def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"2d")>; + + def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b"), + !cast("GPR64pi" # Offset1)>; + def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "16b"), + !cast("GPR64pi" # Offset1)>; + def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "4h"), + !cast("GPR64pi" # Offset2)>; + def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "8h"), + !cast("GPR64pi" # Offset2)>; + def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "2s"), + !cast("GPR64pi" # Offset4)>; + def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "4s"), + !cast("GPR64pi" # Offset4)>; + def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "1d"), + !cast("GPR64pi" # Offset8)>; + def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "2d"), + !cast("GPR64pi" # Offset8)>; + + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; +} - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD vector load/store Single N-element structure to/from one lane -class NeonI_LdStOne_Lane op2_1, bit op0, dag outs, - dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn -{ - bits<4> lane; - let Inst{31} = 0b0; - let Inst{29-23} = 0b0011010; - let Inst{22} = l; - let Inst{21} = r; +class SIMDLdStSingleB opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. 
+ bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; let Inst{20-16} = 0b00000; - let Inst{15-14} = op2_1; - let Inst{13} = op0; - - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD post-index vector load Single N-element structure to all lanes -class NeonI_LdOne_Dup_Post opcode, bits<2> size, dag outs, - dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRtnm -{ - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-23} = 0b0011011; - let Inst{22} = 0b1; - let Inst{21} = r; - // Inherit Rm in 20-16 - let Inst{15-13} = opcode; - let Inst{12} = 0b0; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTied opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTiedPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} + +class SIMDLdStSingleH opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTied opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} + +class SIMDLdStSingleHPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTiedPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleS opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. 
+ bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleD opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; let Inst{11-10} = size; +} +class SIMDLdStSingleDTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD post-index vector load/store Single N-element structure -// to/from one lane -class NeonI_LdStOne_Lane_Post op2_1, bit op0, dag outs, - dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtnm -{ - bits<4> lane; - let Inst{31} = 0b0; - let Inst{29-23} = 0b0011011; - let Inst{22} = l; - let Inst{21} = r; - // Inherit Rm in 20-16 - let Inst{15-14} = op2_1; - let Inst{13} = op0; - - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD 3 scalar registers with different type - -class NeonI_Scalar3Diff size, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31-30} = 0b01; - let Inst{29} = u; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleBTied opcode, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleHTied opcode, bit size, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, + (outs 
listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleB opcode, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleB<0, R, opcode, asm, + (outs), (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleH opcode, bit size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleS opcode, bits<2> size,string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleD opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; } -// Format AdvSIMD scalar shift by immediate +multiclass SIMDLdStSingleAliases { + // E.g. "ld1 { v0.8b }[0], [x1], #1" + // "ld1\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + idxtype:$idx, XZR), 1>; + + // E.g. 
"ld1.8b { v0 }[0], [x1], #1" + // "ld1.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, XZR), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1]" + // "ld1.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Type) + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, GPR64sp:$Rn), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1], x2" + // "ld1.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, + !cast("GPR64pi" # Offset):$Xm), 0>; +} -class NeonI_ScalarShiftImm opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<4> Imm4; - bits<3> Imm3; - let Inst{31-30} = 0b01; - let Inst{29} = u; - let Inst{28-23} = 0b111110; - let Inst{22-19} = Imm4; - let Inst{18-16} = Imm3; - let Inst{15-11} = opcode; - let Inst{10} = 0b1; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDLdSt1SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; } -// Format AdvSIMD crypto AES -class NeonI_Crypto_AES size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31-24} = 0b01001110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10100; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDLdSt2SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; } -// Format AdvSIMD crypto SHA -class NeonI_Crypto_SHA size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31-24} = 0b01011110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10100; - let Inst{16-12} = opcode; +multiclass SIMDLdSt3SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt4SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} +} // end of 'let Predicates = [HasNEON]' + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +let Predicates = [HasCrypto] in { +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class AESBase opc, string asm, dag outs, dag ins, string cstr, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0100111000101000; + let Inst{15-12} = opc; let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } -// Format AdvSIMD crypto 3V SHA -class NeonI_Crypto_3VSHA size, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31-24} = 0b01011110; - let Inst{23-22} = size; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - let Inst{14-12} = opcode; +class AESInst opc, string asm, Intrinsic OpNode> + : AESBase; + +class AESTiedInst opc, string asm, 
Intrinsic OpNode> + : AESBase; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, + dag oops, dag iops, list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-21} = 0b01011110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD scalar x indexed element -class NeonI_ScalarXIndexedElem opcode, dag outs, dag ins, - string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm -{ - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11111; - let Inst{23} = szhi; - let Inst{22} = szlo; - // l in Inst{21} - // m in Instr{20} - // Inherit Rm in 19-16 - let Inst{15-12} = opcode; - // h in Inst{11} - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} -// Format AdvSIMD scalar copy - insert from element to scalar -class NeonI_ScalarCopy patterns, InstrItinClass itin> - : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> { - let Inst{28} = 0b1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } + +class SHATiedInstQSV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstVVV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstQQV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA2OpInst opc, string asm, string kind, + string cstr, dag oops, dag iops, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0101111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } +class SHATiedInstVV opc, string asm, Intrinsic OpNode> + : SHA2OpInst; + +class SHAInstSS opc, string asm, Intrinsic OpNode> + : SHA2OpInst; +} // end of 'let Predicates = [HasCrypto]' + +// Allow the size specifier tokens to be upper case, not just lower. 
+def : TokenAlias<".8B", ".8b">; +def : TokenAlias<".4H", ".4h">; +def : TokenAlias<".2S", ".2s">; +def : TokenAlias<".1D", ".1d">; +def : TokenAlias<".16B", ".16b">; +def : TokenAlias<".8H", ".8h">; +def : TokenAlias<".4S", ".4s">; +def : TokenAlias<".2D", ".2d">; +def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".B", ".b">; +def : TokenAlias<".H", ".h">; +def : TokenAlias<".S", ".s">; +def : TokenAlias<".D", ".d">; +def : TokenAlias<".Q", ".q">; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index e2612abffa52..ce85b2ceba95 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -11,22 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "AArch64.h" #include "AArch64InstrInfo.h" -#include "AArch64MachineFunctionInfo.h" -#include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "Utils/AArch64BaseInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineDominators.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" -#include using namespace llvm; @@ -34,234 +29,71 @@ using namespace llvm; #include "AArch64GenInstrInfo.inc" AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), - Subtarget(STI) {} + : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), + RI(this, &STI), Subtarget(STI) {} -void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - unsigned Opc = 0; - unsigned ZeroReg = 0; - if (DestReg == AArch64::XSP || SrcReg == AArch64::XSP) { - // E.g. ADD xDst, xsp, #0 (, lsl #0) - BuildMI(MBB, I, DL, get(AArch64::ADDxxi_lsl0_s), DestReg) - .addReg(SrcReg) - .addImm(0); - return; - } else if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { - // E.g. ADD wDST, wsp, #0 (, lsl #0) - BuildMI(MBB, I, DL, get(AArch64::ADDwwi_lsl0_s), DestReg) - .addReg(SrcReg) - .addImm(0); - return; - } else if (DestReg == AArch64::NZCV) { - assert(AArch64::GPR64RegClass.contains(SrcReg)); - // E.g. MSR NZCV, xDST - BuildMI(MBB, I, DL, get(AArch64::MSRix)) - .addImm(A64SysReg::NZCV) - .addReg(SrcReg); - } else if (SrcReg == AArch64::NZCV) { - assert(AArch64::GPR64RegClass.contains(DestReg)); - // E.g. 
MRS xDST, NZCV - BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg) - .addImm(A64SysReg::NZCV); - } else if (AArch64::GPR64RegClass.contains(DestReg)) { - if(AArch64::GPR64RegClass.contains(SrcReg)){ - Opc = AArch64::ORRxxx_lsl; - ZeroReg = AArch64::XZR; - } else{ - assert(AArch64::FPR64RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::GPR32RegClass.contains(DestReg)) { - if(AArch64::GPR32RegClass.contains(SrcReg)){ - Opc = AArch64::ORRwww_lsl; - ZeroReg = AArch64::WZR; - } else{ - assert(AArch64::FPR32RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::FPR32RegClass.contains(DestReg)) { - if(AArch64::FPR32RegClass.contains(SrcReg)){ - BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) - .addReg(SrcReg); - return; - } - else { - assert(AArch64::GPR32RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::FPR64RegClass.contains(DestReg)) { - if(AArch64::FPR64RegClass.contains(SrcReg)){ - BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) - .addReg(SrcReg); - return; - } - else { - assert(AArch64::GPR64RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::FPR128RegClass.contains(DestReg)) { - assert(AArch64::FPR128RegClass.contains(SrcReg)); +/// GetInstSize - Return the number of bytes of code the specified +/// instruction may be. This returns the maximum number of bytes. +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - // If NEON is enable, we use ORR to implement this copy. - // If NEON isn't available, emit STR and LDR to handle this. 
- if(getSubTarget().hasNEON()) { - BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg) - .addReg(SrcReg) - .addReg(SrcReg); - return; - } else { - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) - .addReg(SrcReg) - .addReg(AArch64::XSP) - .addImm(0x1ff & -16); + if (MI->getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) - .addReg(AArch64::XSP, RegState::Define) - .addReg(AArch64::XSP) - .addImm(16); - return; - } - } else if (AArch64::FPR8RegClass.contains(DestReg, SrcReg)) { - // The copy of two FPR8 registers is implemented by the copy of two FPR32 - const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_8, - &AArch64::FPR32RegClass); - unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_8, - &AArch64::FPR32RegClass); - BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst) - .addReg(Src); - return; - } else if (AArch64::FPR16RegClass.contains(DestReg, SrcReg)) { - // The copy of two FPR16 registers is implemented by the copy of two FPR32 - const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_16, - &AArch64::FPR32RegClass); - unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_16, - &AArch64::FPR32RegClass); - BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst) - .addReg(Src); - return; - } else { - CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg); - return; + const MCInstrDesc &Desc = MI->getDesc(); + switch (Desc.getOpcode()) { + default: + // Anything not explicitly designated otherwise is a nomal 4-byte insn. + return 4; + case TargetOpcode::DBG_VALUE: + case TargetOpcode::EH_LABEL: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + return 0; } - // E.g. ORR xDst, xzr, xSrc, lsl #0 - BuildMI(MBB, I, DL, get(Opc), DestReg) - .addReg(ZeroReg) - .addReg(SrcReg) - .addImm(0); -} - -void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, - unsigned SrcReg) const { - unsigned SubRegs; - bool IsQRegs; - if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) { - SubRegs = 2; - IsQRegs = false; - } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) { - SubRegs = 3; - IsQRegs = false; - } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) { - SubRegs = 4; - IsQRegs = false; - } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) { - SubRegs = 2; - IsQRegs = true; - } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) { - SubRegs = 3; - IsQRegs = true; - } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) { - SubRegs = 4; - IsQRegs = true; - } else - llvm_unreachable("Unknown register class"); - - unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0; - int Spacing = 1; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - // Copy register tuples backward when the first Dest reg overlaps - // with SrcReg. - if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) { - BeginIdx = BeginIdx + (SubRegs - 1); - Spacing = -1; - } - - unsigned Opc = IsQRegs ? 
AArch64::ORRvvv_16B : AArch64::ORRvvv_8B; - for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); - assert(Dst && Src && "Bad sub-register"); - BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst) - .addReg(Src) - .addReg(Src); - } - return; -} - -/// Does the Opcode represent a conditional branch that we can remove and re-add -/// at the end of a basic block? -static bool isCondBranch(unsigned Opc) { - return Opc == AArch64::Bcc || Opc == AArch64::CBZw || Opc == AArch64::CBZx || - Opc == AArch64::CBNZw || Opc == AArch64::CBNZx || - Opc == AArch64::TBZwii || Opc == AArch64::TBZxii || - Opc == AArch64::TBNZwii || Opc == AArch64::TBNZxii; -} - -/// Takes apart a given conditional branch MachineInstr (see isCondBranch), -/// setting TBB to the destination basic block and populating the Cond vector -/// with data necessary to recreate the conditional branch at a later -/// date. First element will be the opcode, and subsequent ones define the -/// conditions being branched on in an instruction-specific manner. -static void classifyCondBranch(MachineInstr *I, MachineBasicBlock *&TBB, - SmallVectorImpl &Cond) { - switch(I->getOpcode()) { - case AArch64::Bcc: - case AArch64::CBZw: - case AArch64::CBZx: - case AArch64::CBNZw: - case AArch64::CBNZx: - // These instructions just have one predicate operand in position 0 (either - // a condition code or a register being compared). - Cond.push_back(MachineOperand::CreateImm(I->getOpcode())); - Cond.push_back(I->getOperand(0)); - TBB = I->getOperand(1).getMBB(); - return; - case AArch64::TBZwii: - case AArch64::TBZxii: - case AArch64::TBNZwii: - case AArch64::TBNZxii: - // These have two predicate operands: a register and a bit position. - Cond.push_back(MachineOperand::CreateImm(I->getOpcode())); - Cond.push_back(I->getOperand(0)); - Cond.push_back(I->getOperand(1)); - TBB = I->getOperand(2).getMBB(); - return; + llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size"); +} + +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl &Cond) { + // Block ends with fall-through condbranch. + switch (LastInst->getOpcode()) { default: - llvm_unreachable("Unknown conditional branch to classify"); + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + Target = LastInst->getOperand(2).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); } } - -bool -AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { +// Branch analysis. 
+bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) @@ -281,15 +113,16 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (LastOpc == AArch64::Bimm) { + if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; } - if (isCondBranch(LastOpc)) { - classifyCondBranch(LastInst, TBB, Cond); + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); return false; } - return true; // Can't handle indirect branch. + return true; // Can't handle indirect branch. } // Get the instruction before it if it is a terminator. @@ -298,8 +131,8 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, // If AllowModify is true and the block ends with two or more unconditional // branches, delete all but the first unconditional branch. - if (AllowModify && LastOpc == AArch64::Bimm) { - while (SecondLastOpc == AArch64::Bimm) { + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); @@ -319,23 +152,15 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, return true; // If the block ends with a B and a Bcc, handle it. - if (LastOpc == AArch64::Bimm) { - if (SecondLastOpc == AArch64::Bcc) { - TBB = SecondLastInst->getOperand(1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(AArch64::Bcc)); - Cond.push_back(SecondLastInst->getOperand(0)); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } else if (isCondBranch(SecondLastOpc)) { - classifyCondBranch(SecondLastInst, TBB, Cond); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; } // If the block ends with two unconditional branches, handle it. The second // one is not executed, so remove it. - if (SecondLastOpc == AArch64::Bimm && LastOpc == AArch64::Bimm) { + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { TBB = SecondLastInst->getOperand(0).getMBB(); I = LastInst; if (AllowModify) @@ -343,84 +168,72 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, return false; } + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. + if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + // Otherwise, can't handle this. 
return true; } bool AArch64InstrInfo::ReverseBranchCondition( - SmallVectorImpl &Cond) const { - switch (Cond[0].getImm()) { - case AArch64::Bcc: { - A64CC::CondCodes CC = static_cast(Cond[1].getImm()); - CC = A64InvertCondCode(CC); - Cond[1].setImm(CC); - return false; - } - case AArch64::CBZw: - Cond[0].setImm(AArch64::CBNZw); - return false; - case AArch64::CBZx: - Cond[0].setImm(AArch64::CBNZx); - return false; - case AArch64::CBNZw: - Cond[0].setImm(AArch64::CBZw); - return false; - case AArch64::CBNZx: - Cond[0].setImm(AArch64::CBZx); - return false; - case AArch64::TBZwii: - Cond[0].setImm(AArch64::TBNZwii); - return false; - case AArch64::TBZxii: - Cond[0].setImm(AArch64::TBNZxii); - return false; - case AArch64::TBNZwii: - Cond[0].setImm(AArch64::TBZwii); - return false; - case AArch64::TBNZxii: - Cond[0].setImm(AArch64::TBZxii); - return false; - default: - llvm_unreachable("Unknown branch type"); - } -} - - -unsigned -AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const { - if (!FBB && Cond.empty()) { - BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(TBB); - return 1; - } else if (!FBB) { - MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); - for (int i = 1, e = Cond.size(); i != e; ++i) - MIB.addOperand(Cond[i]); - MIB.addMBB(TBB); - return 1; + SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); + } else { + // Folded compare-and-branch + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown conditional branch!"); + case AArch64::CBZW: + Cond[1].setImm(AArch64::CBNZW); + break; + case AArch64::CBNZW: + Cond[1].setImm(AArch64::CBZW); + break; + case AArch64::CBZX: + Cond[1].setImm(AArch64::CBNZX); + break; + case AArch64::CBNZX: + Cond[1].setImm(AArch64::CBZX); + break; + case AArch64::TBZW: + Cond[1].setImm(AArch64::TBNZW); + break; + case AArch64::TBNZW: + Cond[1].setImm(AArch64::TBZW); + break; + case AArch64::TBZX: + Cond[1].setImm(AArch64::TBNZX); + break; + case AArch64::TBNZX: + Cond[1].setImm(AArch64::TBZX); + break; + } } - MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); - for (int i = 1, e = Cond.size(); i != e; ++i) - MIB.addOperand(Cond[i]); - MIB.addMBB(TBB); - - BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(FBB); - return 2; + return false; } unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) return 0; + if (I == MBB.begin()) + return 0; --I; while (I->isDebugValue()) { if (I == MBB.begin()) return 0; --I; } - if (I->getOpcode() != AArch64::Bimm && !isCondBranch(I->getOpcode())) + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) return 0; // Remove the branch. @@ -428,9 +241,10 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { I = MBB.end(); - if (I == MBB.begin()) return 1; + if (I == MBB.begin()) + return 1; --I; - if (!isCondBranch(I->getOpcode())) + if (!isCondBranchOpcode(I->getOpcode())) return 1; // Remove the branch. 
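
(Editorial aside, not part of the patch.) The hunks above teach AnalyzeBranch, ReverseBranchCondition and RemoveBranch the rewritten Cond-vector convention: a plain Bcc stores only its condition-code operand, while a folded CBZ/CBNZ/TBZ/TBNZ stores a -1 marker followed by the branch opcode and its operands. Below is a minimal sketch of how a generic CodeGen pass typically drives these hooks; TII, MBB and the usual pass/include boilerplate are assumed, and the helper name is purely illustrative of this LLVM version's TargetInstrInfo interface, not code from the patch.

    // Illustrative sketch only -- not part of the patch. Flips a two-way
    // conditional terminator at the end of MBB using the hooks defined above.
    static bool reverseTerminator(const TargetInstrInfo *TII,
                                  MachineBasicBlock &MBB) {
      MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
      SmallVector<MachineOperand, 4> Cond;
      if (TII->AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false))
        return false;                      // terminator not analyzable
      if (!TBB || !FBB || Cond.empty())
        return false;                      // need a two-way conditional branch
      // For AArch64, Cond is either { CC } for a Bcc, or
      // { -1, Opcode, Reg [, BitNo] } for a folded CBZ/CBNZ/TBZ/TBNZ.
      if (TII->ReverseBranchCondition(Cond))
        return false;                      // condition cannot be reversed
      TII->RemoveBranch(MBB);
      TII->InsertBranch(MBB, FBB, TBB, Cond, DebugLoc()); // targets swapped
      return true;
    }
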
@@ -438,542 +252,1838 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 2; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const { - MachineInstr &MI = *MBBI; - MachineBasicBlock &MBB = *MI.getParent(); +void AArch64InstrInfo::instantiateCondBranch( + MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); + } else { + // Folded compare-and-branch + const MachineInstrBuilder MIB = + BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); + if (Cond.size() > 3) + MIB.addImm(Cond[3].getImm()); + MIB.addMBB(TBB); + } +} - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AArch64::TLSDESC_BLRx: { - MachineInstr *NewMI = - BuildMI(MBB, MBBI, MI.getDebugLoc(), get(AArch64::TLSDESCCALL)) - .addOperand(MI.getOperand(1)); - MI.setDesc(get(AArch64::BLRx)); - - llvm::finalizeBundle(MBB, NewMI, *++MBBI); - return true; - } +unsigned AArch64InstrInfo::InsertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); + else + instantiateCondBranch(MBB, DL, TBB, Cond); + return 1; + } + + // Two-way conditional branch. + instantiateCondBranch(MBB, DL, TBB, Cond); + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); + return 2; +} + +// Find the original register that VReg is copied from. +static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { + while (TargetRegisterInfo::isVirtualRegister(VReg)) { + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (!DefMI->isFullCopy()) + return VReg; + VReg = DefMI->getOperand(1).getReg(); + } + return VReg; +} + +// Determine if VReg is defined by an instruction that can be folded into a +// csel instruction. If so, return the folded opcode, and the replacement +// register. +static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, + unsigned *NewVReg = nullptr) { + VReg = removeCopies(MRI, VReg); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return 0; + + bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned Opc = 0; + unsigned SrcOpNum = 0; + switch (DefMI->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to ADDXri and ADDWri. + case AArch64::ADDXri: + case AArch64::ADDWri: + // add x, 1 -> csinc. + if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || + DefMI->getOperand(3).getImm() != 0) + return 0; + SrcOpNum = 1; + Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; + break; + + case AArch64::ORNXrr: + case AArch64::ORNWrr: { + // not x -> csinv, represented as orn dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; + break; + } + + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + // if NZCV is used, do not fold. 
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to SUBXrr and SUBWrr. + case AArch64::SUBXrr: + case AArch64::SUBWrr: { + // neg x -> csneg, represented as sub dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; + break; + } default: + return 0; + } + assert(Opc && SrcOpNum && "Missing parameters"); + + if (NewVReg) + *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); + return Opc; +} + +bool AArch64InstrInfo::canInsertSelect( + const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, + int &FalseCycles) const { + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) return false; + + // Expanding cbz/tbz requires an extra cycle of latency on the condition. + unsigned ExtraCondLat = Cond.size() != 1; + + // GPRs are handled by csel. + // FIXME: Fold in x+1, -x, and ~x when applicable. + if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || + AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + // Single-cycle csel, csinc, csinv, and csneg. + CondCycles = 1 + ExtraCondLat; + TrueCycles = FalseCycles = 1; + if (canFoldIntoCSel(MRI, TrueReg)) + TrueCycles = 0; + else if (canFoldIntoCSel(MRI, FalseReg)) + FalseCycles = 0; + return true; } + // Scalar floating point is handled by fcsel. + // FIXME: Form fabs, fmin, and fmax when applicable. + if (AArch64::FPR64RegClass.hasSubClassEq(RC) || + AArch64::FPR32RegClass.hasSubClassEq(RC)) { + CondCycles = 5 + ExtraCondLat; + TrueCycles = FalseCycles = 2; + return true; + } + + // Can't do vectors. return false; } -void -AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, - int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL = MBB.findDebugLoc(MBBI); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FrameIdx); - - MachineMemOperand *MMO - = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - Align); - - unsigned StoreOp = 0; - if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) { - switch(RC->getSize()) { - case 4: StoreOp = AArch64::LS32_STR; break; - case 8: StoreOp = AArch64::LS64_STR; break; +void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Parse the condition code, see parseCondBranch() above. + AArch64CC::CondCode CC; + switch (Cond.size()) { + default: + llvm_unreachable("Unknown condition opcode in Cond"); + case 1: // b.cc + CC = AArch64CC::CondCode(Cond[0].getImm()); + break; + case 3: { // cbz/cbnz + // We must insert a compare against 0. 
+ bool Is64Bit; + switch (Cond[1].getImm()) { default: - llvm_unreachable("Unknown size for regclass"); - } - } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) { - StoreOp = AArch64::LSFP8_STR; - } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) { - StoreOp = AArch64::LSFP16_STR; - } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) || - RC->hasType(MVT::f128)) { - switch (RC->getSize()) { - case 4: StoreOp = AArch64::LSFP32_STR; break; - case 8: StoreOp = AArch64::LSFP64_STR; break; - case 16: StoreOp = AArch64::LSFP128_STR; break; + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::CBZW: + Is64Bit = 0; + CC = AArch64CC::EQ; + break; + case AArch64::CBZX: + Is64Bit = 1; + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + Is64Bit = 0; + CC = AArch64CC::NE; + break; + case AArch64::CBNZX: + Is64Bit = 1; + CC = AArch64CC::NE; + break; + } + unsigned SrcReg = Cond[2].getReg(); + if (Is64Bit) { + // cmp reg, #0 is actually subs xzr, reg, #0. + MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } else { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } + break; + } + case 4: { // tbz/tbnz + // We must insert a tst instruction. + switch (Cond[1].getImm()) { default: - llvm_unreachable("Unknown size for regclass"); - } - } else { // For a super register class has more than one sub registers - if (AArch64::DPairRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x2_8B; - else if (AArch64::DTripleRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x3_8B; - else if (AArch64::DQuadRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x4_8B; - else if (AArch64::QPairRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x2_16B; - else if (AArch64::QTripleRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x3_16B; - else if (AArch64::QQuadRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x4_16B; + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::EQ; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::NE; + break; + } + // cmp reg, #foo is actually ands xzr, reg, #1<hasType(MVT::i64) || RC->hasType(MVT::i32)) { - switch(RC->getSize()) { - case 4: LoadOp = AArch64::LS32_LDR; break; - case 8: LoadOp = AArch64::LS64_LDR; break; - default: - llvm_unreachable("Unknown size for regclass"); - } - } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) { - LoadOp = AArch64::LSFP8_LDR; - } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) { - LoadOp = AArch64::LSFP16_LDR; - } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) || - RC->hasType(MVT::f128)) { - switch (RC->getSize()) { - case 4: LoadOp = AArch64::LSFP32_LDR; break; - case 8: LoadOp = AArch64::LSFP64_LDR; break; - case 16: LoadOp = AArch64::LSFP128_LDR; break; - default: - llvm_unreachable("Unknown size for regclass"); - } - } else { // For a super register class has more than one sub registers - if (AArch64::DPairRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x2_8B; - else if (AArch64::DTripleRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x3_8B; - else if (AArch64::DQuadRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x4_8B; - else if (AArch64::QPairRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x2_16B; - else if (AArch64::QTripleRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x3_16B; - else 
if (AArch64::QQuadRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x4_16B; - else - llvm_unreachable("Unknown reg class"); +bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::SBFMXri: // aka sxtw + case AArch64::UBFMXri: // aka uxtw + // Check for the 32 -> 64 bit extension case, these instructions can do + // much more. + if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) + return false; + // This is a signed or unsigned 32 -> 64 bit extension. + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = AArch64::sub_32; + return true; + } +} - MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg); - // Vector load has different operands from other load instructions. - NewMI.addFrameIndex(FrameIdx) - .addMemOperand(MMO); - return; +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. +/// Return true if the comparison instruction can be analyzed. +bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + // Replace SUBSWrr with SUBWrr if NZCV is not used. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + // ANDS does not use the same encoding scheme as the others xxxS + // instructions. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), + MI->getOpcode() == AArch64::ANDSWri ? 32 : 64); + return true; } - MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg); - NewMI.addFrameIndex(FrameIdx) - .addImm(0) - .addMemOperand(MMO); + return false; +} + +static bool UpdateOperandRegClass(MachineInstr *Instr) { + MachineBasicBlock *MBB = Instr->getParent(); + assert(MBB && "Can't get MachineBasicBlock here"); + MachineFunction *MF = MBB->getParent(); + assert(MF && "Can't get MachineFunction here"); + const TargetMachine *TM = &MF->getTarget(); + const TargetInstrInfo *TII = TM->getInstrInfo(); + const TargetRegisterInfo *TRI = TM->getRegisterInfo(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + ++OpIdx) { + MachineOperand &MO = Instr->getOperand(OpIdx); + const TargetRegisterClass *OpRegCstraints = + Instr->getRegClassConstraint(OpIdx, TII, TRI); + + // If there's no constraint, there's nothing to do. + if (!OpRegCstraints) + continue; + // If the operand is a frame index, there's nothing to do here. + // A frame index operand will resolve correctly during PEI. 
+ if (MO.isFI()) + continue; + + assert(MO.isReg() && + "Operand has register constraints without being a register!"); + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!OpRegCstraints->contains(Reg)) + return false; + } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && + !MRI->constrainRegClass(Reg, OpRegCstraints)) + return false; + } + + return true; } -unsigned AArch64InstrInfo::estimateRSStackLimit(MachineFunction &MF) const { - unsigned Limit = (1 << 16) - 1; - for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!I->getOperand(i).isFI()) continue; +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. +bool AArch64InstrInfo::optimizeCompareInstr( + MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + + // Replace SUBSWrr with SUBWrr if NZCV is not used. + int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); + if (Cmp_NZCV != -1) { + unsigned NewOpc; + switch (CmpInstr->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break; + case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break; + case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break; + case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break; + case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break; + case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break; + case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break; + case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break; + case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break; + case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break; + case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break; + case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break; + case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break; + case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break; + case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break; + case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break; + } - // When using ADDxxi_lsl0_s to get the address of a stack object, 0xfff - // is the largest offset guaranteed to fit in the immediate offset. - if (I->getOpcode() == AArch64::ADDxxi_lsl0_s) { - Limit = std::min(Limit, 0xfffu); - break; - } + const MCInstrDesc &MCID = get(NewOpc); + CmpInstr->setDesc(MCID); + CmpInstr->RemoveOperand(Cmp_NZCV); + bool succeeded = UpdateOperandRegClass(CmpInstr); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + return true; + } - int AccessScale, MinOffset, MaxOffset; - getAddressConstraints(*I, AccessScale, MinOffset, MaxOffset); - Limit = std::min(Limit, static_cast(MaxOffset)); + // Continue only if we have a "ri" where immediate is zero. + if (CmpValue != 0 || SrcReg2 != 0) + return false; - break; // At most one FI per instruction + // CmpInstr is a Compare instruction if destination register is not used. + if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + + // Get the unique definition of SrcReg. 
+ MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of the source register or done with the + // basic block, to check whether NZCV is used or modified in between. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) + return false; + + // Check whether the definition of SrcReg is in the same basic block as + // Compare. If not, we can't optimize away the Compare. + if (MI->getParent() != CmpInstr->getParent()) + return false; + + // Check that NZCV isn't set between the comparison instruction and the one we + // want to change. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + for (--I; I != E; --I) { + const MachineInstr &Instr = *I; + + if (Instr.modifiesRegister(AArch64::NZCV, TRI) || + Instr.readsRegister(AArch64::NZCV, TRI)) + // This instruction modifies or uses NZCV after the one we want to + // change. We can't do this transformation. + return false; + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + unsigned NewOpc = MI->getOpcode(); + switch (MI->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSWri: + case AArch64::SUBSXrr: + case AArch64::SUBSXri: + break; + case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; + case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; + case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; + case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; + case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; + case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; + case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; + case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; + case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; + case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; + case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; + case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; + case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; + case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; + } + + // Scan forward for the use of NZCV. + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if NZCV is redefined or killed. + // If we are done with the basic block, we need to check whether NZCV is + // live-out. + bool IsSafe = false; + for (MachineBasicBlock::iterator I = CmpInstr, + E = CmpInstr->getParent()->end(); + !IsSafe && ++I != E;) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; + ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { + IsSafe = true; + break; + } + if (!MO.isReg() || MO.getReg() != AArch64::NZCV) + continue; + if (MO.isDef()) { + IsSafe = true; + break; + } + + // Decode the condition code. 
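+  // (Editorial aside, not part of the original patch; values are only
+  // illustrative.) Worked example of why the V-reading condition codes are
+  // rejected further down: take w0 = 0x40000000 and suppose the defining
+  // "add w2, w0, w0" were converted to "adds" so the later "cmp w2, #0"
+  // could be removed.
+  //   adds w2, w0, w0  -> result 0x80000000, N=1 Z=0 C=0 V=1, so "ge" holds
+  //   cmp  w2, #0      -> N=1 Z=0 C=1 V=0,                    so "ge" fails
+  // N and Z are the same either way, but C and V can differ, so a consumer
+  // of GE/LT/GT/LE (or VS/VC) would observe different flags; hence the
+  // bail-out below for those condition codes.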
+ unsigned Opc = Instr.getOpcode(); + AArch64CC::CondCode CC; + switch (Opc) { + default: + return false; + case AArch64::Bcc: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); + break; + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); + break; + } + + // It is not safe to remove Compare instruction if Overflow(V) is used. + switch (CC) { + default: + // NZCV can be used multiple times, we should continue. + break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::GE: + case AArch64CC::LT: + case AArch64CC::GT: + case AArch64CC::LE: + return false; } } } - return Limit; + // If NZCV is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!IsSafe) { + MachineBasicBlock *ParentBlock = CmpInstr->getParent(); + for (auto *MBB : ParentBlock->successors()) + if (MBB->isLiveIn(AArch64::NZCV)) + return false; + } + + // Update the instruction to set NZCV. + MI->setDesc(get(NewOpc)); + CmpInstr->eraseFromParent(); + bool succeeded = UpdateOperandRegClass(MI); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + MI->addRegisterDefined(AArch64::NZCV, TRI); + return true; } -void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI, - int &AccessScale, int &MinOffset, - int &MaxOffset) const { - switch (MI.getOpcode()) { + +/// Return true if this is this instruction has a non-zero immediate +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { default: - llvm_unreachable("Unknown load/store kind"); - case TargetOpcode::DBG_VALUE: - AccessScale = 1; - MinOffset = INT_MIN; - MaxOffset = INT_MAX; - return; - case AArch64::LS8_LDR: case AArch64::LS8_STR: - case AArch64::LSFP8_LDR: case AArch64::LSFP8_STR: - case AArch64::LDRSBw: - case AArch64::LDRSBx: - AccessScale = 1; - MinOffset = 0; - MaxOffset = 0xfff; - return; - case AArch64::LS16_LDR: case AArch64::LS16_STR: - case AArch64::LSFP16_LDR: case AArch64::LSFP16_STR: - case AArch64::LDRSHw: - case AArch64::LDRSHx: - AccessScale = 2; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LS32_LDR: case AArch64::LS32_STR: - case AArch64::LSFP32_LDR: case AArch64::LSFP32_STR: - case AArch64::LDRSWx: - case AArch64::LDPSWx: - AccessScale = 4; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LS64_LDR: case AArch64::LS64_STR: - case AArch64::LSFP64_LDR: case AArch64::LSFP64_STR: - case AArch64::PRFM: - AccessScale = 8; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LSFP128_LDR: case AArch64::LSFP128_STR: - AccessScale = 16; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LSPair32_LDR: case AArch64::LSPair32_STR: - case AArch64::LSFPPair32_LDR: case AArch64::LSFPPair32_STR: - AccessScale = 4; - MinOffset = -0x40 * AccessScale; - MaxOffset = 0x3f * AccessScale; - return; - case AArch64::LSPair64_LDR: case AArch64::LSPair64_STR: - case AArch64::LSFPPair64_LDR: case AArch64::LSFPPair64_STR: - AccessScale = 8; - MinOffset = -0x40 * AccessScale; - MaxOffset = 0x3f * AccessScale; - return; - case AArch64::LSFPPair128_LDR: case AArch64::LSFPPair128_STR: - AccessScale = 16; - MinOffset = -0x40 * AccessScale; - MaxOffset = 
0x3f * AccessScale; - return; - case AArch64::LD1x2_8B: case AArch64::ST1x2_8B: - AccessScale = 16; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x3_8B: case AArch64::ST1x3_8B: - AccessScale = 24; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x4_8B: case AArch64::ST1x4_8B: - case AArch64::LD1x2_16B: case AArch64::ST1x2_16B: - AccessScale = 32; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x3_16B: case AArch64::ST1x3_16B: - AccessScale = 48; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x4_16B: case AArch64::ST1x4_16B: - AccessScale = 64; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; + break; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::CRC32Brr: + case AArch64::CRC32CBrr: + case AArch64::CRC32CHrr: + case AArch64::CRC32CWrr: + case AArch64::CRC32CXrr: + case AArch64::CRC32Hrr: + case AArch64::CRC32Wrr: + case AArch64::CRC32Xrr: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; } + return false; } -unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { - const MCInstrDesc &MCID = MI.getDesc(); - const MachineBasicBlock &MBB = *MI.getParent(); - const MachineFunction &MF = *MBB.getParent(); - const MCAsmInfo &MAI = *MF.getTarget().getMCAsmInfo(); +/// Return true if this is this instruction has a non-zero immediate +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } - if (MCID.getSize()) - return MCID.getSize(); + return false; +} - if (MI.getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI); +// Return true if this instruction simply sets its single destination register +// to zero. This is equivalent to a register rename of the zero-register. 
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::MOVZWi: + case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) + if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 3 && + MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + return true; + } + break; + case AArch64::ANDWri: // and Rd, Rzr, #imm + return MI->getOperand(1).getReg() == AArch64::WZR; + case AArch64::ANDXri: + return MI->getOperand(1).getReg() == AArch64::XZR; + case TargetOpcode::COPY: + return MI->getOperand(1).getReg() == AArch64::WZR; + } + return false; +} - switch (MI.getOpcode()) { - case TargetOpcode::BUNDLE: - return getInstBundleLength(MI); - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::GC_LABEL: - case TargetOpcode::DBG_VALUE: - case AArch64::TLSDESCCALL: - return 0; +// Return true if this instruction simply renames a general register without +// modifying bits. +bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { default: - llvm_unreachable("Unknown instruction class"); + break; + case TargetOpcode::COPY: { + // GPR32 copies will by lowered to ORRXrs + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::GPR32RegClass.contains(DstReg) || + AArch64::GPR64RegClass.contains(DstReg)); } + case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) + if (MI->getOperand(1).getReg() == AArch64::XZR) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + return true; + } + case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) + if (MI->getOperand(2).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + return true; + } + } + return false; } -unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { - unsigned Size = 0; - MachineBasicBlock::const_instr_iterator I = MI; - MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); - while (++I != E && I->isInsideBundle()) { - assert(!I->isBundle() && "No nested bundle!"); - Size += getInstSizeInBytes(*I); +// Return true if this instruction simply renames a general register without +// modifying bits. 
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case TargetOpcode::COPY: { + // FPR64 copies will by lowered to ORR.16b + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::FPR64RegClass.contains(DstReg) || + AArch64::FPR128RegClass.contains(DstReg)); } - return Size; + case AArch64::ORRv16i8: + if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + "invalid ORRv16i8 operands"); + return true; + } + } + return false; } -bool llvm::rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const AArch64InstrInfo &TII) { - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } - MFI.getObjectOffset(FrameRegIdx); - llvm_unreachable("Unimplemented rewriteFrameIndex"); + return 0; } -void llvm::emitRegUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned DstReg, unsigned SrcReg, unsigned ScratchReg, - int64_t NumBytes, MachineInstr::MIFlag MIFlags) { - if (NumBytes == 0 && DstReg == SrcReg) - return; - else if (abs64(NumBytes) & ~0xffffff) { - // Generically, we have to materialize the offset into a temporary register - // and subtract it. There are a couple of ways this could be done, for now - // we'll use a movz/movk or movn/movk sequence. - uint64_t Bits = static_cast(abs64(NumBytes)); - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVZxii), ScratchReg) - .addImm(0xffff & Bits).addImm(0) - .setMIFlags(MIFlags); - - Bits >>= 16; - if (Bits & 0xffff) { - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg) - .addReg(ScratchReg) - .addImm(0xffff & Bits).addImm(1) - .setMIFlags(MIFlags); - } - - Bits >>= 16; - if (Bits & 0xffff) { - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg) - .addReg(ScratchReg) - .addImm(0xffff & Bits).addImm(2) - .setMIFlags(MIFlags); - } - - Bits >>= 16; - if (Bits & 0xffff) { - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg) - .addReg(ScratchReg) - .addImm(0xffff & Bits).addImm(3) - .setMIFlags(MIFlags); - } - - // ADD DST, SRC, xTMP (, lsl #0) - unsigned AddOp = NumBytes > 0 ? 
AArch64::ADDxxx_uxtx : AArch64::SUBxxx_uxtx; - BuildMI(MBB, MBBI, dl, TII.get(AddOp), DstReg) - .addReg(SrcReg, RegState::Kill) - .addReg(ScratchReg, RegState::Kill) - .addImm(0) - .setMIFlag(MIFlags); - return; +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; } + return 0; +} - // Now we know that the adjustment can be done in at most two add/sub - // (immediate) instructions, which is always more efficient than a - // literal-pool load, or even a hypothetical movz/movk/add sequence +/// Return true if this is load/store scales or extends its register offset. +/// This refers to scaling a dynamic index as opposed to scaled immediates. +/// MI should be a memory op that allows scaled addressing. +bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRBBroW: + case AArch64::LDRBroW: + case AArch64::LDRDroW: + case AArch64::LDRHHroW: + case AArch64::LDRHroW: + case AArch64::LDRQroW: + case AArch64::LDRSBWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + case AArch64::LDRSroW: + case AArch64::LDRWroW: + case AArch64::LDRXroW: + case AArch64::STRBBroW: + case AArch64::STRBroW: + case AArch64::STRDroW: + case AArch64::STRHHroW: + case AArch64::STRHroW: + case AArch64::STRQroW: + case AArch64::STRSroW: + case AArch64::STRWroW: + case AArch64::STRXroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroX: + case AArch64::LDRDroX: + case AArch64::LDRHHroX: + case AArch64::LDRHroX: + case AArch64::LDRQroX: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSroX: + case AArch64::LDRWroX: + case AArch64::LDRXroX: + case AArch64::STRBBroX: + case AArch64::STRBroX: + case AArch64::STRDroX: + case AArch64::STRHHroX: + case AArch64::STRHroX: + case AArch64::STRQroX: + case AArch64::STRSroX: + case AArch64::STRWroX: + case AArch64::STRXroX: + + unsigned Val = MI->getOperand(3).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); + return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); + } + return false; +} - // Decide whether we're doing addition or subtraction - unsigned LowOp, HighOp; - if (NumBytes >= 0) { - LowOp = AArch64::ADDxxi_lsl0_s; - HighOp = AArch64::ADDxxi_lsl12_s; - } else { - LowOp = AArch64::SUBxxi_lsl0_s; - HighOp = AArch64::SUBxxi_lsl12_s; - NumBytes = abs64(NumBytes); +/// Check all MachineMemOperands for a hint to suppress pairing. +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + for (auto *MM : MI->memoperands()) { + if (MM->getFlags() & + (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { + return true; + } } + return false; +} + +/// Set a flag on the first MachineMemOperand to suppress pairing. 
+void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { + if (MI->memoperands_empty()) + return; - // If we're here, at the very least a move needs to be produced, which just - // happens to be materializable by an ADD. - if ((NumBytes & 0xfff) || NumBytes == 0) { - BuildMI(MBB, MBBI, dl, TII.get(LowOp), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(NumBytes & 0xfff) - .setMIFlag(MIFlags); + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + (*MI->memoperands_begin()) + ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +} - // Next update should use the register we've just defined. - SrcReg = DstReg; - } +bool +AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + BaseReg = LdSt->getOperand(1).getReg(); + MachineFunction &MF = *LdSt->getParent()->getParent(); + unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); + Offset = LdSt->getOperand(2).getImm() * Width; + return true; + }; +} - if (NumBytes & 0xfff000) { - BuildMI(MBB, MBBI, dl, TII.get(HighOp), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(NumBytes >> 12) - .setMIFlag(MIFlags); +/// Detect opportunities for ldp/stp formation. +/// +/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. +bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + // Only cluster up to a single pair. + if (NumLoads > 1) + return false; + if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + return false; + // getLdStBaseRegImmOfs guarantees that oper 2 isImm. + unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); + // Allow 6 bits of positive range. + if (Ofs1 > 64) + return false; + // The caller should already have ordered First/SecondLdSt by offset. + unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); + return Ofs1 + 1 == Ofs2; +} + +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const { + // Cyclone can fuse CMN, CMP followed by Bcc. + + // FIXME: B0 can also fuse: + // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. 
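+  // For now, only a flag-setting ADD/SUB/AND immediate followed by Bcc is
+  // treated as fusable.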
+  if (Second->getOpcode() != AArch64::Bcc)
+    return false;
+  switch (First->getOpcode()) {
+  default:
+    return false;
+  case AArch64::SUBSWri:
+  case AArch64::ADDSWri:
+  case AArch64::ANDSWri:
+  case AArch64::SUBSXri:
+  case AArch64::ADDSXri:
+  case AArch64::ANDSXri:
+    return true;
   }
 }
-void llvm::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-                        DebugLoc dl, const TargetInstrInfo &TII,
-                        unsigned ScratchReg, int64_t NumBytes,
-                        MachineInstr::MIFlag MIFlags) {
-  emitRegUpdate(MBB, MI, dl, TII, AArch64::XSP, AArch64::XSP, AArch64::X16,
-                NumBytes, MIFlags);
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                                         int FrameIx,
+                                                         uint64_t Offset,
+                                                         const MDNode *MDPtr,
+                                                         DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
+                                .addFrameIndex(FrameIx)
+                                .addImm(0)
+                                .addImm(Offset)
+                                .addMetadata(MDPtr);
+  return &*MIB;
 }
+static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg, unsigned SubIdx,
+                                            unsigned State,
+                                            const TargetRegisterInfo *TRI) {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
-namespace {
-  struct LDTLSCleanup : public MachineFunctionPass {
-    static char ID;
-    LDTLSCleanup() : MachineFunctionPass(ID) {}
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
-    bool runOnMachineFunction(MachineFunction &MF) override {
-      AArch64MachineFunctionInfo* MFI
-        = MF.getInfo<AArch64MachineFunctionInfo>();
-      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
-        // No point folding accesses if there isn't at least two.
-        return false;
-      }
+static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+                                        unsigned NumRegs) {
+  // We really want the positive remainder mod 32 here, that happens to be
+  // easily obtainable with a mask.
+  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
+}
-      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
-      return VisitNode(DT->getRootNode(), 0);
-    }
-
-    // Visit the dominator subtree rooted at Node in pre-order.
-    // If TLSBaseAddrReg is non-null, then use that to replace any
-    // TLS_base_addr instructions. Otherwise, create the register
-    // when the first such instruction is seen, and then use it
-    // as we encounter more instructions.
-    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
-      MachineBasicBlock *BB = Node->getBlock();
-      bool Changed = false;
-
-      // Traverse the current block.
-      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
-           ++I) {
-        switch (I->getOpcode()) {
-        case AArch64::TLSDESC_BLRx:
-          // Make sure it's a local dynamic access.
- if (!I->getOperand(1).isSymbol() || - strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) - break; - - if (TLSBaseAddrReg) - I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg); - else - I = SetRegister(I, &TLSBaseAddrReg); - Changed = true; - break; - default: - break; - } - } +void AArch64InstrInfo::copyPhysRegTuple( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, + llvm::ArrayRef Indices) const { + assert(Subtarget.hasNEON() && + "Unexpected register copy without NEON"); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + uint16_t DestEncoding = TRI->getEncodingValue(DestReg); + uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); + unsigned NumRegs = Indices.size(); + + int SubReg = 0, End = NumRegs, Incr = 1; + if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { + SubReg = NumRegs - 1; + End = -1; + Incr = -1; + } + + for (; SubReg != End; SubReg += Incr) { + const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); + } +} - // Visit the children of this block in the dominator tree. - for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end(); - I != E; ++I) { - Changed |= VisitNode(*I, TLSBaseAddrReg); +void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AArch64::GPR32spRegClass.contains(DestReg) && + (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { + // If either operand is WSP, expand to ADD #0. + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) + .addReg(SrcRegX, RegState::Undef) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. 
This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) + .addReg(AArch64::XZR) + .addReg(SrcRegX, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + // Otherwise, expand to ORR WZR. + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(SrcReg, getKillRegState(KillSrc)); } + } + return; + } - return Changed; + if (AArch64::GPR64spRegClass.contains(DestReg) && + (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { + // If either operand is SP, expand to ADD #0. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // Otherwise, expand to ORR XZR. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(SrcReg, getKillRegState(KillSrc)); } + return; + } - // Replace the TLS_base_addr instruction I with a copy from - // TLSBaseAddrReg, returning the new instruction. - MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, - unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const AArch64TargetMachine *TM = - static_cast(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getInstrInfo(); + // Copy a DDDD register quad by copying the individual sub-registers. + if (AArch64::DDDDRegClass.contains(DestReg) && + AArch64::DDDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } - // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the - // code sequence assumes the address will be. - MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - AArch64::X0) - .addReg(TLSBaseAddrReg); + // Copy a DDD register triple by copying the individual sub-registers. + if (AArch64::DDDRegClass.contains(DestReg) && + AArch64::DDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } - // Erase the TLS_base_addr instruction. - I->eraseFromParent(); + // Copy a DD register pair by copying the individual sub-registers. + if (AArch64::DDRegClass.contains(DestReg) && + AArch64::DDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a QQQQ register quad by copying the individual sub-registers. 
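+  // Each 128-bit qsub is copied with ORRv16i8; copyPhysRegTuple orders the
+  // copies so that overlapping source registers are not clobbered early.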
+ if (AArch64::QQQQRegClass.contains(DestReg) && + AArch64::QQQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQQ register triple by copying the individual sub-registers. + if (AArch64::QQQRegClass.contains(DestReg) && + AArch64::QQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQ register pair by copying the individual sub-registers. + if (AArch64::QQRegClass.contains(DestReg) && + AArch64::QQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + if (AArch64::FPR128RegClass.contains(DestReg) && + AArch64::FPR128RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::STRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::SP) + .addImm(-16); + BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(DestReg, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + } + return; + } + + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR16RegClass.contains(DestReg) && + AArch64::FPR16RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR8RegClass.contains(DestReg) && + AArch64::FPR8RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = 
RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copies between GPR64 and FPR64. + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::GPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // Copies between GPR32 and FPR32. + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::GPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + if (DestReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); + BuildMI(MBB, I, DL, get(AArch64::MSR)) + .addImm(AArch64SysReg::NZCV) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); + return; + } + + if (SrcReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); + BuildMI(MBB, I, DL, get(AArch64::MRS)) + .addReg(DestReg) + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); + return; + } + + llvm_unreachable("unimplemented reg-to-reg copy"); +} - return Copy; +void AArch64InstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRWui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + else + assert(SrcReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRXui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, 
&AArch64::GPR64RegClass); + else + assert(SrcReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev2d, Offset = false; } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); - // Create a virtal register in *TLSBaseAddrReg, and populate it by - // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const AArch64TargetMachine *TM = - static_cast(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getInstrInfo(); + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); - // Create a virtual register for the TLS base address. - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} - // Insert a copy from X0 to TLSBaseAddrReg for later. 
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg) - .addReg(AArch64::X0); +void AArch64InstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - return Copy; + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRWui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); + else + assert(DestReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRXui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); + else + assert(DestReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev1d, Offset = false; } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, int Offset, + const TargetInstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV) { + if (DestReg == SrcReg && Offset == 0) + return; - const char *getPassName() const override { - return "Local Dynamic TLS Access 
Clean-up"; + bool isSub = Offset < 0; + if (isSub) + Offset = -Offset; + + // FIXME: If the offset won't fit in 24-bits, compute the offset into a + // scratch register. If DestReg is a virtual register, use it as the + // scratch register; otherwise, create a new virtual register (to be + // replaced by the scavenger at the end of PEI). That case can be optimized + // slightly if DestReg is SP which is always 16-byte aligned, so the scratch + // register can be loaded with offset%8 and the add/sub can use an extending + // instruction with LSL#3. + // Currently the function handles any offsets but generates a poor sequence + // of code. + // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); + + unsigned Opc; + if (SetNZCV) + Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; + else + Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; + const unsigned MaxEncoding = 0xfff; + const unsigned ShiftSize = 12; + const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + while (((unsigned)Offset) >= (1 << ShiftSize)) { + unsigned ThisVal; + if (((unsigned)Offset) > MaxEncodableValue) { + ThisVal = MaxEncodableValue; + } else { + ThisVal = Offset & MaxEncodableValue; } + assert((ThisVal >> ShiftSize) <= MaxEncoding && + "Encoding cannot handle value that big"); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(ThisVal >> ShiftSize) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) + .setMIFlag(Flag); - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); + SrcReg = DestReg; + Offset -= ThisVal; + if (Offset == 0) + return; + } + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Offset) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlag(Flag); +} + +MachineInstr * +AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const { + // This is a bit of a hack. Consider this instruction: + // + // %vreg0 = COPY %SP; GPR64all:%vreg0 + // + // We explicitly chose GPR64all for the virtual register so such a copy might + // be eliminated by RegisterCoalescer. However, that may not be possible, and + // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all + // register class, TargetInstrInfo::foldMemoryOperand() is going to try. + // + // To prevent that, we are going to constrain the %vreg0 register class here. + // + // + // + if (MI->isCopy()) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (SrcReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(DstReg)) { + MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); + return nullptr; } - }; + if (DstReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + return nullptr; + } + } + + // Cannot fold. + return nullptr; +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp, + unsigned *OutUnscaledOp, + int *EmittableOffset) { + int Scale = 1; + bool IsSigned = false; + // The ImmIdx should be changed case by case if it is not 2. + unsigned ImmIdx = 2; + unsigned UnscaledOp = 0; + // Set output values in case of early exit. 
+ if (EmittableOffset) + *EmittableOffset = 0; + if (OutUseUnscaledOp) + *OutUseUnscaledOp = false; + if (OutUnscaledOp) + *OutUnscaledOp = 0; + switch (MI.getOpcode()) { + default: + llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex"); + // Vector spills/fills can't take an immediate offset. + case AArch64::LD1Twov2d: + case AArch64::LD1Threev2d: + case AArch64::LD1Fourv2d: + case AArch64::LD1Twov1d: + case AArch64::LD1Threev1d: + case AArch64::LD1Fourv1d: + case AArch64::ST1Twov2d: + case AArch64::ST1Threev2d: + case AArch64::ST1Fourv2d: + case AArch64::ST1Twov1d: + case AArch64::ST1Threev1d: + case AArch64::ST1Fourv1d: + return AArch64FrameOffsetCannotUpdate; + case AArch64::PRFMui: + Scale = 8; + UnscaledOp = AArch64::PRFUMi; + break; + case AArch64::LDRXui: + Scale = 8; + UnscaledOp = AArch64::LDURXi; + break; + case AArch64::LDRWui: + Scale = 4; + UnscaledOp = AArch64::LDURWi; + break; + case AArch64::LDRBui: + Scale = 1; + UnscaledOp = AArch64::LDURBi; + break; + case AArch64::LDRHui: + Scale = 2; + UnscaledOp = AArch64::LDURHi; + break; + case AArch64::LDRSui: + Scale = 4; + UnscaledOp = AArch64::LDURSi; + break; + case AArch64::LDRDui: + Scale = 8; + UnscaledOp = AArch64::LDURDi; + break; + case AArch64::LDRQui: + Scale = 16; + UnscaledOp = AArch64::LDURQi; + break; + case AArch64::LDRBBui: + Scale = 1; + UnscaledOp = AArch64::LDURBBi; + break; + case AArch64::LDRHHui: + Scale = 2; + UnscaledOp = AArch64::LDURHHi; + break; + case AArch64::LDRSBXui: + Scale = 1; + UnscaledOp = AArch64::LDURSBXi; + break; + case AArch64::LDRSBWui: + Scale = 1; + UnscaledOp = AArch64::LDURSBWi; + break; + case AArch64::LDRSHXui: + Scale = 2; + UnscaledOp = AArch64::LDURSHXi; + break; + case AArch64::LDRSHWui: + Scale = 2; + UnscaledOp = AArch64::LDURSHWi; + break; + case AArch64::LDRSWui: + Scale = 4; + UnscaledOp = AArch64::LDURSWi; + break; + + case AArch64::STRXui: + Scale = 8; + UnscaledOp = AArch64::STURXi; + break; + case AArch64::STRWui: + Scale = 4; + UnscaledOp = AArch64::STURWi; + break; + case AArch64::STRBui: + Scale = 1; + UnscaledOp = AArch64::STURBi; + break; + case AArch64::STRHui: + Scale = 2; + UnscaledOp = AArch64::STURHi; + break; + case AArch64::STRSui: + Scale = 4; + UnscaledOp = AArch64::STURSi; + break; + case AArch64::STRDui: + Scale = 8; + UnscaledOp = AArch64::STURDi; + break; + case AArch64::STRQui: + Scale = 16; + UnscaledOp = AArch64::STURQi; + break; + case AArch64::STRBBui: + Scale = 1; + UnscaledOp = AArch64::STURBBi; + break; + case AArch64::STRHHui: + Scale = 2; + UnscaledOp = AArch64::STURHHi; + break; + + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + IsSigned = true; + Scale = 8; + break; + case AArch64::LDPQi: + case AArch64::STPQi: + IsSigned = true; + Scale = 16; + break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::STPWi: + case AArch64::STPSi: + IsSigned = true; + Scale = 4; + break; + + case AArch64::LDURXi: + case AArch64::LDURWi: + case AArch64::LDURBi: + case AArch64::LDURHi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::STURBi: + case AArch64::STURHi: + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + Scale = 1; + break; + } + + Offset += 
MI.getOperand(ImmIdx).getImm() * Scale; + + bool useUnscaledOp = false; + // If the offset doesn't match the scale, we rewrite the instruction to + // use the unscaled instruction instead. Likewise, if we have a negative + // offset (and have an unscaled op to use). + if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) + useUnscaledOp = true; + + // Use an unscaled addressing mode if the instruction has a negative offset + // (or if the instruction is already using an unscaled addressing mode). + unsigned MaskBits; + if (IsSigned) { + // ldp/stp instructions. + MaskBits = 7; + Offset /= Scale; + } else if (UnscaledOp == 0 || useUnscaledOp) { + MaskBits = 9; + IsSigned = true; + Scale = 1; + } else { + MaskBits = 12; + IsSigned = false; + Offset /= Scale; + } + + // Attempt to fold address computation. + int MaxOff = (1 << (MaskBits - IsSigned)) - 1; + int MinOff = (IsSigned ? (-MaxOff - 1) : 0); + if (Offset >= MinOff && Offset <= MaxOff) { + if (EmittableOffset) + *EmittableOffset = Offset; + Offset = 0; + } else { + int NewOff = Offset < 0 ? MinOff : MaxOff; + if (EmittableOffset) + *EmittableOffset = NewOff; + Offset = (Offset - NewOff) * Scale; + } + if (OutUseUnscaledOp) + *OutUseUnscaledOp = useUnscaledOp; + if (OutUnscaledOp) + *OutUnscaledOp = UnscaledOp; + return AArch64FrameOffsetCanUpdate | + (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); } -char LDTLSCleanup::ID = 0; -FunctionPass* -llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } +bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII) { + unsigned Opcode = MI.getOpcode(); + unsigned ImmIdx = FrameRegIdx + 1; + + if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { + Offset += MI.getOperand(ImmIdx).getImm(); + emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), + MI.getOperand(0).getReg(), FrameReg, Offset, TII, + MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); + MI.eraseFromParent(); + Offset = 0; + return true; + } + + int NewOffset; + unsigned UnscaledOp; + bool UseUnscaledOp; + int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, + &UnscaledOp, &NewOffset); + if (Status & AArch64FrameOffsetCanUpdate) { + if (Status & AArch64FrameOffsetIsLegal) + // Replace the FrameIndex with FrameReg. + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (UseUnscaledOp) + MI.setDesc(TII->get(UnscaledOp)); + + MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); + return Offset == 0; + } + + return false; +} + +void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(AArch64::HINT); + NopInst.addOperand(MCOperand::CreateImm(0)); +} diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 10d5185ab630..f70b82b7c2da 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -11,9 +11,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64INSTRINFO_H -#define LLVM_TARGET_AARCH64INSTRINFO_H +#ifndef LLVM_TARGET_AArch64INSTRINFO_H +#define LLVM_TARGET_AArch64INSTRINFO_H +#include "AArch64.h" #include "AArch64RegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -23,90 +24,206 @@ namespace llvm { class AArch64Subtarget; +class AArch64TargetMachine; class AArch64InstrInfo : public AArch64GenInstrInfo { + // Reserve bits in the MachineMemOperand target hint flags, starting at 1. 
+ // They will be shifted into MOTargetHintStart when accessed. + enum TargetMemOperandFlags { + MOSuppressPair = 1 + }; + const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; + public: - explicit AArch64InstrInfo(const AArch64Subtarget &TM); + explicit AArch64InstrInfo(const AArch64Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). - /// - const TargetRegisterInfo &getRegisterInfo() const { return RI; } + const AArch64RegisterInfo &getRegisterInfo() const { return RI; } + + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + /// Returns true if there is a shiftable register and that the shift value + /// is non-zero. + bool hasShiftedReg(const MachineInstr *MI) const; + + /// Returns true if there is an extendable register and that the extending + /// value is non-zero. + bool hasExtendedReg(const MachineInstr *MI) const; + + /// \brief Does this instruction set its full destination register to zero? + bool isGPRZero(const MachineInstr *MI) const; + + /// \brief Does this instruction rename a GPR without modifying bits? + bool isGPRCopy(const MachineInstr *MI) const; + + /// \brief Does this instruction rename an FPR without modifying bits? + bool isFPRCopy(const MachineInstr *MI) const; + + /// Return true if this is load/store scales or extends its register offset. + /// This refers to scaling a dynamic index as opposed to scaled immediates. + /// MI should be a memory op that allows scaled addressing. + bool isScaledAddr(const MachineInstr *MI) const; + + /// Return true if pairing the given load or store is hinted to be + /// unprofitable. + bool isLdStPairSuppressed(const MachineInstr *MI) const; + + /// Hint that pairing the given load or store is unprofitable. 
+ void suppressLdStPair(MachineInstr *MI) const; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + + bool enableClusterLoads() const override { return true; } - const AArch64Subtarget &getSubTarget() const { return Subtarget; } + bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, + unsigned NumLoads) const override; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, + bool shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const override; + + MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, + uint64_t Offset, const MDNode *MDPtr, + DebugLoc DL) const; + void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc, unsigned Opcode, + llvm::ArrayRef Indices) const; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; - void CopyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg) const; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, + MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, + MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const override; + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - /// Look through the instructions in this function and work out the largest - /// the stack frame can be while maintaining the ability to address local - /// slots with no complexities. - unsigned estimateRSStackLimit(MachineFunction &MF) const; - - /// getAddressConstraints - For loads and stores (and PRFMs) taking an - /// immediate offset, this function determines the constraints required for - /// the immediate. 
It must satisfy: - /// + MinOffset <= imm <= MaxOffset - /// + imm % OffsetScale == 0 - void getAddressConstraints(const MachineInstr &MI, int &AccessScale, - int &MinOffset, int &MaxOffset) const; - - - unsigned getInstSizeInBytes(const MachineInstr &MI) const; - - unsigned getInstBundleLength(const MachineInstr &MI) const; - + bool canInsertSelect(const MachineBasicBlock &, + const SmallVectorImpl &Cond, unsigned, + unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void getNoopForMachoTarget(MCInst &NopInst) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. + /// Return true if the comparison instruction can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + /// optimizeCompareInstr - Convert the instruction supplying the argument to + /// the comparison into one that sets the zero bit in the flags register. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + +private: + void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const; }; -bool rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const AArch64InstrInfo &TII); - +/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg +/// plus Offset. This is intended to be used from within the prolog/epilog +/// insertion (PEI) pass, where a virtual scratch register may be allocated +/// if necessary, to be replaced by the scavenger at the end of PEI. +void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, + const TargetInstrInfo *TII, + MachineInstr::MIFlag = MachineInstr::NoFlags, + bool SetNZCV = false); + +/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the +/// FP. Return false if the offset could not be handled directly in MI, and +/// return the left-over portion by reference. +bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII); + +/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal. +enum AArch64FrameOffsetStatus { + AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. + AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal. + AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. +}; -void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned DstReg, unsigned SrcReg, unsigned ScratchReg, - int64_t NumBytes, - MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags); +/// \brief Check if the @p Offset is a valid frame offset for @p MI. +/// The returned value reports the validity of the frame offset for @p MI. +/// It uses the values defined by AArch64FrameOffsetStatus for that. +/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to +/// use an offset.eq +/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be +/// rewriten in @p MI. 
+/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the +/// amount that is off the limit of the legal offset. +/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be +/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. +/// If set, @p EmittableOffset contains the amount that can be set in @p MI +/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that +/// is a legal offset. +int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp = nullptr, + unsigned *OutUnscaledOp = nullptr, + int *EmittableOffset = nullptr); + +static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } + +static inline bool isCondBranchOpcode(int Opc) { + switch (Opc) { + case AArch64::Bcc: + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + return true; + default: + return false; + } +} -void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned ScratchReg, int64_t NumBytes, - MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags); +static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } -} +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 4d3c80152c30..b1e8fa64c2da 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -1,4 +1,4 @@ -//===----- AArch64InstrInfo.td - AArch64 Instruction Info ----*- tablegen -*-=// +//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the AArch64 scalar instructions in TableGen format. +// AArch64 Instruction definitions. // //===----------------------------------------------------------------------===// @@ -19,5370 +19,5270 @@ def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON", "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto","crypto">; - -// Use fused MAC if more precision in FP computation is allowed. -def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast)">; -include "AArch64InstrFormats.td" + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; +def IsLE : Predicate<"Subtarget->isLittleEndian()">; +def IsBE : Predicate<"!Subtarget->isLittleEndian()">; //===----------------------------------------------------------------------===// -// AArch64 specific pattern fragments. +// AArch64-specific DAG Nodes. // -// An 'fmul' node with a single use. 
-def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{ - return N->hasOneUse(); -}]>; - - -//===----------------------------------------------------------------------===// -// Target-specific ISD nodes and profiles -//===----------------------------------------------------------------------===// - -def SDT_A64ret : SDTypeProfile<0, 0, []>; -def A64ret : SDNode<"AArch64ISD::Ret", SDT_A64ret, [SDNPHasChain, - SDNPOptInGlue, - SDNPVariadic]>; - -// (ins NZCV, Condition, Dest) -def SDT_A64br_cc : SDTypeProfile<0, 3, [SDTCisVT<0, i32>]>; -def A64br_cc : SDNode<"AArch64ISD::BR_CC", SDT_A64br_cc, [SDNPHasChain]>; - -// (outs Result), (ins NZCV, IfTrue, IfFalse, Condition) -def SDT_A64select_cc : SDTypeProfile<1, 4, [SDTCisVT<1, i32>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<2, 3>]>; -def A64select_cc : SDNode<"AArch64ISD::SELECT_CC", SDT_A64select_cc>; - -// (outs NZCV), (ins LHS, RHS, Condition) -def SDT_A64setcc : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, - SDTCisSameAs<1, 2>]>; -def A64setcc : SDNode<"AArch64ISD::SETCC", SDT_A64setcc>; - - -// (outs GPR64), (ins) -def A64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; - -// A64 compares don't care about the cond really (they set all flags) so a -// simple binary operator is useful. -def A64cmp : PatFrag<(ops node:$lhs, node:$rhs), - (A64setcc node:$lhs, node:$rhs, cond)>; - - -// When matching a notional (CMP op1, (sub 0, op2)), we'd like to use a CMN -// instruction on the grounds that "op1 - (-op2) == op1 + op2". However, the C -// and V flags can be set differently by this operation. It comes down to -// whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are -// then everything is fine. If not then the optimization is wrong. Thus general -// comparisons are only valid if op2 != 0. - -// So, finally, the only LLVM-native comparisons that don't mention C and V are -// SETEQ and SETNE. They're the only ones we can safely use CMN for in the -// absence of information about op2. -def equality_cond : PatLeaf<(cond), [{ - return N->get() == ISD::SETEQ || N->get() == ISD::SETNE; -}]>; - -def A64cmn : PatFrag<(ops node:$lhs, node:$rhs), - (A64setcc node:$lhs, (sub 0, node:$rhs), equality_cond)>; - -// There are two layers of indirection here, driven by the following -// considerations. -// + TableGen does not know CodeModel or Reloc so that decision should be -// made for a variable/address at ISelLowering. 
-// + The output of ISelLowering should be selectable (hence the Wrapper, -// rather than a bare target opcode) -def SDTAArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisSameAs<0, 4>, - SDTCisPtrTy<0>]>; - -def A64WrapperLarge :SDNode<"AArch64ISD::WrapperLarge", SDTAArch64WrapperLarge>; - -def SDTAArch64WrapperSmall : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, - SDTCisVT<3, i32>, - SDTCisPtrTy<0>]>; - -def A64WrapperSmall :SDNode<"AArch64ISD::WrapperSmall", SDTAArch64WrapperSmall>; - - -def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; -def A64GOTLoad : SDNode<"AArch64ISD::GOTLoad", SDTAArch64GOTLoad, - [SDNPHasChain]>; +// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS +def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<0>, + SDTCisVT<3, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; + +def SDT_AArch64Brcond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; +def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, OtherVT>]>; + + +def SDT_AArch64CSel : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; +def SDT_AArch64FCmp : SDTypeProfile<0, 2, + [SDTCisFP<0>, + SDTCisSameAs<0, 1>]>; +def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; +def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; +def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; +def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; + +def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; +def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; +def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; +def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>; +def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; +def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; + +def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; + +def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>]>; +def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, + [SDTCisVT<0, i64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, + SDTCisSameAs<1, 4>]>; + + +// Node 
definitions. +def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; +def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; +def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; +def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + [SDNPHasChain, SDNPOutGlue]>; +def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64call : SDNode<"AArch64ISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, + [SDNPHasChain]>; +def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, + [SDNPHasChain]>; +def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, + [SDNPHasChain]>; + + +def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; +def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>; +def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>; +def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>; +def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >; +def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>; +def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>; +def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; +def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; + +def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; + +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; + +def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; +def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; + +def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; +def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; +def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; +def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; +def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; + +def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; +def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; +def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; +def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>; +def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>; +def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>; + +def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>; +def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>; +def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>; +def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>; +def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; +def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; +def AArch64fmov : SDNode<"AArch64ISD::FMOV", 
SDT_AArch64MOVIedit>; + +def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; + +def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; +def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; +def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; +def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; +def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; +def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; +def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; +def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; + +def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; +def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; +def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; + +def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; +def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; +def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; +def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; +def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; + +def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; +def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; +def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; + +def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; +def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; +def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; +def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; +def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; +def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), + (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + +def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; +def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; +def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; +def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; +def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; + +def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; +def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; + +def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; + +def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, + [SDNPHasChain, SDNPSideEffect]>; + +def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; +def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; + +def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", + SDT_AArch64TLSDescCall, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + +def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", + SDT_AArch64WrapperLarge>; -// (A64BFI LHS, RHS, LSB, Width) -def SDTA64BFI : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, - SDTCisVT<3, i64>, - SDTCisVT<4, i64>]>; -def A64Bfi : SDNode<"AArch64ISD::BFI", SDTA64BFI>; +//===----------------------------------------------------------------------===// -// (A64EXTR HiReg, LoReg, LSB) -def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, 
SDTCisSameAs<1, 2>, - SDTCisVT<3, i64>]>; -def A64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; +//===----------------------------------------------------------------------===// -// (A64[SU]BFX Field, ImmR, ImmS). +// AArch64 Instruction Predicate Definitions. // -// Note that ImmR and ImmS are already encoded for the actual instructions. The -// more natural LSB and Width mix together to form ImmR and ImmS, something -// which TableGen can't handle. -def SDTA64BFX : SDTypeProfile<1, 3, [SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; -def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>; +def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; +def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; +def ForCodeSize : Predicate<"ForCodeSize">; +def NotForCodeSize : Predicate<"!ForCodeSize">; -def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>; +include "AArch64InstrFormats.td" -class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Call sequence pseudo-instructions +// Miscellaneous instructions. //===----------------------------------------------------------------------===// +let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), + [(AArch64callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +let isReMaterializable = 1, isCodeGenOnly = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, they can be +// removed, along with the AArch64Wrapper node. + +let AddedComplexity = 10 in +def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), + [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, + Sched<[WriteLDAdr]>; + +// The MOVaddr instruction should match only when the add is not folded +// into a load or store address. 
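As an aside on the MOVaddr pseudo defined just below: it eventually expands to an ADRP plus an ADD of the :lo12: part, which splits the target into a 4 KiB page base and a low 12-bit offset. A minimal C++ sketch of that split, assuming a plain 64-bit address rather than a relocated symbol (the helper name is invented for illustration):

#include <cstdint>

// ADRP reaches the 4 KiB page holding the target; the following ADD supplies
// the low 12 bits, so the pair rebuilds the full address.
void splitPageAndLo12(uint64_t Addr, uint64_t &PageBase, uint64_t &Lo12) {
  PageBase = Addr & ~UINT64_C(0xfff); // page-aligned part (ADRP)
  Lo12     = Addr &  UINT64_C(0xfff); // low 12 bits (ADD :lo12:)
}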
+def MOVaddr + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), + tglobaladdr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrJT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), + tjumptable:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrCP + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), + tconstpool:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrBA + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), + tblockaddress:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrTLS + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), + tglobaltlsaddr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrEXT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), + texternalsym:$low))]>, + Sched<[WriteAdrAdr]>; + +} // isReMaterializable, isCodeGenOnly + +def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), + (LOADgot tglobaltlsaddr:$addr)>; + +def : Pat<(AArch64LOADgot texternalsym:$addr), + (LOADgot texternalsym:$addr)>; + +def : Pat<(AArch64LOADgot tconstpool:$addr), + (LOADgot tconstpool:$addr)>; -def SDT_AArch64Call : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; -def AArch64Call : SDNode<"AArch64ISD::Call", SDT_AArch64Call, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; - -def AArch64tcret : SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64Call, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - -// The TLSDESCCALL node is a variant call which goes to an indirectly calculated -// destination but needs a relocation against a fixed symbol. As such it has two -// certain operands: the callee and the relocated variable. -// -// The TLS ABI only allows it to be selected to a BLR instructin (with -// appropriate relocation). -def SDTTLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; - -def A64tlsdesc_blr : SDNode<"AArch64ISD::TLSDESCCALL", SDTTLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; +//===----------------------------------------------------------------------===// +// System instructions. +//===----------------------------------------------------------------------===// +def HINT : HintI<"hint">; +def : InstAlias<"nop", (HINT 0b000)>; +def : InstAlias<"yield",(HINT 0b001)>; +def : InstAlias<"wfe", (HINT 0b010)>; +def : InstAlias<"wfi", (HINT 0b011)>; +def : InstAlias<"sev", (HINT 0b100)>; +def : InstAlias<"sevl", (HINT 0b101)>; -def SDT_AArch64CallSeqStart : SDCallSeqStart<[ SDTCisPtrTy<0> ]>; -def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AArch64CallSeqStart, - [SDNPHasChain, SDNPOutGlue]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. 
+let mayLoad = 1, mayStore = 1 in +def CLREX : CRmSystemI; -def SDT_AArch64CallSeqEnd : SDCallSeqEnd<[ SDTCisPtrTy<0>, SDTCisPtrTy<1> ]>; -def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AArch64CallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def DMB : CRmSystemI; +def DSB : CRmSystemI; +def ISB : CRmSystemI; +def : InstAlias<"clrex", (CLREX 0xf)>; +def : InstAlias<"isb", (ISB 0xf)>; +def MRS : MRSI; +def MSR : MSRI; +def MSRpstate: MSRpstateI; +// The thread pointer (on Linux, at least, where this has been implemented) is +// TPIDR_EL0. +def : Pat<(AArch64threadpointer), (MRS 0xde82)>; -// These pseudo-instructions have special semantics by virtue of being passed to -// the InstrInfo constructor. CALLSEQ_START/CALLSEQ_END are produced by -// LowerCall to (in our case) tell the back-end about stack adjustments for -// arguments passed on the stack. Here we select those markers to -// pseudo-instructions which explicitly set the stack, and finally in the -// RegisterInfo we convert them to a true stack adjustment. -let Defs = [XSP], Uses = [XSP] in { - def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i64imm:$amt), - [(AArch64callseq_start timm:$amt)]>; +// Generic system instructions +def SYSxt : SystemXtI<0, "sys">; +def SYSLxt : SystemLXtI<1, "sysl">; - def ADJCALLSTACKUP : PseudoInst<(outs), (ins i64imm:$amt1, i64imm:$amt2), - [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; -} +def : InstAlias<"sys $op1, $Cn, $Cm, $op2", + (SYSxt imm0_7:$op1, sys_cr_op:$Cn, + sys_cr_op:$Cm, imm0_7:$op2, XZR)>; //===----------------------------------------------------------------------===// -// Atomic operation pseudo-instructions +// Move immediate instructions. //===----------------------------------------------------------------------===// -// These get selected from C++ code as a pretty much direct translation from the -// generic DAG nodes. The one exception is the AtomicOrdering is added as an -// operand so that the eventual lowering can make use of it and choose -// acquire/release operations when required. 
- -let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in { -multiclass AtomicSizes { - def _I8 : PseudoInst<(outs GPR32:$dst), - (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I16 : PseudoInst<(outs GPR32:$dst), - (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I32 : PseudoInst<(outs GPR32:$dst), - (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I64 : PseudoInst<(outs GPR64:$dst), - (ins GPR64xsp:$ptr, GPR64:$incr, i32imm:$ordering), []>; -} -} - -defm ATOMIC_LOAD_ADD : AtomicSizes; -defm ATOMIC_LOAD_SUB : AtomicSizes; -defm ATOMIC_LOAD_AND : AtomicSizes; -defm ATOMIC_LOAD_OR : AtomicSizes; -defm ATOMIC_LOAD_XOR : AtomicSizes; -defm ATOMIC_LOAD_NAND : AtomicSizes; -defm ATOMIC_SWAP : AtomicSizes; -let Defs = [NZCV] in { - // These operations need a CMP to calculate the correct value - defm ATOMIC_LOAD_MIN : AtomicSizes; - defm ATOMIC_LOAD_MAX : AtomicSizes; - defm ATOMIC_LOAD_UMIN : AtomicSizes; - defm ATOMIC_LOAD_UMAX : AtomicSizes; -} - -class AtomicCmpSwap - : PseudoInst<(outs GPRData:$dst), - (ins GPR64xsp:$ptr, GPRData:$old, GPRData:$new, - i32imm:$ordering), []> { - let usesCustomInserter = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; - let Defs = [NZCV]; -} +defm MOVK : InsertImmediate<0b11, "movk">; +defm MOVN : MoveImmediate<0b00, "movn">; -def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap; +let PostEncoderMethod = "fixMOVZ" in +defm MOVZ : MoveImmediate<0b10, "movz">; -//===----------------------------------------------------------------------===// -// Add-subtract (extended register) instructions -//===----------------------------------------------------------------------===// -// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP - -// The RHS of these operations is conceptually a sign/zero-extended -// register, optionally shifted left by 1-4. The extension can be a -// NOP (e.g. "sxtx" sign-extending a 64-bit register to 64-bits) but -// must be specified with one exception: - -// If one of the registers is sp/wsp then LSL is an alias for UXTW in -// 32-bit instructions and UXTX in 64-bit versions, the shift amount -// is not optional in that case (but can explicitly be 0), and the -// entire suffix can be skipped (e.g. "add sp, x3, x2"). - -multiclass extend_operands { - def _asmoperand : AsmOperandClass { - let Name = PREFIX; - let RenderMethod = "addRegExtendOperands"; - let PredicateMethod = "isRegExtend"; - let DiagnosticType = "AddSubRegExtend" # Diag; - } - - def _operand : Operand, - ImmLeaf= 0 && Imm <= 4; }]> { - let PrintMethod = "printRegExtendOperand"; - let DecoderMethod = "DecodeRegExtendOperand"; - let ParserMatchClass = !cast(PREFIX # "_asmoperand"); - } -} +// First group of aliases covers an implicit "lsl #0". 
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; -defm UXTB : extend_operands<"UXTB", "Small">; -defm UXTH : extend_operands<"UXTH", "Small">; -defm UXTW : extend_operands<"UXTW", "Small">; -defm UXTX : extend_operands<"UXTX", "Large">; -defm SXTB : extend_operands<"SXTB", "Small">; -defm SXTH : extend_operands<"SXTH", "Small">; -defm SXTW : extend_operands<"SXTW", "Small">; -defm SXTX : extend_operands<"SXTX", "Large">; - -def LSL_extasmoperand : AsmOperandClass { - let Name = "RegExtendLSL"; - let RenderMethod = "addRegExtendOperands"; - let DiagnosticType = "AddSubRegExtendLarge"; -} +// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; -def LSL_extoperand : Operand { - let ParserMatchClass = LSL_extasmoperand; -} +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; -// The patterns for various sign-extensions are a little ugly and -// non-uniform because everything has already been promoted to the -// legal i64 and i32 types. We'll wrap the various variants up in a -// class for use later. -class extend_types { - dag uxtb; dag uxth; dag uxtw; dag uxtx; - dag sxtb; dag sxth; dag sxtw; dag sxtx; - ValueType ty; - RegisterClass GPR; -} +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; -def extends_to_i64 : extend_types { - let uxtb = (and (anyext i32:$Rm), 255); - let uxth = (and (anyext i32:$Rm), 65535); - let uxtw = (zext i32:$Rm); - let uxtx = (i64 $Rm); +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - let sxtb = (sext_inreg (anyext i32:$Rm), i8); - let sxth = (sext_inreg (anyext i32:$Rm), i16); - let sxtw = (sext i32:$Rm); - let sxtx = (i64 $Rm); +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; - let ty = i64; - let GPR = GPR64xsp; -} +// Final group of aliases covers true "mov $Rd, $imm" cases. 
+multiclass movw_mov_alias { + def _asmoperand : AsmOperandClass { + let Name = basename # width # "_lsl" # shift # "MovAlias"; + let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " + # shift # ">"; + let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; + } + def _movimm : Operand { + let ParserMatchClass = !cast(NAME # "_asmoperand"); + } -def extends_to_i32 : extend_types { - let uxtb = (and i32:$Rm, 255); - let uxth = (and i32:$Rm, 65535); - let uxtw = (i32 i32:$Rm); - let uxtx = (i32 i32:$Rm); + def : InstAlias<"mov $Rd, $imm", + (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; +} + +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; + +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; + +let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, + isAsCheapAsAMove = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, we can select +// directly to the real instructions and get rid of these pseudos. + +def MOVi32imm + : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), + [(set GPR32:$dst, imm:$src)]>, + Sched<[WriteImm]>; +def MOVi64imm + : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), + [(set GPR64:$dst, imm:$src)]>, + Sched<[WriteImm]>; +} // isReMaterializable, isCodeGenOnly + +// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the +// eventual expansion code fewer bits to worry about getting right. Marshalling +// the types is a little tricky though: +def i64imm_32bit : ImmLeaf(Imm); +}]>; - let sxtb = (sext_inreg i32:$Rm, i8); - let sxth = (sext_inreg i32:$Rm, i16); - let sxtw = (i32 i32:$Rm); - let sxtx = (i32 i32:$Rm); +def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); +}]>; - let ty = i32; - let GPR = GPR32wsp; -} +def : Pat<(i64 i64imm_32bit:$src), + (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; + +// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK +// sequences. 
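The WrapperLarge patterns that follow chain one MOVZ and three MOVKs, one 16-bit group per instruction (g3 is bits [63:48], down to g0 as bits [15:0]). A short model of the equivalent arithmetic, assuming plain integer groups rather than relocated symbol operands:

#include <cstdint>

// movz x, #g3, lsl #48 ; movk x, #g2, lsl #32 ; movk x, #g1, lsl #16 ;
// movk x, #g0  -- MOVZ clears the register, each MOVK only replaces its chunk.
uint64_t materializeLarge(uint16_t G3, uint16_t G2, uint16_t G1, uint16_t G0) {
  uint64_t X = uint64_t(G3) << 48;
  X = (X & ~(UINT64_C(0xffff) << 32)) | (uint64_t(G2) << 32);
  X = (X & ~(UINT64_C(0xffff) << 16)) | (uint64_t(G1) << 16);
  X = (X & ~UINT64_C(0xffff))         |  uint64_t(G0);
  return X;
}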
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, + tglobaladdr:$g1, tglobaladdr:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), + tglobaladdr:$g2, 32), + tglobaladdr:$g1, 16), + tglobaladdr:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, + tblockaddress:$g1, tblockaddress:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), + tblockaddress:$g2, 32), + tblockaddress:$g1, 16), + tblockaddress:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, + tconstpool:$g1, tconstpool:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), + tconstpool:$g2, 32), + tconstpool:$g1, 16), + tconstpool:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, + tjumptable:$g1, tjumptable:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), + tjumptable:$g2, 32), + tjumptable:$g1, 16), + tjumptable:$g0, 0)>; -// Now, six of the extensions supported are easy and uniform: if the source size -// is 32-bits or less, then Rm is always a 32-bit register. We'll instantiate -// those instructions in one block. - -// The uxtx/sxtx could potentially be merged in, but three facts dissuaded me: -// + It would break the naming scheme: either ADDxx_uxtx or ADDww_uxtx would -// be impossible. -// + Patterns are very different as well. -// + Passing different registers would be ugly (more fields in extend_types -// would probably be the best option). -multiclass addsub_exts { - def w_uxtb : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_uxth : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_uxtw : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def w_sxtb : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_sxth : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_sxtw : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; -} -// These two could be merge in with the above, but their patterns aren't really -// necessary and the naming-scheme would necessarily break: -multiclass addsub_xxtx { - def x_uxtx : A64I_addsubext<0b1, op, S, 0b00, 0b011, - outs, - (ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [(opfrag i64:$Rn, (shl i64:$Rm, UXTX_operand:$Imm3))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111, - outs, - (ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [/* No Pattern: same as uxtx */], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -} +//===----------------------------------------------------------------------===// +// Arithmetic instructions. +//===----------------------------------------------------------------------===// -multiclass addsub_wxtx { - def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011, - outs, (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [/* No pattern: probably same as uxtw */], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111, - outs, (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [/* No Pattern: probably same as uxtw */], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -} +// Add/subtract with carry. 
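A quick reference model for the carry-using nodes consumed by the AddSubCarry definitions below, assuming the usual AArch64 semantics: ADC adds the carry flag in, and SBC computes Rn + ~Rm + C, which is why "ngc Rd, Rm" is simply "sbc Rd, zr, Rm". Sketch only, with wraparound left to unsigned arithmetic:

#include <cstdint>

// adc: Rd = Rn + Rm + C          (C is the NZCV carry flag, 0 or 1)
uint64_t adcModel(uint64_t Rn, uint64_t Rm, bool Carry) {
  return Rn + Rm + (Carry ? 1 : 0);
}

// sbc: Rd = Rn - Rm - (1 - C) == Rn + ~Rm + C; with Rn = 0 this is ngc.
uint64_t sbcModel(uint64_t Rn, uint64_t Rm, bool Carry) {
  return Rn + ~Rm + (Carry ? 1 : 0);
}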
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>; +defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>; + +def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; + +// Add/subtract +defm ADD : AddSub<0, "add", add>; +defm SUB : AddSub<1, "sub">; + +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; + +defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">; +defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">; + +// Use SUBS instead of SUB to enable CSE between SUBS and SUB. +def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), + (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), + (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, GPR32:$Rm), + (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(sub GPR64:$Rn, GPR64:$Rm), + (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), + (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; +def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), + (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. +let AddedComplexity = 1 in { +def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. 
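Concretely, the add/sub immediate field is a 12-bit unsigned value optionally shifted left by 12, so a negative addend has to be folded by negating it and flipping add and sub, as the comment above says; the flag-setting patterns below apply the same rewrite. A hedged sketch of that check (names invented for illustration):

#include <cstdint>

// Can Imm be encoded directly as an add/sub immediate (uimm12, lsl #0 or #12)?
bool isAddSubImm(uint64_t Imm) {
  return (Imm & ~UINT64_C(0xfff)) == 0 || (Imm & ~UINT64_C(0xfff000)) == 0;
}

// (add x, Imm) with negative Imm becomes (sub x, -Imm) when -Imm is encodable.
bool shouldFoldToSub(int64_t Imm, uint64_t &Encoded) {
  uint64_t Neg = -uint64_t(Imm);      // negate without signed overflow
  if (Imm < 0 && isAddSubImm(Neg)) {
    Encoded = Neg;
    return true;
  }
  return false;
}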
+let AddedComplexity = 1 in { +def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"neg $dst, $src$shift", + (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"neg $dst, $src$shift", + (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + +def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + + +// Unsigned/Signed divide +defm UDIV : Div<0, "udiv", udiv>; +defm SDIV : Div<1, "sdiv", sdiv>; +let isCodeGenOnly = 1 in { +defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; +defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; +} + +// Variable shift +defm ASRV : Shift<0b10, "asr", sra>; +defm LSLV : Shift<0b00, "lsl", shl>; +defm LSRV : Shift<0b01, "lsr", srl>; +defm RORV : Shift<0b11, "ror", rotr>; + +def : ShiftAlias<"asrv", ASRVWr, GPR32>; +def : ShiftAlias<"asrv", ASRVXr, GPR64>; +def : ShiftAlias<"lslv", LSLVWr, GPR32>; +def : ShiftAlias<"lslv", LSLVXr, GPR64>; +def : ShiftAlias<"lsrv", LSRVWr, GPR32>; +def : ShiftAlias<"lsrv", LSRVXr, GPR64>; +def : ShiftAlias<"rorv", RORVWr, GPR32>; +def : ShiftAlias<"rorv", RORVXr, GPR64>; + +// Multiply-add +let AddedComplexity = 7 in { +defm MADD : MulAccum<0, "madd", add>; +defm MSUB : MulAccum<1, "msub", sub>; + +def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), + (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), + (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; + +def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +} // AddedComplexity = 7 + +let AddedComplexity = 5 in { +def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; +def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; +def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; +def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), + (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), + (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), + (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), + (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +} // AddedComplexity = 5 + +def : MulAccumWAlias<"mul", MADDWrrr>; +def : MulAccumXAlias<"mul", MADDXrrr>; +def : MulAccumWAlias<"mneg", MSUBWrrr>; +def : MulAccumXAlias<"mneg", MSUBXrrr>; +def : WideMulAccumAlias<"smull", SMADDLrrr>; +def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; +def : 
WideMulAccumAlias<"umull", UMADDLrrr>; +def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; + +// Multiply-high +def SMULHrr : MulHi<0b010, "smulh", mulhs>; +def UMULHrr : MulHi<0b110, "umulh", mulhu>; + +// CRC32 +def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">; +def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">; +def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">; +def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">; + +def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">; +def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">; +def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; +def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; -class SetRD - : PatFrag<(ops node:$lhs, node:$rhs), (set RC:$Rd, (op node:$lhs, node:$rhs))>; -class SetNZCV - : PatFrag<(ops node:$lhs, node:$rhs), (set NZCV, (op node:$lhs, node:$rhs))>; - -defm ADDxx :addsub_exts<0b1, 0b0, 0b0, "add\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd), extends_to_i64>, - addsub_xxtx< 0b0, 0b0, "add\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd)>; -defm ADDww :addsub_exts<0b0, 0b0, 0b0, "add\t$Rd, ", SetRD, - (outs GPR32wsp:$Rd), extends_to_i32>, - addsub_wxtx< 0b0, 0b0, "add\t$Rd, ", - (outs GPR32wsp:$Rd)>; -defm SUBxx :addsub_exts<0b1, 0b1, 0b0, "sub\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd), extends_to_i64>, - addsub_xxtx< 0b1, 0b0, "sub\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd)>; -defm SUBww :addsub_exts<0b0, 0b1, 0b0, "sub\t$Rd, ", SetRD, - (outs GPR32wsp:$Rd), extends_to_i32>, - addsub_wxtx< 0b1, 0b0, "sub\t$Rd, ", - (outs GPR32wsp:$Rd)>; - -let Defs = [NZCV] in { -defm ADDSxx :addsub_exts<0b1, 0b0, 0b1, "adds\t$Rd, ", SetRD, - (outs GPR64:$Rd), extends_to_i64>, - addsub_xxtx< 0b0, 0b1, "adds\t$Rd, ", SetRD, - (outs GPR64:$Rd)>; -defm ADDSww :addsub_exts<0b0, 0b0, 0b1, "adds\t$Rd, ", SetRD, - (outs GPR32:$Rd), extends_to_i32>, - addsub_wxtx< 0b0, 0b1, "adds\t$Rd, ", - (outs GPR32:$Rd)>; -defm SUBSxx :addsub_exts<0b1, 0b1, 0b1, "subs\t$Rd, ", SetRD, - (outs GPR64:$Rd), extends_to_i64>, - addsub_xxtx< 0b1, 0b1, "subs\t$Rd, ", SetRD, - (outs GPR64:$Rd)>; -defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD, - (outs GPR32:$Rd), extends_to_i32>, - addsub_wxtx< 0b1, 0b1, "subs\t$Rd, ", - (outs GPR32:$Rd)>; - - -let SchedRW = [WriteCMP, ReadCMP, ReadCMP], Rd = 0b11111, isCompare = 1 in { -defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV, - (outs), extends_to_i64>, - addsub_xxtx< 0b0, 0b1, "cmn\t", SetNZCV, (outs)>; -defm CMNw : addsub_exts<0b0, 0b0, 0b1, "cmn\t", SetNZCV, - (outs), extends_to_i32>, - addsub_wxtx< 0b0, 0b1, "cmn\t", (outs)>; -defm CMPx : addsub_exts<0b1, 0b1, 0b1, "cmp\t", SetNZCV, - (outs), extends_to_i64>, - addsub_xxtx< 0b1, 0b1, "cmp\t", SetNZCV, (outs)>; -defm CMPw : addsub_exts<0b0, 0b1, 0b1, "cmp\t", SetNZCV, - (outs), extends_to_i32>, - addsub_wxtx< 0b1, 0b1, "cmp\t", (outs)>; -} -} -// Now patterns for the operation without a shift being needed. No patterns are -// created for uxtx/sxtx since they're non-uniform and it's expected that -// add/sub (shifted register) will handle those cases anyway. 
-multiclass addsubext_noshift_patterns { - def : Pat<(nodeop exts.ty:$Rn, exts.uxtb), - (!cast(prefix # "w_uxtb") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.uxth), - (!cast(prefix # "w_uxth") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.uxtw), - (!cast(prefix # "w_uxtw") $Rn, $Rm, 0)>; - - def : Pat<(nodeop exts.ty:$Rn, exts.sxtb), - (!cast(prefix # "w_sxtb") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.sxth), - (!cast(prefix # "w_sxth") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.sxtw), - (!cast(prefix # "w_sxtw") $Rn, $Rm, 0)>; -} +//===----------------------------------------------------------------------===// +// Logical instructions. +//===----------------------------------------------------------------------===// -defm : addsubext_noshift_patterns<"ADDxx", add, extends_to_i64>; -defm : addsubext_noshift_patterns<"ADDww", add, extends_to_i32>; -defm : addsubext_noshift_patterns<"SUBxx", sub, extends_to_i64>; -defm : addsubext_noshift_patterns<"SUBww", sub, extends_to_i32>; - -defm : addsubext_noshift_patterns<"CMNx", A64cmn, extends_to_i64>; -defm : addsubext_noshift_patterns<"CMNw", A64cmn, extends_to_i32>; -defm : addsubext_noshift_patterns<"CMPx", A64cmp, extends_to_i64>; -defm : addsubext_noshift_patterns<"CMPw", A64cmp, extends_to_i32>; - -// An extend of "lsl #imm" is valid if and only if one of Rn and Rd is -// sp/wsp. It is synonymous with uxtx/uxtw depending on the size of the -// operation. Also permitted in this case is complete omission of the argument, -// which implies "lsl #0". -multiclass lsl_aliases { - def : InstAlias; - - def : InstAlias; +// (immediate) +defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>; +defm AND : LogicalImm<0b00, "and", and>; +defm EOR : LogicalImm<0b10, "eor", xor>; +defm ORR : LogicalImm<0b01, "orr", or>; + +// FIXME: these aliases *are* canonical sometimes (when movz can't be +// used). Actually, it seems to be working right now, but putting logical_immXX +// here is a bit dodgy on the AsmParser side too. 
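For context on why the ORR-based "mov" aliases just below are restricted: the immediate has to be a bitmask ("logical") immediate, i.e. a contiguous run of ones, rotated, and replicated across the register in elements of 2, 4, 8, 16, 32 or 64 bits, with all-zeros and all-ones excluded. A rough standalone validity check written from that definition, not taken from the AsmParser:

#include <cstdint>

// Returns true if V is encodable as a logical (bitmask) immediate of width
// RegSize (32 or 64). Illustrative only.
bool isLogicalImmediate(uint64_t V, unsigned RegSize) {
  uint64_t RegMask = RegSize == 64 ? ~UINT64_C(0) : (UINT64_C(1) << RegSize) - 1;
  V &= RegMask;
  if (V == 0 || V == RegMask)
    return false;                     // all-zeros / all-ones never encode
  for (unsigned Size = 2; Size <= RegSize; Size *= 2) {
    uint64_t Mask = Size == 64 ? ~UINT64_C(0) : (UINT64_C(1) << Size) - 1;
    uint64_t Elt = V & Mask;
    // The element must replicate across the whole register...
    bool Replicates = true;
    for (unsigned I = 0; I < RegSize; I += Size)
      if (((V >> I) & Mask) != Elt)
        Replicates = false;
    if (!Replicates)
      continue;
    // ...and be a contiguous run of ones under some rotation.
    for (unsigned R = 0; R < Size; ++R) {
      uint64_t Rot = R ? (((Elt >> R) | (Elt << (Size - R))) & Mask) : Elt;
      if (Rot != 0 && (Rot & (Rot + 1)) == 0)
        return true;                  // Rot has the form 0...01...1
    }
  }
  return false;
}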
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, + logical_imm32:$imm), 0>; +def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, + logical_imm64:$imm), 0>; + + +// (register) +defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>; +defm BICS : LogicalRegS<0b11, 1, "bics", + BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>; +defm AND : LogicalReg<0b00, 0, "and", and>; +defm BIC : LogicalReg<0b00, 1, "bic", + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; +defm EON : LogicalReg<0b10, 1, "eon", + BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; +defm EOR : LogicalReg<0b10, 0, "eor", xor>; +defm ORN : LogicalReg<0b01, 1, "orn", + BinOpFrag<(or node:$LHS, (not node:$RHS))>>; +defm ORR : LogicalReg<0b01, 0, "orr", or>; + +def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; +def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; + +def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; +def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; + +def : InstAlias<"mvn $Wd, $Wm$sh", + (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; +def : InstAlias<"mvn $Xd, $Xm$sh", + (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; +def : InstAlias<"tst $src1, $src2", + (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; +def : InstAlias<"tst $src1, $src2", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; + +def : InstAlias<"tst $src1, $src2$sh", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; +def : InstAlias<"tst $src1, $src2$sh", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; + + +def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; +def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; -} -defm : lsl_aliases<"add", ADDxxx_uxtx, Rxsp, GPR64xsp, GPR64>; -defm : lsl_aliases<"add", ADDxxx_uxtx, GPR64xsp, Rxsp, GPR64>; -defm : lsl_aliases<"add", ADDwww_uxtw, Rwsp, GPR32wsp, GPR32>; -defm : lsl_aliases<"add", ADDwww_uxtw, GPR32wsp, Rwsp, GPR32>; -defm : lsl_aliases<"sub", SUBxxx_uxtx, Rxsp, GPR64xsp, GPR64>; -defm : lsl_aliases<"sub", SUBxxx_uxtx, GPR64xsp, Rxsp, GPR64>; -defm : lsl_aliases<"sub", SUBwww_uxtw, Rwsp, GPR32wsp, GPR32>; -defm : lsl_aliases<"sub", SUBwww_uxtw, GPR32wsp, Rwsp, GPR32>; - -// Rd cannot be sp for flag-setting variants so only half of the aliases are -// needed. -defm : lsl_aliases<"adds", ADDSxxx_uxtx, GPR64, Rxsp, GPR64>; -defm : lsl_aliases<"adds", ADDSwww_uxtw, GPR32, Rwsp, GPR32>; -defm : lsl_aliases<"subs", SUBSxxx_uxtx, GPR64, Rxsp, GPR64>; -defm : lsl_aliases<"subs", SUBSwww_uxtw, GPR32, Rwsp, GPR32>; - -// CMP unfortunately has to be different because the instruction doesn't have a -// dest register. -multiclass cmp_lsl_aliases { - def : InstAlias; - - def : InstAlias; -} +//===----------------------------------------------------------------------===// +// One operand data processing instructions. 
+//===----------------------------------------------------------------------===// -defm : cmp_lsl_aliases<"cmp", CMPxx_uxtx, Rxsp, GPR64>; -defm : cmp_lsl_aliases<"cmp", CMPww_uxtw, Rwsp, GPR32>; -defm : cmp_lsl_aliases<"cmn", CMNxx_uxtx, Rxsp, GPR64>; -defm : cmp_lsl_aliases<"cmn", CMNww_uxtw, Rwsp, GPR32>; +defm CLS : OneOperandData<0b101, "cls">; +defm CLZ : OneOperandData<0b100, "clz", ctlz>; +defm RBIT : OneOperandData<0b000, "rbit">; + +def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; +def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; + +def REV16Wr : OneWRegData<0b001, "rev16", + UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; +def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; + +def : Pat<(cttz GPR32:$Rn), + (CLZWr (RBITWr GPR32:$Rn))>; +def : Pat<(cttz GPR64:$Rn), + (CLZXr (RBITXr GPR64:$Rn))>; +def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), + (i32 1))), + (CLSWr GPR32:$Rn)>; +def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), + (i64 1))), + (CLSXr GPR64:$Rn)>; + +// Unlike the other one operand instructions, the instructions with the "rev" +// mnemonic do *not* just different in the size bit, but actually use different +// opcode bits for the different sizes. +def REVWr : OneWRegData<0b010, "rev", bswap>; +def REVXr : OneXRegData<0b011, "rev", bswap>; +def REV32Xr : OneXRegData<0b010, "rev32", + UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; + +// The bswap commutes with the rotr so we want a pattern for both possible +// orders. +def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; +def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; //===----------------------------------------------------------------------===// -// Add-subtract (immediate) instructions +// Bitfield immediate extraction instruction. //===----------------------------------------------------------------------===// -// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, MOV - -// These instructions accept a 12-bit unsigned immediate, optionally shifted -// left by 12 bits. Official assembly format specifies a 12 bit immediate with -// one of "", "LSL #0", "LSL #12" supplementary operands. - -// There are surprisingly few ways to make this work with TableGen, so this -// implementation has separate instructions for the "LSL #0" and "LSL #12" -// variants. - -// If the MCInst retained a single combined immediate (which could be 0x123000, -// for example) then both components (imm & shift) would have to be delegated to -// a single assembly operand. This would entail a separate operand parser -// (because the LSL would have to live in the same AArch64Operand as the -// immediate to be accessible); assembly parsing is rather complex and -// error-prone C++ code. -// -// By splitting the immediate, we can delegate handling this optional operand to -// an InstAlias. Supporting functions to generate the correct MCInst are still -// required, but these are essentially trivial and parsing can remain generic. -// -// Rejected plans with rationale: -// ------------------------------ -// -// In an ideal world you'de have two first class immediate operands (in -// InOperandList, specifying imm12 and shift). Unfortunately this is not -// selectable by any means I could discover. -// -// An Instruction with two MCOperands hidden behind a single entry in -// InOperandList (expanded by ComplexPatterns and MIOperandInfo) was functional, -// but required more C++ code to handle encoding/decoding. 
Parsing (the intended -// main beneficiary) ended up equally complex because of the optional nature of -// "LSL #0". -// -// Attempting to circumvent the need for a custom OperandParser above by giving -// InstAliases without the "lsl #0" failed. add/sub could be accommodated but -// the cmp/cmn aliases didn't use the MIOperandInfo to determine how operands -// should be parsed: there was no way to accommodate an "lsl #12". - -let ParserMethod = "ParseImmWithLSLOperand", - RenderMethod = "addImmWithLSLOperands" in { - // Derived PredicateMethod fields are different for each - def addsubimm_lsl0_asmoperand : AsmOperandClass { - let Name = "AddSubImmLSL0"; - // If an error is reported against this operand, instruction could also be a - // register variant. - let DiagnosticType = "AddSubSecondSource"; - } +let neverHasSideEffects = 1 in +defm EXTR : ExtractImm<"extr">; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; + +def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), + (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; +def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), + (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; - def addsubimm_lsl12_asmoperand : AsmOperandClass { - let Name = "AddSubImmLSL12"; - let DiagnosticType = "AddSubSecondSource"; - } +//===----------------------------------------------------------------------===// +// Other bitfield immediate instructions. +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in { +defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; +defm SBFM : BitfieldImm<0b00, "sbfm">; +defm UBFM : BitfieldImm<0b10, "ubfm">; } -def shr_12_XFORM : SDNodeXFormgetTargetConstant(N->getSExtValue() >> 12, MVT::i32); +def i32shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x1f; + return CurDAG->getTargetConstant(enc, MVT::i64); }]>; -def shr_12_neg_XFORM : SDNodeXFormgetTargetConstant((-N->getSExtValue()) >> 12, MVT::i32); +def i32shift_b : Operand, SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); }]>; -def neg_XFORM : SDNodeXFormgetTargetConstant(-N->getSExtValue(), MVT::i32); +// min(7, 31 - shift_amt) +def i32shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); }]>; +// min(15, 31 - shift_amt) +def i32shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 15 ? 
15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -multiclass addsub_imm_operands { - let PrintMethod = "printAddSubImmLSL0Operand", - EncoderMethod = "getAddSubImmOpValue", - ParserMatchClass = addsubimm_lsl0_asmoperand in { - def _posimm_lsl0 : Operand, - ImmLeaf= 0 && (Imm & ~0xfff) == 0; }]>; - def _negimm_lsl0 : Operand, - ImmLeaf; - } - - let PrintMethod = "printAddSubImmLSL12Operand", - EncoderMethod = "getAddSubImmOpValue", - ParserMatchClass = addsubimm_lsl12_asmoperand in { - def _posimm_lsl12 : Operand, - ImmLeaf= 0 && (Imm & ~0xfff000) == 0; }], - shr_12_XFORM>; - - def _negimm_lsl12 : Operand, - ImmLeaf; - } -} +def i64shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x3f; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -// The add operands don't need any transformation -defm addsubimm_operand_i32 : addsub_imm_operands; -defm addsubimm_operand_i64 : addsub_imm_operands; - -multiclass addsubimm_varieties shift, - string asmop, string cmpasmop, - Operand imm_operand, Operand cmp_imm_operand, - RegisterClass GPR, RegisterClass GPRsp, - AArch64Reg ZR, ValueType Ty> { - // All registers for non-S variants allow SP - def _s : A64I_addsubimm, - Sched<[WriteALU, ReadALU]>; - - - // S variants can read SP but would write to ZR - def _S : A64I_addsubimm, - Sched<[WriteALU, ReadALU]> { - let Defs = [NZCV]; - } +def i64shift_b : Operand, SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; - // Note that the pattern here for ADDS is subtle. Canonically CMP - // a, b becomes SUBS a, b. If b < 0 then this is equivalent to - // ADDS a, (-b). This is not true in general. - def _cmp : A64I_addsubimm, - Sched<[WriteCMP, ReadCMP]> { - let Rd = 0b11111; - let Defs = [NZCV]; - let isCompare = 1; - } -} +// min(7, 63 - shift_amt) +def i64shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; +// min(15, 63 - shift_amt) +def i64shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 15 ? 15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -multiclass addsubimm_shifts { - defm _lsl0 : addsubimm_varieties(operand # "_lsl0"), - !cast(cmpoperand # "_lsl0"), - GPR, GPRsp, ZR, Ty>; - - defm _lsl12 : addsubimm_varieties(operand # "_lsl12"), - !cast(cmpoperand # "_lsl12"), - GPR, GPRsp, ZR, Ty>; -} +// min(31, 63 - shift_amt) +def i64shift_sext_i32 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 31 ? 
31 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -defm ADDwwi : addsubimm_shifts<"ADDwi", 0b0, 0b0, "add", "cmn", - "addsubimm_operand_i32_posimm", - "addsubimm_operand_i32_negimm", - GPR32, GPR32wsp, WZR, i32>; -defm ADDxxi : addsubimm_shifts<"ADDxi", 0b1, 0b0, "add", "cmn", - "addsubimm_operand_i64_posimm", - "addsubimm_operand_i64_negimm", - GPR64, GPR64xsp, XZR, i64>; -defm SUBwwi : addsubimm_shifts<"SUBwi", 0b0, 0b1, "sub", "cmp", - "addsubimm_operand_i32_negimm", - "addsubimm_operand_i32_posimm", - GPR32, GPR32wsp, WZR, i32>; -defm SUBxxi : addsubimm_shifts<"SUBxi", 0b1, 0b1, "sub", "cmp", - "addsubimm_operand_i64_negimm", - "addsubimm_operand_i64_posimm", - GPR64, GPR64xsp, XZR, i64>; - -multiclass MOVsp { - def _fromsp : InstAlias<"mov $Rd, $Rn", - (addop GPRsp:$Rd, SP:$Rn, 0), - 0b1>; - - def _tosp : InstAlias<"mov $Rd, $Rn", - (addop SP:$Rd, GPRsp:$Rn, 0), - 0b1>; -} +def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_b imm0_31:$imm)))>; +def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_b imm0_63:$imm)))>; -// Recall Rxsp is a RegisterClass containing *just* xsp. -defm MOVxx : MOVsp; -defm MOVww : MOVsp; +let AddedComplexity = 10 in { +def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; +} + +def : InstAlias<"asr $dst, $src, $shift", + (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"asr $dst, $src, $shift", + (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; + +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// -// Add-subtract (shifted register) instructions +// Conditionally set flags instructions. //===----------------------------------------------------------------------===// -// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS - -//===------------------------------- -// 1. The "shifted register" operands. Shared with logical insts. 
-//===------------------------------- - -multiclass shift_operands { - def _asmoperand_i32 : AsmOperandClass { - let Name = "Shift" # form # "i32"; - let RenderMethod = "addShiftOperands"; - let PredicateMethod = "isShift"; - let DiagnosticType = "AddSubRegShift32"; - } - - // Note that the operand type is intentionally i64 because the DAGCombiner - // puts these into a canonical form. - def _i32 : Operand, ImmLeaf= 0 && Imm <= 31; }]> { - let ParserMatchClass - = !cast(prefix # "_asmoperand_i32"); - let PrintMethod = "printShiftOperand"; - let DecoderMethod = "Decode32BitShiftOperand"; - } - - def _asmoperand_i64 : AsmOperandClass { - let Name = "Shift" # form # "i64"; - let RenderMethod = "addShiftOperands"; - let PredicateMethod = "isShift"; - let DiagnosticType = "AddSubRegShift64"; - } +defm CCMN : CondSetFlagsImm<0, "ccmn">; +defm CCMP : CondSetFlagsImm<1, "ccmp">; - def _i64 : Operand, ImmLeaf= 0 && Imm <= 63; }]> { - let ParserMatchClass - = !cast(prefix # "_asmoperand_i64"); - let PrintMethod = "printShiftOperand"; - } -} +defm CCMN : CondSetFlagsReg<0, "ccmn">; +defm CCMP : CondSetFlagsReg<1, "ccmp">; -defm lsl_operand : shift_operands<"lsl_operand", "LSL">; -defm lsr_operand : shift_operands<"lsr_operand", "LSR">; -defm asr_operand : shift_operands<"asr_operand", "ASR">; - -// Not used for add/sub, but defined here for completeness. The "logical -// (shifted register)" instructions *do* have an ROR variant. -defm ror_operand : shift_operands<"ror_operand", "ROR">; - -//===------------------------------- -// 2. The basic 3.5-operand ADD/SUB/ADDS/SUBS instructions. -//===------------------------------- - -// N.b. the commutable parameter is just !N. It will be first against the wall -// when the revolution comes. -multiclass addsub_shifts defs> { - let isCommutable = commutable, Defs = defs in { - def _lsl : A64I_addsubshift("lsl_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set GPR:$Rd, (opfrag ty:$Rn, (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; - - def _lsr : A64I_addsubshift("lsr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; - - def _asr : A64I_addsubshift("asr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; - } +//===----------------------------------------------------------------------===// +// Conditional select instructions. 
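+//
+// Note on the aliases defined in this section: the "cset"/"cinc"/"cinv"/"cneg"
+// forms expand to CSINC/CSINV/CSNEG with the source register repeated (or the
+// zero register) and the condition inverted. For example, "cset w0, eq" is
+// accepted and encoded as "csinc w0, wzr, wzr, ne".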
+//===----------------------------------------------------------------------===// +defm CSEL : CondSelect<0, 0b00, "csel">; + +def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; +defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; +defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; +defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; + +def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; + +def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), + (CSINCWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), + (CSINCXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr XZR, XZR, (i32 imm:$cc))>; + +// The inverse of the condition code from the alias instruction is what is used +// in the aliased instruction. The parser all ready inverts the condition code +// for these aliases. +def : InstAlias<"cset $dst, $cc", + (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"cset $dst, $cc", + (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"csetm $dst, $cc", + (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"csetm $dst, $cc", + (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - def _noshift - : InstAlias(prefix # "_lsl") GPR:$Rd, GPR:$Rn, - GPR:$Rm, 0)>; +//===----------------------------------------------------------------------===// +// PC-relative instructions. 
+//===----------------------------------------------------------------------===// +let isReMaterializable = 1 in { +let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { +def ADR : ADRI<0, "adr", adrlabel, []>; +} // neverHasSideEffects = 1 - def : Pat<(opfrag ty:$Rn, ty:$Rm), - (!cast(prefix # "_lsl") $Rn, $Rm, 0)>; -} +def ADRP : ADRI<1, "adrp", adrplabel, + [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>; +} // isReMaterializable = 1 -multiclass addsub_sizes defs> { - defm xxx : addsub_shifts; - defm www : addsub_shifts; -} +// page address of a constant pool entry, block address +def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; +def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; +//===----------------------------------------------------------------------===// +// Unconditional branch (register) instructions. +//===----------------------------------------------------------------------===// -defm ADD : addsub_sizes<"ADD", 0b0, 0b0, 0b1, "add", add, []>; -defm SUB : addsub_sizes<"SUB", 0b1, 0b0, 0b0, "sub", sub, []>; +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { +def RET : BranchReg<0b0010, "ret", []>; +def DRPS : SpecialReturn<0b0101, "drps">; +def ERET : SpecialReturn<0b0100, "eret">; +} // isReturn = 1, isTerminator = 1, isBarrier = 1 -defm ADDS : addsub_sizes<"ADDS", 0b0, 0b1, 0b1, "adds", addc, [NZCV]>; -defm SUBS : addsub_sizes<"SUBS", 0b1, 0b1, 0b0, "subs", subc, [NZCV]>; +// Default to the LR register. +def : InstAlias<"ret", (RET LR)>; -//===------------------------------- -// 1. The NEG/NEGS aliases -//===------------------------------- +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; +} // isCall -multiclass neg_alias { - def : InstAlias<"neg $Rd, $Rm, $Imm6", - (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>; +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { +def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; +} // isBranch, isTerminator, isBarrier, isIndirectBranch - def : Pat<(sub 0, (shiftop ty:$Rm, shift_operand:$Imm6)), - (INST ZR, $Rm, shift_operand:$Imm6)>; +// Create a separate pseudo-instruction for codegen to use so that we don't +// flag lr as used in every function. It'll be restored before the RET by the +// epilogue if it's legitimately used. +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; } -defm : neg_alias; -defm : neg_alias; -defm : neg_alias; -def : InstAlias<"neg $Rd, $Rm", (SUBwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>; -def : Pat<(sub 0, i32:$Rm), (SUBwww_lsl WZR, $Rm, 0)>; - -defm : neg_alias; -defm : neg_alias; -defm : neg_alias; -def : InstAlias<"neg $Rd, $Rm", (SUBxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>; -def : Pat<(sub 0, i64:$Rm), (SUBxxx_lsl XZR, $Rm, 0)>; - -// NEGS doesn't get any patterns yet: defining multiple outputs means C++ has to -// be involved. -class negs_alias - : InstAlias<"negs $Rd, $Rm, $Imm6", - (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>; - -def : negs_alias; -def : negs_alias; -def : negs_alias; -def : InstAlias<"negs $Rd, $Rm", (SUBSwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>; - -def : negs_alias; -def : negs_alias; -def : negs_alias; -def : InstAlias<"negs $Rd, $Rm", (SUBSxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>; - -//===------------------------------- -// 1. 
The CMP/CMN aliases -//===------------------------------- - -multiclass cmp_shifts { - let isCommutable = commutable, Rd = 0b11111, Defs = [NZCV] in { - def _lsl : A64I_addsubshift("lsl_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rn, $Rm, $Imm6"), - [(set NZCV, (opfrag ty:$Rn, (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - - def _lsr : A64I_addsubshift("lsr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rn, $Rm, $Imm6"), - [(set NZCV, (opfrag ty:$Rn, (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - - def _asr : A64I_addsubshift("asr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rn, $Rm, $Imm6"), - [(set NZCV, (opfrag ty:$Rn, (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - } - - def _noshift - : InstAlias(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; - - def : Pat<(opfrag ty:$Rn, ty:$Rm), - (!cast(prefix # "_lsl") $Rn, $Rm, 0)>; +// This is a directive-like pseudo-instruction. The purpose is to insert an +// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction +// (which in the usual case is a BLR). +let hasSideEffects = 1 in +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { + let AsmString = ".tlsdesccall $sym"; } -defm CMPww : cmp_shifts<"CMPww", 0b0, 0b1, 0b0, "cmp", A64cmp, i32, GPR32>; -defm CMPxx : cmp_shifts<"CMPxx", 0b1, 0b1, 0b0, "cmp", A64cmp, i64, GPR64>; - -defm CMNww : cmp_shifts<"CMNww", 0b0, 0b0, 0b1, "cmn", A64cmn, i32, GPR32>; -defm CMNxx : cmp_shifts<"CMNxx", 0b1, 0b0, 0b1, "cmn", A64cmn, i64, GPR64>; +// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It +// gets expanded to two MCInsts during lowering. +let isCall = 1, Defs = [LR] in +def TLSDESC_BLR + : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), + [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; +def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym), + (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; //===----------------------------------------------------------------------===// -// Add-subtract (with carry) instructions +// Conditional branch (immediate) instruction. 
//===----------------------------------------------------------------------===// -// Contains: ADC, ADCS, SBC, SBCS + aliases NGC, NGCS - -multiclass A64I_addsubcarrySizes { - let Uses = [NZCV] in { - def www : A64I_addsubcarry<0b0, op, s, 0b000000, - (outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm), - !strconcat(asmop, "\t$Rd, $Rn, $Rm"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def xxx : A64I_addsubcarry<0b1, op, s, 0b000000, - (outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), - !strconcat(asmop, "\t$Rd, $Rn, $Rm"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } -} - -let isCommutable = 1 in { - defm ADC : A64I_addsubcarrySizes<0b0, 0b0, "adc">; -} - -defm SBC : A64I_addsubcarrySizes<0b1, 0b0, "sbc">; - -let Defs = [NZCV] in { - let isCommutable = 1 in { - defm ADCS : A64I_addsubcarrySizes<0b0, 0b1, "adcs">; - } - - defm SBCS : A64I_addsubcarrySizes<0b1, 0b1, "sbcs">; -} +def Bcc : BranchCond; -def : InstAlias<"ngc $Rd, $Rm", (SBCwww GPR32:$Rd, WZR, GPR32:$Rm)>; -def : InstAlias<"ngc $Rd, $Rm", (SBCxxx GPR64:$Rd, XZR, GPR64:$Rm)>; -def : InstAlias<"ngcs $Rd, $Rm", (SBCSwww GPR32:$Rd, WZR, GPR32:$Rm)>; -def : InstAlias<"ngcs $Rd, $Rm", (SBCSxxx GPR64:$Rd, XZR, GPR64:$Rm)>; +//===----------------------------------------------------------------------===// +// Compare-and-branch instructions. +//===----------------------------------------------------------------------===// +defm CBZ : CmpBranch<0, "cbz", AArch64cbz>; +defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>; -// Note that adde and sube can form a chain longer than two (e.g. for 256-bit -// addition). So the flag-setting instructions are appropriate. -def : Pat<(adde i32:$Rn, i32:$Rm), (ADCSwww $Rn, $Rm)>; -def : Pat<(adde i64:$Rn, i64:$Rm), (ADCSxxx $Rn, $Rm)>; -def : Pat<(sube i32:$Rn, i32:$Rm), (SBCSwww $Rn, $Rm)>; -def : Pat<(sube i64:$Rn, i64:$Rm), (SBCSxxx $Rn, $Rm)>; +//===----------------------------------------------------------------------===// +// Test-bit-and-branch instructions. +//===----------------------------------------------------------------------===// +defm TBZ : TestBranch<0, "tbz", AArch64tbz>; +defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>; //===----------------------------------------------------------------------===// -// Bitfield +// Unconditional branch (immediate) instructions. //===----------------------------------------------------------------------===// -// Contains: SBFM, BFM, UBFM, [SU]XT[BHW], ASR, LSR, LSL, SBFI[ZX], BFI, BFXIL, -// UBFIZ, UBFX +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { +def B : BranchImm<0, "b", [(br bb:$addr)]>; +} // isBranch, isTerminator, isBarrier -// Because of the rather complicated nearly-overlapping aliases, the decoding of -// this range of instructions is handled manually. The architectural -// instructions are BFM, SBFM and UBFM but a disassembler should never produce -// these. -// -// In the end, the best option was to use BFM instructions for decoding under -// almost all circumstances, but to create aliasing *Instructions* for each of -// the canonical forms and specify a completely custom decoder which would -// substitute the correct MCInst as needed. -// -// This also simplifies instruction selection, parsing etc because the MCInsts -// have a shape that's closer to their use in code. +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>; +} // isCall +def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; -//===------------------------------- -// 1. 
The architectural BFM instructions -//===------------------------------- +//===----------------------------------------------------------------------===// +// Exception generation instructions. +//===----------------------------------------------------------------------===// +def BRK : ExceptionGeneration<0b001, 0b00, "brk">; +def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; +def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; +def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; +def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; +def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; +def SMC : ExceptionGeneration<0b000, 0b11, "smc">; +def SVC : ExceptionGeneration<0b000, 0b01, "svc">; + +// DCPSn defaults to an immediate operand of zero if unspecified. +def : InstAlias<"dcps1", (DCPS1 0)>; +def : InstAlias<"dcps2", (DCPS2 0)>; +def : InstAlias<"dcps3", (DCPS3 0)>; -def uimm5_asmoperand : AsmOperandClass { - let Name = "UImm5"; - let PredicateMethod = "isUImm<5>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm5"; -} +//===----------------------------------------------------------------------===// +// Load instructions. +//===----------------------------------------------------------------------===// -def uimm6_asmoperand : AsmOperandClass { - let Name = "UImm6"; - let PredicateMethod = "isUImm<6>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm6"; +// Pair (indexed, offset) +defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; +defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; +defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; +defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; +defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; + +defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (pre-indexed) +def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (post-indexed) +def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + + +// Pair (no allocate) +defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; +defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; +defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; +defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; +defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; + +//--- +// (register offset) +//--- + +// Integer +defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; +defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; +defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; +defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; + +// Floating-point +defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; +defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; +defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; +defm LDRD : 
Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; +defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>; + +// Load sign-extended half-word +defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; +defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; + +// Load sign-extended byte +defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; +defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; + +// Load sign-extended word +defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>; + +// Pre-fetch. +defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. +multiclass ScalToVecROLoadPat { + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), + sub)>; + + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), + sub)>; } -def bitfield32_imm : Operand, - ImmLeaf= 0 && Imm < 32; }]> { - let ParserMatchClass = uimm5_asmoperand; +let AddedComplexity = 10 in { +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; - let DecoderMethod = "DecodeBitfield32ImmOperand"; -} +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; -def bitfield64_imm : Operand, - ImmLeaf= 0 && Imm < 64; }]> { - let ParserMatchClass = uimm6_asmoperand; +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; - // Default decoder works in 64-bit case: the 6-bit field can take any value. -} +defm : ScalToVecROLoadPat; -multiclass A64I_bitfieldSizes opc, string asmop> { - def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd), - (ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU]> { - let DecoderMethod = "DecodeBitfieldInstruction"; - } +defm : ScalToVecROLoadPat; - def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd), - (ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU]> { - let DecoderMethod = "DecodeBitfieldInstruction"; - } -} -defm SBFM : A64I_bitfieldSizes<0b00, "sbfm">; -defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">; - -// BFM instructions modify the destination register rather than defining it -// completely. 
-def BFMwwii : - A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), - (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS), - "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - let DecoderMethod = "DecodeBitfieldInstruction"; - let Constraints = "$src = $Rd"; -} +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def BFMxxii : - A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd), - (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS), - "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - let DecoderMethod = "DecodeBitfieldInstruction"; - let Constraints = "$src = $Rd"; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; } +// Match all load 64 bits width whose type is compatible with FPR64 +multiclass VecROLoadPat { -//===------------------------------- -// 2. Extend aliases to 64-bit dest -//===------------------------------- - -// Unfortunately the extensions that end up as 64-bits cannot be handled by an -// instruction alias: their syntax is (for example) "SXTB x0, w0", which needs -// to be mapped to "SBFM x0, x0, #0, 7" (changing the class of Rn). InstAlias is -// not capable of such a map as far as I'm aware - -// Note that these instructions are strictly more specific than the -// BFM ones (in ImmR) so they can handle their own decoding. -class A64I_bf_ext opc, RegisterClass GPRDest, ValueType dty, - string asmop, bits<6> imms, dag pattern> - : A64I_bitfield, - Sched<[WriteALU, ReadALU]> { - let ImmR = 0b000000; - let ImmS = imms; -} + def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; -// Signed extensions -def SXTBxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtb", 7, - (sext_inreg (anyext i32:$Rn), i8)>; -def SXTBww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxtb", 7, - (sext_inreg i32:$Rn, i8)>; -def SXTHxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxth", 15, - (sext_inreg (anyext i32:$Rn), i16)>; -def SXTHww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxth", 15, - (sext_inreg i32:$Rn, i16)>; -def SXTWxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtw", 31, (sext i32:$Rn)>; - -// Unsigned extensions -def UXTBww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxtb", 7, - (and i32:$Rn, 255)>; -def UXTHww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxth", 15, - (and i32:$Rn, 65535)>; - -// The 64-bit unsigned variants are not strictly architectural but recommended -// for consistency. -let isAsmParserOnly = 1 in { - def UXTBxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxtb", 7, - (and (anyext i32:$Rn), 255)>; - def UXTHxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxth", 15, - (and (anyext i32:$Rn), 65535)>; + def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } -// Extra patterns for when the source register is actually 64-bits -// too. There's no architectural difference here, it's just LLVM -// shinanigans. There's no need for equivalent zero-extension patterns -// because they'll already be caught by logical (immediate) matching. 
-def : Pat<(sext_inreg i64:$Rn, i8), - (SXTBxw (EXTRACT_SUBREG $Rn, sub_32))>; -def : Pat<(sext_inreg i64:$Rn, i16), - (SXTHxw (EXTRACT_SUBREG $Rn, sub_32))>; -def : Pat<(sext_inreg i64:$Rn, i32), - (SXTWxw (EXTRACT_SUBREG $Rn, sub_32))>; - - -//===------------------------------- -// 3. Aliases for ASR and LSR (the simple shifts) -//===------------------------------- - -// These also handle their own decoding because ImmS being set makes -// them take precedence over BFM. -multiclass A64I_shift opc, string asmop, SDNode opnode> { - def wwi : A64I_bitfield<0b0, opc, 0b0, - (outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR"), - [(set i32:$Rd, (opnode i32:$Rn, bitfield32_imm:$ImmR))], - NoItinerary>, - Sched<[WriteALU, ReadALU]> { - let ImmS = 31; - } - - def xxi : A64I_bitfield<0b1, opc, 0b1, - (outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR"), - [(set i64:$Rd, (opnode i64:$Rn, bitfield64_imm:$ImmR))], - NoItinerary>, - Sched<[WriteALU, ReadALU]> { - let ImmS = 63; - } - +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; +} + +defm : VecROLoadPat; +defm : VecROLoadPat; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; +} +} // AddedComplexity = 10 + +// zextload -> i64 +multiclass ExtLoadTo64ROPat { + def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub_32)>; + + def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub_32)>; } -defm ASR : A64I_shift<0b00, "asr", sra>; -defm LSR : A64I_shift<0b10, "lsr", srl>; - -//===------------------------------- -// 4. Aliases for LSL -//===------------------------------- +let AddedComplexity = 10 in { + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; -// Unfortunately LSL and subsequent aliases are much more complicated. We need -// to be able to say certain output instruction fields depend in a complex -// manner on combinations of input assembly fields). -// -// MIOperandInfo *might* have been able to do it, but at the cost of -// significantly more C++ code. - -// N.b. contrary to usual practice these operands store the shift rather than -// the machine bits in an MCInst. The complexity overhead of consistency -// outweighed the benefits in this case (custom asmparser, printer and selection -// vs custom encoder). 
-def bitfield32_lsl_imm : Operand, - ImmLeaf= 0 && Imm <= 31; }]> { - let ParserMatchClass = uimm5_asmoperand; - let EncoderMethod = "getBitfield32LSLOpValue"; -} + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat; -def bitfield64_lsl_imm : Operand, - ImmLeaf= 0 && Imm <= 63; }]> { - let ParserMatchClass = uimm6_asmoperand; - let EncoderMethod = "getBitfield64LSLOpValue"; -} + // extload -> zextload + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; -class A64I_bitfield_lsl - : A64I_bitfield, - Sched<[WriteALU, ReadALU]> { - bits<12> FullImm; - let ImmR = FullImm{5-0}; - let ImmS = FullImm{11-6}; - - // No disassembler allowed because it would overlap with BFM which does the - // actual work. - let isAsmParserOnly = 1; + // extloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat; } -def LSLwwi : A64I_bitfield_lsl<0b0, GPR32, i32, bitfield32_lsl_imm>; -def LSLxxi : A64I_bitfield_lsl<0b1, GPR64, i64, bitfield64_lsl_imm>; - -//===------------------------------- -// 5. Aliases for bitfield extract instructions -//===------------------------------- -def bfx32_width_asmoperand : AsmOperandClass { - let Name = "BFX32Width"; - let PredicateMethod = "isBitfieldWidth<32>"; - let RenderMethod = "addBFXWidthOperands"; - let DiagnosticType = "Width32"; -} +// zextload -> i64 +multiclass ExtLoadTo32ROPat { + def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; -def bfx32_width : Operand, ImmLeaf { - let PrintMethod = "printBFXWidthOperand"; - let ParserMatchClass = bfx32_width_asmoperand; -} + def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -def bfx64_width_asmoperand : AsmOperandClass { - let Name = "BFX64Width"; - let PredicateMethod = "isBitfieldWidth<64>"; - let RenderMethod = "addBFXWidthOperands"; - let DiagnosticType = "Width64"; } -def bfx64_width : Operand { - let PrintMethod = "printBFXWidthOperand"; - let ParserMatchClass = bfx64_width_asmoperand; +let AddedComplexity = 10 in { + // extload -> zextload + defm : ExtLoadTo32ROPat; + defm : ExtLoadTo32ROPat; + defm : ExtLoadTo32ROPat; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo32ROPat; +} + +//--- +// (unsigned immediate) +//--- +defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", + [(set GPR64:$Rt, + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", + [(set GPR32:$Rt, + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", + [(set FPR8:$Rt, + (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; +defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", + [(set (f16 FPR16:$Rt), + (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; +defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", + [(set (f32 FPR32:$Rt), + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", + [(set (f64 FPR64:$Rt), + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", + [(set (f128 FPR128:$Rt), + (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. 
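+//
+// The patterns below fold a scalar load directly into lane 0 of a vector
+// register: the load selects to the FP/SIMD form of LDR and the result is
+// placed into the destination with INSERT_SUBREG, so no separate GPR-to-FPR
+// transfer (e.g. an FMOV) is needed after an integer load.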
+def : Pat <(v8i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v16i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v4i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v8i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v2i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v4i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat <(v2i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. + def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. 
+ def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + +defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", + [(set GPR32:$Rt, + (zextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", + [(set GPR32:$Rt, + (zextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +// zextload -> i64 +def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; + +// zextloadi1 -> zextloadi8 +def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// extload -> zextload +def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// load sign-extended half-word +defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", + [(set GPR32:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", + [(set GPR64:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; + +// load sign-extended byte +defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", + [(set GPR32:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", + [(set GPR64:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; + +// load sign-extended word +defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", + [(set GPR64:$Rt, + (sextloadi32 (am_indexed32 GPR64sp:$Rn, 
+ uimm12s4:$offset)))]>; + +// load zero-extended word +def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; + +// Pre-fetch. +def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", + [(AArch64Prefetch imm:$Rt, + (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset))]>; + +def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; + +//--- +// (literal) +def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; +def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; +def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; +def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; +def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; + +// load sign-extended word +def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; + +// prefetch +def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; +// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>; + +//--- +// (unscaled immediate) +defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", + [(set GPR64:$Rt, + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", + [(set GPR32:$Rt, + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", + [(set FPR8:$Rt, + (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", + [(set FPR16:$Rt, + (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", + [(set (f32 FPR32:$Rt), + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", + [(set (f64 FPR64:$Rt), + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", + [(set (f128 FPR128:$Rt), + (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; + +defm LDURHH + : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", + [(set GPR32:$Rt, + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURBB + : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", + [(set GPR32:$Rt, + (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), 
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; +} + +// anyext -> zext +def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +// unscaled zext +def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; + + +//--- +// LDR mnemonics fall back to LDUR for negative or unaligned offsets. + +// Define new assembler match classes as we want to only match these when +// the don't otherwise match the scaled addressing mode for LDR/STR. Don't +// associate a DiagnosticType either, as we want the diagnostic for the +// canonical form (the scaled operand) to take precedence. 
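+//
+// For example, "ldr x0, [x1, #1]" cannot be encoded as LDR (unsigned offset),
+// whose immediate must be a non-negative multiple of the access size, so it
+// matches these fall-back operand classes instead and is emitted as
+// "ldur x0, [x1, #1]".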
+class SImm9OffsetOperand : AsmOperandClass { + let Name = "SImm9OffsetFB" # Width; + let PredicateMethod = "isSImm9OffsetFB<" # Width # ">"; + let RenderMethod = "addImmOperands"; } +def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>; +def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>; +def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>; +def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>; +def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>; + +def simm9_offset_fb8 : Operand { + let ParserMatchClass = SImm9OffsetFB8Operand; +} +def simm9_offset_fb16 : Operand { + let ParserMatchClass = SImm9OffsetFB16Operand; +} +def simm9_offset_fb32 : Operand { + let ParserMatchClass = SImm9OffsetFB32Operand; +} +def simm9_offset_fb64 : Operand { + let ParserMatchClass = SImm9OffsetFB64Operand; +} +def simm9_offset_fb128 : Operand { + let ParserMatchClass = SImm9OffsetFB128Operand; +} + +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; + +// zextload -> i64 +def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; + +// load sign-extended half-word +defm LDURSHW + : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh", + [(set GPR32:$Rt, + (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURSHX + : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh", + [(set GPR64:$Rt, + (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; + +// load sign-extended byte +defm LDURSBW + : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb", + [(set GPR32:$Rt, + (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURSBX + : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb", + [(set GPR64:$Rt, + (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; + +// load sign-extended word +defm LDURSW + : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw", + [(set GPR64:$Rt, + (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; + +// zero and sign extending aliases from generic LDR* mnemonics to LDUR*. 
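+// For example, "ldrsh w0, [x1, #-2]" is accepted and encoded as
+// "ldursh w0, [x1, #-2]", matching the behaviour of the plain "ldr" aliases
+// above.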
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]", + (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrh $Rt, [$Rn, $offset]", + (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", + (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; + +// Pre-fetch. +defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", + [(AArch64Prefetch imm:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; + +//--- +// (unscaled immediate, unprivileged) +defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; +defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; + +defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; +defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; + +// load sign-extended half-word +defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; +defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; + +// load sign-extended byte +defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; +defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; + +// load sign-extended word +defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; + +//--- +// (immediate pre-indexed) +def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load sign-extended word +def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//--- +// (immediate post-indexed) +def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load 
sign-extended word +def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; -multiclass A64I_bitfield_extract opc, string asmop, SDNode op> { - def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd), - (ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [(set i32:$Rd, (op i32:$Rn, imm:$ImmR, imm:$ImmS))], - NoItinerary>, - Sched<[WriteALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - } - - def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd), - (ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [(set i64:$Rd, (op i64:$Rn, imm:$ImmR, imm:$ImmS))], - NoItinerary>, - Sched<[WriteALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - } -} +//===----------------------------------------------------------------------===// +// Store instructions. +//===----------------------------------------------------------------------===// -defm SBFX : A64I_bitfield_extract<0b00, "sbfx", A64Sbfx>; -defm UBFX : A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>; - -// Again, variants based on BFM modify Rd so need it as an input too. -def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), - (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS), - "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - let Constraints = "$src = $Rd"; +// Pair (indexed, offset) +// FIXME: Use dedicated range-checked addressing mode operand here. +defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; +defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; +defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; +defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; +defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (no allocate) +defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; +defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; +defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; +defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; +defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; + +//--- +// (Register offset) + +// Integer +defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; +defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; +defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; +defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; + + +// Floating-point +defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; +defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; +defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; 
+defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; +defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; + +multiclass TruncStoreFrom64ROPat { + + def : Pat<(storeop GPR64:$Rt, + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop GPR64:$Rt, + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } -def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd), - (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS), - "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - let Constraints = "$src = $Rd"; +let AddedComplexity = 10 in { + // truncstore i64 + defm : TruncStoreFrom64ROPat; + defm : TruncStoreFrom64ROPat; + defm : TruncStoreFrom64ROPat; } -// SBFX instructions can do a 1-instruction sign-extension of boolean values. -def : Pat<(sext_inreg i64:$Rn, i1), (SBFXxxii $Rn, 0, 0)>; -def : Pat<(sext_inreg i32:$Rn, i1), (SBFXwwii $Rn, 0, 0)>; -def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)), - (SBFXxxii (SUBREG_TO_REG (i64 0), $Rn, sub_32), 0, 0)>; - -// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could -// use either 64-bit or 32-bit variant, but 32-bit might be more efficient. -def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31), - sub_32)>; - -//===------------------------------- -// 6. Aliases for bitfield insert instructions -//===------------------------------- - -def bfi32_lsb_asmoperand : AsmOperandClass { - let Name = "BFI32LSB"; - let PredicateMethod = "isUImm<5>"; - let RenderMethod = "addBFILSBOperands<32>"; - let DiagnosticType = "UImm5"; -} +multiclass VecROStorePat { + def : Pat<(store (VecTy FPR:$Rt), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; -def bfi32_lsb : Operand, - ImmLeaf= 0 && Imm <= 31; }]> { - let PrintMethod = "printBFILSBOperand<32>"; - let ParserMatchClass = bfi32_lsb_asmoperand; + def : Pat<(store (VecTy FPR:$Rt), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } -def bfi64_lsb_asmoperand : AsmOperandClass { - let Name = "BFI64LSB"; - let PredicateMethod = "isUImm<6>"; - let RenderMethod = "addBFILSBOperands<64>"; - let DiagnosticType = "UImm6"; -} +let AddedComplexity = 10 in { +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; +} + +defm : VecROStorePat; +defm : VecROStorePat; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; +} +} // AddedComplexity = 10 + +//--- +// (unsigned immediate) +defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", + [(store GPR64:$Rt, + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", + [(store GPR32:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", + [(store FPR8:$Rt, + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", + [(store (f16 FPR16:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; +defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", + [(store (f32 FPR32:$Rt), + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", + [(store (f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; + +defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", + [(truncstorei16 GPR32:$Rt, + (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset))]>; +defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", + [(truncstorei8 GPR32:$Rt, + (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v2f32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ def : Pat<(store (v4f32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(store (f128 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + +// truncstore i64 +def : Pat<(truncstorei32 GPR64:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), + (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), + (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; + +} // AddedComplexity = 10 + +//--- +// (unscaled immediate) +defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", + [(store GPR64:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", + [(store GPR32:$Rt, + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", + [(store FPR8:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; +defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", + [(store (f16 FPR16:$Rt), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", + [(store (f32 FPR32:$Rt), + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", + [(store (f64 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", + [(store (f128 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; +defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", + [(truncstorei16 GPR32:$Rt, + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", + [(truncstorei8 GPR32:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+  (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+  (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+  (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
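// Illustrative note (not part of the patch): the InstAlias entries below let
// the assembler accept the plain "str"/"strb"/"strh" spellings for offsets the
// scaled, unsigned-immediate forms cannot encode, emitting the unscaled STUR*
// encodings instead. A rough sketch of the intended behaviour, assuming
// standard AArch64 assembly syntax:
//   str x0, [x1, #-8]   // negative offset          -> STURXi encoding
//   str w0, [x1, #3]    // offset not 4-byte scaled -> STURWi encoding
//   str x0, [x1, #8]    // scaled and in range      -> STRXui (no fallback)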
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+def : InstAlias<"strb $Rt, [$Rn, $offset]",
+                (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"strh $Rt, [$Rn, $offset]",
+                (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32,  "str",  pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64,  "str",  pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8,   "str",  pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16,  "str",  pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32,  "str",  pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64,  "str",  pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str",  pre_store, f128>;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8,  i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+           simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+
+def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr,
simm9:$off)>; +def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//--- +// (immediate post-indexed) +def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; +def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; +def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; +def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; +def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; +def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; +def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; + +def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; +def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; + +// truncstore i64 +def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def bfi64_lsb : Operand, - ImmLeaf= 0 && Imm <= 63; }]> { - let PrintMethod = "printBFILSBOperand<64>"; - let ParserMatchClass = bfi64_lsb_asmoperand; -} +//===----------------------------------------------------------------------===// +// Load/store exclusive instructions. +//===----------------------------------------------------------------------===// -// Width verification is performed during conversion so width operand can be -// shared between 32/64-bit cases. 
Still needed for the print method though -// because ImmR encodes "width - 1". -def bfi32_width_asmoperand : AsmOperandClass { - let Name = "BFI32Width"; - let PredicateMethod = "isBitfieldWidth<32>"; - let RenderMethod = "addBFIWidthOperands"; - let DiagnosticType = "Width32"; -} +def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; +def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; +def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; +def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; -def bfi32_width : Operand, - ImmLeaf= 1 && Imm <= 32; }]> { - let PrintMethod = "printBFIWidthOperand"; - let ParserMatchClass = bfi32_width_asmoperand; -} +def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; +def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; +def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; +def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; -def bfi64_width_asmoperand : AsmOperandClass { - let Name = "BFI64Width"; - let PredicateMethod = "isBitfieldWidth<64>"; - let RenderMethod = "addBFIWidthOperands"; - let DiagnosticType = "Width64"; -} +def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; +def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; +def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; +def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; -def bfi64_width : Operand, - ImmLeaf= 1 && Imm <= 64; }]> { - let PrintMethod = "printBFIWidthOperand"; - let ParserMatchClass = bfi64_width_asmoperand; -} +def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; +def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; +def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; +def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; -multiclass A64I_bitfield_insert opc, string asmop> { - def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd), - (ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - } +def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; +def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; +def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; +def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; - def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd), - (ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU]> { - // As above, no disassembler allowed. 
- let isAsmParserOnly = 1; - } -} +def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; +def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; +def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; +def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; -defm SBFIZ : A64I_bitfield_insert<0b00, "sbfiz">; -defm UBFIZ : A64I_bitfield_insert<0b10, "ubfiz">; +def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; +def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; +def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; +def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; -def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), - (ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS), - "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - let Constraints = "$src = $Rd"; -} +def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; +def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; -def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd), - (ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS), - "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - let Constraints = "$src = $Rd"; -} +def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; +def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; //===----------------------------------------------------------------------===// -// Compare and branch (immediate) +// Scaled floating point to integer conversion instructions. //===----------------------------------------------------------------------===// -// Contains: CBZ, CBNZ - -class label_asmoperand : AsmOperandClass { - let Name = "Label" # width # "_" # scale; - let PredicateMethod = "isLabel<" # width # "," # scale # ">"; - let RenderMethod = "addLabelOperands<" # width # ", " # scale # ">"; - let DiagnosticType = "Label"; -} - -def label_wid19_scal4_asmoperand : label_asmoperand<19, 4>; - -// All conditional immediate branches are the same really: 19 signed bits scaled -// by the instruction-size (4). -def bcc_target : Operand { - // This label is a 19-bit offset from PC, scaled by the instruction-width: 4. 
- let ParserMatchClass = label_wid19_scal4_asmoperand; - let PrintMethod = "printLabelOperand<19, 4>"; - let EncoderMethod = "getLabelOpValue"; - let OperandType = "OPERAND_PCREL"; -} -multiclass cmpbr_sizes { - let isBranch = 1, isTerminator = 1 in { - def x : A64I_cmpbr<0b1, op, - (outs), - (ins GPR64:$Rt, bcc_target:$Label), - !strconcat(asmop,"\t$Rt, $Label"), - [(A64br_cc (A64cmp i64:$Rt, 0), SETOP, bb:$Label)], - NoItinerary>, - Sched<[WriteBr, ReadBr]>; - - def w : A64I_cmpbr<0b0, op, - (outs), - (ins GPR32:$Rt, bcc_target:$Label), - !strconcat(asmop,"\t$Rt, $Label"), - [(A64br_cc (A64cmp i32:$Rt, 0), SETOP, bb:$Label)], - NoItinerary>, - Sched<[WriteBr, ReadBr]>; - } +defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>; +defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>; +defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>; +defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>; +defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>; +defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; +defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; +defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } -defm CBZ : cmpbr_sizes<0b0, "cbz", ImmLeaf >; -defm CBNZ : cmpbr_sizes<0b1, "cbnz", ImmLeaf >; - //===----------------------------------------------------------------------===// -// Conditional branch (immediate) instructions +// Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// -// Contains: B.cc - -def cond_code_asmoperand : AsmOperandClass { - let Name = "CondCode"; - let DiagnosticType = "CondCode"; -} -def cond_code : Operand, ImmLeaf= 0 && Imm <= 15; -}]> { - let PrintMethod = "printCondCodeOperand"; - let ParserMatchClass = cond_code_asmoperand; -} - -def Bcc : A64I_condbr<0b0, 0b0, (outs), - (ins cond_code:$Cond, bcc_target:$Label), - "b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)], - NoItinerary>, - Sched<[WriteBr]> { - let Uses = [NZCV]; - let isBranch = 1; - let isTerminator = 1; -} +defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; //===----------------------------------------------------------------------===// -// Conditional compare (immediate) instructions +// Unscaled integer to floating point conversion instruction. 
//===----------------------------------------------------------------------===// -// Contains: CCMN, CCMP - -def uimm4_asmoperand : AsmOperandClass { - let Name = "UImm4"; - let PredicateMethod = "isUImm<4>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm4"; -} -def uimm4 : Operand { - let ParserMatchClass = uimm4_asmoperand; -} +defm FMOV : UnscaledConversion<"fmov">; -def uimm5 : Operand { - let ParserMatchClass = uimm5_asmoperand; -} +def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; +def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; -// The only difference between this operand and the one for instructions like -// B.cc is that it's parsed manually. The other get parsed implicitly as part of -// the mnemonic handling. -def cond_code_op_asmoperand : AsmOperandClass { - let Name = "CondCodeOp"; - let RenderMethod = "addCondCodeOperands"; - let PredicateMethod = "isCondCode"; - let ParserMethod = "ParseCondCodeOperand"; - let DiagnosticType = "CondCode"; -} +//===----------------------------------------------------------------------===// +// Floating point conversion instruction. +//===----------------------------------------------------------------------===// -def cond_code_op : Operand { - let PrintMethod = "printCondCodeOperand"; - let ParserMatchClass = cond_code_op_asmoperand; -} +defm FCVT : FPConversion<"fcvt">; -class A64I_condcmpimmImpl - : A64I_condcmpimm, - Sched<[WriteCMP, ReadCMP]> { - let Defs = [NZCV]; -} +def : Pat<(f32_to_f16 FPR32:$Rn), + (i32 (COPY_TO_REGCLASS + (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)), + GPR32))>; -def CCMNwi : A64I_condcmpimmImpl<0b0, 0b0, GPR32, "ccmn">; -def CCMNxi : A64I_condcmpimmImpl<0b1, 0b0, GPR64, "ccmn">; -def CCMPwi : A64I_condcmpimmImpl<0b0, 0b1, GPR32, "ccmp">; -def CCMPxi : A64I_condcmpimmImpl<0b1, 0b1, GPR64, "ccmp">; +def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), + [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; //===----------------------------------------------------------------------===// -// Conditional compare (register) instructions +// Floating point single operand instructions. //===----------------------------------------------------------------------===// -// Contains: CCMN, CCMP - -class A64I_condcmpregImpl - : A64I_condcmpreg, - Sched<[WriteCMP, ReadCMP, ReadCMP]> { - let Defs = [NZCV]; -} -def CCMNww : A64I_condcmpregImpl<0b0, 0b0, GPR32, "ccmn">; -def CCMNxx : A64I_condcmpregImpl<0b1, 0b0, GPR64, "ccmn">; -def CCMPww : A64I_condcmpregImpl<0b0, 0b1, GPR32, "ccmp">; -def CCMPxx : A64I_condcmpregImpl<0b1, 0b1, GPR64, "ccmp">; +defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPData<0b0000, "fmov">; +defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; -//===----------------------------------------------------------------------===// -// Conditional select instructions -//===----------------------------------------------------------------------===// -// Contains: CSEL, CSINC, CSINV, CSNEG + aliases CSET, CSETM, CINC, CINV, CNEG - -// Condition code which is encoded as the inversion (semantically rather than -// bitwise) in the instruction. 
-def inv_cond_code_op_asmoperand : AsmOperandClass { - let Name = "InvCondCodeOp"; - let RenderMethod = "addInvCondCodeOperands"; - let PredicateMethod = "isCondCode"; - let ParserMethod = "ParseCondCodeOperand"; - let DiagnosticType = "CondCode"; -} +def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), + (FRINTNDr FPR64:$Rn)>; -def inv_cond_code_op : Operand { - let ParserMatchClass = inv_cond_code_op_asmoperand; - let PrintMethod = "printInverseCondCodeOperand"; +// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior +// in the C spec. Setting hasSideEffects ensures it is not DCE'd. +// +// TODO: We should really model the FPSR flags correctly. This is really ugly. +let hasSideEffects = 1 in { +defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; } -// Having a separate operand for the selectable use-case is debatable, but gives -// consistency with cond_code. -def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); - return CurDAG->getTargetConstant(A64InvertCondCode(CC), MVT::i32); -}]>; +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; -def inv_cond_code - : ImmLeaf= 0 && Imm <= 15; }], inv_cond_XFORM>; - - -multiclass A64I_condselSizes op2, string asmop, - SDPatternOperator select> { - let Uses = [NZCV] in { - def wwwc : A64I_condsel<0b0, op, 0b0, op2, - (outs GPR32:$Rd), - (ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"), - [(set i32:$Rd, (select i32:$Rn, i32:$Rm))], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - - - def xxxc : A64I_condsel<0b1, op, 0b0, op2, - (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"), - [(set i64:$Rd, (select i64:$Rn, i64:$Rm))], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - } +let SchedRW = [WriteFDiv] in { +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; } -def simple_select - : PatFrag<(ops node:$lhs, node:$rhs), - (A64select_cc NZCV, node:$lhs, node:$rhs, (i32 imm:$Cond))>; - -class complex_select - : PatFrag<(ops node:$lhs, node:$rhs), - (A64select_cc NZCV, node:$lhs, (opnode node:$rhs), (i32 imm:$Cond))>; - - -defm CSEL : A64I_condselSizes<0b0, 0b00, "csel", simple_select>; -defm CSINC : A64I_condselSizes<0b0, 0b01, "csinc", - complex_select>>; -defm CSINV : A64I_condselSizes<0b1, 0b00, "csinv", complex_select>; -defm CSNEG : A64I_condselSizes<0b1, 0b01, "csneg", complex_select>; - -// Now the instruction aliases, which fit nicely into LLVM's model: - -def : InstAlias<"cset $Rd, $Cond", - (CSINCwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"cset $Rd, $Cond", - (CSINCxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"csetm $Rd, $Cond", - (CSINVwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"csetm $Rd, $Cond", - (CSINVxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinc $Rd, $Rn, $Cond", - (CSINCwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinc $Rd, $Rn, $Cond", - (CSINCxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinv $Rd, $Rn, $Cond", - (CSINVwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinv $Rd, $Rn, $Cond", - (CSINVxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cneg $Rd, $Rn, $Cond", - (CSNEGwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cneg $Rd, $Rn, $Cond", - (CSNEGxxxc GPR64:$Rd, GPR64:$Rn, 
GPR64:$Rn, inv_cond_code_op:$Cond)>; - -// Finally some helper patterns. - -// For CSET (a.k.a. zero-extension of icmp) -def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond), - (CSINCwwwc WZR, WZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond), - (CSINCwwwc WZR, WZR, inv_cond_code:$Cond)>; - -def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond), - (CSINCxxxc XZR, XZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond), - (CSINCxxxc XZR, XZR, inv_cond_code:$Cond)>; - -// For CSETM (a.k.a. sign-extension of icmp) -def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond), - (CSINVwwwc WZR, WZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond), - (CSINVwwwc WZR, WZR, inv_cond_code:$Cond)>; - -def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond), - (CSINVxxxc XZR, XZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond), - (CSINVxxxc XZR, XZR, inv_cond_code:$Cond)>; - -// CINC, CINV and CNEG get dealt with automatically, which leaves the issue of -// commutativity. The instructions are to complex for isCommutable to be used, -// so we have to create the patterns manually: - -// No commutable pattern for CSEL since the commuted version is isomorphic. - -// CSINC -def :Pat<(A64select_cc NZCV, (add i32:$Rm, 1), i32:$Rn, inv_cond_code:$Cond), - (CSINCwwwc $Rn, $Rm, inv_cond_code:$Cond)>; -def :Pat<(A64select_cc NZCV, (add i64:$Rm, 1), i64:$Rn, inv_cond_code:$Cond), - (CSINCxxxc $Rn, $Rm, inv_cond_code:$Cond)>; - -// CSINV -def :Pat<(A64select_cc NZCV, (not i32:$Rm), i32:$Rn, inv_cond_code:$Cond), - (CSINVwwwc $Rn, $Rm, inv_cond_code:$Cond)>; -def :Pat<(A64select_cc NZCV, (not i64:$Rm), i64:$Rn, inv_cond_code:$Cond), - (CSINVxxxc $Rn, $Rm, inv_cond_code:$Cond)>; - -// CSNEG -def :Pat<(A64select_cc NZCV, (ineg i32:$Rm), i32:$Rn, inv_cond_code:$Cond), - (CSNEGwwwc $Rn, $Rm, inv_cond_code:$Cond)>; -def :Pat<(A64select_cc NZCV, (ineg i64:$Rm), i64:$Rn, inv_cond_code:$Cond), - (CSNEGxxxc $Rn, $Rm, inv_cond_code:$Cond)>; - //===----------------------------------------------------------------------===// -// Data Processing (1 source) instructions +// Floating point two operand instructions. //===----------------------------------------------------------------------===// -// Contains: RBIT, REV16, REV, REV32, CLZ, CLS. - -// We define an unary operator which always fails. We will use this to -// define unary operators that cannot be matched. 
- -class A64I_dp_1src_impl opcode, string asmop, - list patterns, RegisterClass GPRrc, - InstrItinClass itin>: - A64I_dp_1src, - Sched<[WriteALU, ReadALU]>; - -multiclass A64I_dp_1src opcode, string asmop> { - let hasSideEffects = 0 in { - def ww : A64I_dp_1src_impl<0b0, opcode, asmop, [], GPR32, NoItinerary>; - def xx : A64I_dp_1src_impl<0b1, opcode, asmop, [], GPR64, NoItinerary>; - } -} -defm RBIT : A64I_dp_1src<0b000000, "rbit">; -defm CLS : A64I_dp_1src<0b000101, "cls">; -defm CLZ : A64I_dp_1src<0b000100, "clz">; - -def : Pat<(ctlz i32:$Rn), (CLZww $Rn)>; -def : Pat<(ctlz i64:$Rn), (CLZxx $Rn)>; -def : Pat<(ctlz_zero_undef i32:$Rn), (CLZww $Rn)>; -def : Pat<(ctlz_zero_undef i64:$Rn), (CLZxx $Rn)>; - -def : Pat<(cttz i32:$Rn), (CLZww (RBITww $Rn))>; -def : Pat<(cttz i64:$Rn), (CLZxx (RBITxx $Rn))>; -def : Pat<(cttz_zero_undef i32:$Rn), (CLZww (RBITww $Rn))>; -def : Pat<(cttz_zero_undef i64:$Rn), (CLZxx (RBITxx $Rn))>; - - -def REVww : A64I_dp_1src_impl<0b0, 0b000010, "rev", - [(set i32:$Rd, (bswap i32:$Rn))], - GPR32, NoItinerary>; -def REVxx : A64I_dp_1src_impl<0b1, 0b000011, "rev", - [(set i64:$Rd, (bswap i64:$Rn))], - GPR64, NoItinerary>; -def REV32xx : A64I_dp_1src_impl<0b1, 0b000010, "rev32", - [(set i64:$Rd, (bswap (rotr i64:$Rn, (i64 32))))], - GPR64, NoItinerary>; -def REV16ww : A64I_dp_1src_impl<0b0, 0b000001, "rev16", - [(set i32:$Rd, (bswap (rotr i32:$Rn, (i64 16))))], - GPR32, - NoItinerary>; -def REV16xx : A64I_dp_1src_impl<0b1, 0b000001, "rev16", [], GPR64, NoItinerary>; +defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +let SchedRW = [WriteFDiv] in { +defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +} +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +let SchedRW = [WriteFMul] in { +defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +} +defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; + +def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// -// Data Processing (2 sources) instructions +// Floating point three operand instructions. 
//===----------------------------------------------------------------------===// -// Contains: CRC32C?[BHWX], UDIV, SDIV, LSLV, LSRV, ASRV, RORV + aliases LSL, -// LSR, ASR, ROR - - -class dp_2src_impl opcode, string asmop, list patterns, - RegisterClass GPRsp, - InstrItinClass itin>: - A64I_dp_2src, - Sched<[WriteALU, ReadALU, ReadALU]>; - -multiclass dp_2src_crc { - def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0}, - !strconcat(asmop, "b"), [], GPR32, NoItinerary>; - def H_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 1}, - !strconcat(asmop, "h"), [], GPR32, NoItinerary>; - def W_www : dp_2src_impl<0b0, {0, 1, 0, c, 1, 0}, - !strconcat(asmop, "w"), [], GPR32, NoItinerary>; - def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0, - !strconcat(asmop, "x\t$Rd, $Rn, $Rm"), - (outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -} -multiclass dp_2src_zext opcode, string asmop, SDPatternOperator op> { - def www : dp_2src_impl<0b0, - opcode, - asmop, - [(set i32:$Rd, - (op i32:$Rn, (i64 (zext i32:$Rm))))], - GPR32, - NoItinerary>; - def xxx : dp_2src_impl<0b1, - opcode, - asmop, - [(set i64:$Rd, (op i64:$Rn, i64:$Rm))], - GPR64, - NoItinerary>; -} +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", + TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; +defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", + TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; +defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", + TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. -multiclass dp_2src opcode, string asmop, SDPatternOperator op> { - def www : dp_2src_impl<0b0, - opcode, - asmop, - [(set i32:$Rd, (op i32:$Rn, i32:$Rm))], - GPR32, - NoItinerary>; - def xxx : dp_2src_impl<0b1, - opcode, - asmop, - [(set i64:$Rd, (op i64:$Rn, i64:$Rm))], - GPR64, - NoItinerary>; -} +// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike +// the NEON variant. +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), + (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -// Here we define the data processing 2 source instructions. -defm CRC32 : dp_2src_crc<0b0, "crc32">; -defm CRC32C : dp_2src_crc<0b1, "crc32c">; +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), + (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -let SchedRW = [WriteDiv, ReadDiv, ReadDiv] in { - defm UDIV : dp_2src<0b000010, "udiv", udiv>; - defm SDIV : dp_2src<0b000011, "sdiv", sdiv>; -} +// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and +// "(-a) + b*(-c)". +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -let SchedRW = [WriteALUs, ReadALU, ReadALU] in { - defm LSLV : dp_2src_zext<0b001000, "lsl", shl>; - defm LSRV : dp_2src_zext<0b001001, "lsr", srl>; - defm ASRV : dp_2src_zext<0b001010, "asr", sra>; - defm RORV : dp_2src_zext<0b001011, "ror", rotr>; -} +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -// Extra patterns for an incoming 64-bit value for a 32-bit -// operation. 
Since the LLVM operations are undefined (as in C) if the -// RHS is out of range, it's perfectly permissible to discard the high -// bits of the GPR64. -def : Pat<(shl i32:$Rn, i64:$Rm), - (LSLVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; -def : Pat<(srl i32:$Rn, i64:$Rm), - (LSRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; -def : Pat<(sra i32:$Rn, i64:$Rm), - (ASRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; -def : Pat<(rotr i32:$Rn, i64:$Rm), - (RORVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; - -// Here we define the aliases for the data processing 2 source instructions. -def LSL_mnemonic : MnemonicAlias<"lslv", "lsl">; -def LSR_mnemonic : MnemonicAlias<"lsrv", "lsr">; -def ASR_menmonic : MnemonicAlias<"asrv", "asr">; -def ROR_menmonic : MnemonicAlias<"rorv", "ror">; +def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; //===----------------------------------------------------------------------===// -// Data Processing (3 sources) instructions +// Floating point comparison instructions. //===----------------------------------------------------------------------===// -// Contains: MADD, MSUB, SMADDL, SMSUBL, SMULH, UMADDL, UMSUBL, UMULH -// + aliases MUL, MNEG, SMULL, SMNEGL, UMULL, UMNEGL - -class A64I_dp3_4operand opcode, RegisterClass AccReg, - ValueType AccTy, RegisterClass SrcReg, - string asmop, dag pattern> - : A64I_dp3, - Sched<[WriteMAC, ReadMAC, ReadMAC, ReadMAC]> { - bits<5> Ra; - let Inst{14-10} = Ra; - - RegisterClass AccGPR = AccReg; - RegisterClass SrcGPR = SrcReg; -} - -def MADDwwww : A64I_dp3_4operand<0b0, 0b000000, GPR32, i32, GPR32, "madd", - (add i32:$Ra, (mul i32:$Rn, i32:$Rm))>; -def MADDxxxx : A64I_dp3_4operand<0b1, 0b000000, GPR64, i64, GPR64, "madd", - (add i64:$Ra, (mul i64:$Rn, i64:$Rm))>; - -def MSUBwwww : A64I_dp3_4operand<0b0, 0b000001, GPR32, i32, GPR32, "msub", - (sub i32:$Ra, (mul i32:$Rn, i32:$Rm))>; -def MSUBxxxx : A64I_dp3_4operand<0b1, 0b000001, GPR64, i64, GPR64, "msub", - (sub i64:$Ra, (mul i64:$Rn, i64:$Rm))>; - -def SMADDLxwwx : A64I_dp3_4operand<0b1, 0b000010, GPR64, i64, GPR32, "smaddl", - (add i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>; -def SMSUBLxwwx : A64I_dp3_4operand<0b1, 0b000011, GPR64, i64, GPR32, "smsubl", - (sub i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>; - -def UMADDLxwwx : A64I_dp3_4operand<0b1, 0b001010, GPR64, i64, GPR32, "umaddl", - (add i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>; -def UMSUBLxwwx : A64I_dp3_4operand<0b1, 0b001011, GPR64, i64, GPR32, "umsubl", - (sub i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>; - -let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in { - def UMULHxxx : A64I_dp3<0b1, 0b001100, (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm), - "umulh\t$Rd, $Rn, $Rm", - [(set i64:$Rd, (mulhu i64:$Rn, i64:$Rm))], - NoItinerary>, - Sched<[WriteMAC, ReadMAC, ReadMAC]>; - - def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm), - "smulh\t$Rd, $Rn, $Rm", - [(set i64:$Rd, (mulhs i64:$Rn, i64:$Rm))], - NoItinerary>, - Sched<[WriteMAC, ReadMAC, ReadMAC]>; -} -multiclass A64I_dp3_3operand { - def : InstAlias; +defm FCMPE : FPComparison<1, "fcmpe">; +defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; - def : Pat; -} +//===----------------------------------------------------------------------===// +// Floating point conditional comparison instructions. 
+//===----------------------------------------------------------------------===// -defm : A64I_dp3_3operand<"mul", MADDwwww, WZR, (mul i32:$Rn, i32:$Rm)>; -defm : A64I_dp3_3operand<"mul", MADDxxxx, XZR, (mul i64:$Rn, i64:$Rm)>; +defm FCCMPE : FPCondComparison<1, "fccmpe">; +defm FCCMP : FPCondComparison<0, "fccmp">; -defm : A64I_dp3_3operand<"mneg", MSUBwwww, WZR, - (sub 0, (mul i32:$Rn, i32:$Rm))>; -defm : A64I_dp3_3operand<"mneg", MSUBxxxx, XZR, - (sub 0, (mul i64:$Rn, i64:$Rm))>; +//===----------------------------------------------------------------------===// +// Floating point conditional select instruction. +//===----------------------------------------------------------------------===// -defm : A64I_dp3_3operand<"smull", SMADDLxwwx, XZR, - (mul (i64 (sext i32:$Rn)), (sext i32:$Rm))>; -defm : A64I_dp3_3operand<"smnegl", SMSUBLxwwx, XZR, - (sub 0, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>; +defm FCSEL : FPCondSelect<"fcsel">; -defm : A64I_dp3_3operand<"umull", UMADDLxwwx, XZR, - (mul (i64 (zext i32:$Rn)), (zext i32:$Rm))>; -defm : A64I_dp3_3operand<"umnegl", UMSUBLxwwx, XZR, - (sub 0, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>; +// CSEL instructions providing f128 types need to be handled by a +// pseudo-instruction since the eventual code will need to introduce basic +// blocks and control flow. +def F128CSEL : Pseudo<(outs FPR128:$Rd), + (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), + [(set (f128 FPR128:$Rd), + (AArch64csel FPR128:$Rn, FPR128:$Rm, + (i32 imm:$cond), NZCV))]> { + let Uses = [NZCV]; + let usesCustomInserter = 1; +} //===----------------------------------------------------------------------===// -// Exception generation +// Floating point immediate move. //===----------------------------------------------------------------------===// -// Contains: SVC, HVC, SMC, BRK, HLT, DCPS1, DCPS2, DCPS3 - -def uimm16_asmoperand : AsmOperandClass { - let Name = "UImm16"; - let PredicateMethod = "isUImm<16>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm16"; -} -def uimm16 : Operand { - let ParserMatchClass = uimm16_asmoperand; +let isReMaterializable = 1 in { +defm FMOV : FPMoveImmediate<"fmov">; } -class A64I_exceptImpl opc, bits<2> ll, string asmop> - : A64I_exception, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; -} +//===----------------------------------------------------------------------===// +// Advanced SIMD two vector instructions. 
+//===----------------------------------------------------------------------===// -def SVCi : A64I_exceptImpl<0b000, 0b01, "svc">; -def HVCi : A64I_exceptImpl<0b000, 0b10, "hvc">; -def SMCi : A64I_exceptImpl<0b000, 0b11, "smc">; -def BRKi : A64I_exceptImpl<0b001, 0b00, "brk">; -def HLTi : A64I_exceptImpl<0b010, 0b00, "hlt">; +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; +defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; +defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; +defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; +defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; + +defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; +defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; +defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), + (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), + (i64 4)))), + (FCVTLv8i16 V128:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), + (i64 2))))), + (FCVTLv4i32 V128:$Rn)>; + +defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; +defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; +defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; +defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; +defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; +def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), + (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, + (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), + (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), + (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; +defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; +defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", + int_aarch64_neon_fcvtxn>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", + int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", + int_aarch64_neon_fcvtzu>; +} +defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FRECPE : SIMDTwoVectorFP<0, 1, 
0b11101, "frecpe", int_aarch64_neon_frecpe>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; +// Aliases for MVN -> NOT. +def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", + (NOTv8i8 V64:$Vd, V64:$Vn)>; +def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", + (NOTv16i8 V128:$Vd, V128:$Vn)>; + +def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; +def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; +def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; +def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; +def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; +def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; +def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; + +def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; +defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; +defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; +defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; +defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; +defm SHLL : SIMDVectorLShiftLongBySizeBHS; +defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; +defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; +defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", + int_aarch64_neon_uaddlp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UQXTN 
: SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; +defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; +defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; +defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; +defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; + +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; + +// Patterns for vector long shift (by element width). These need to match all +// three of zext, sext and anyext so it's easier to pull the patterns out of the +// definition. +multiclass SIMDVectorLShiftLongBySizeBHSPats { + def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), + (SHLLv8i8 V64:$Rn)>; + def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), + (SHLLv16i8 V128:$Rn)>; + def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), + (SHLLv4i16 V64:$Rn)>; + def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), + (SHLLv8i16 V128:$Rn)>; + def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), + (SHLLv2i32 V64:$Rn)>; + def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), + (SHLLv4i32 V128:$Rn)>; +} + +defm : SIMDVectorLShiftLongBySizeBHSPats; +defm : SIMDVectorLShiftLongBySizeBHSPats; +defm : SIMDVectorLShiftLongBySizeBHSPats; -def DCPS1i : A64I_exceptImpl<0b101, 0b01, "dcps1">; -def DCPS2i : A64I_exceptImpl<0b101, 0b10, "dcps2">; -def DCPS3i : A64I_exceptImpl<0b101, 0b11, "dcps3">; +//===----------------------------------------------------------------------===// +// Advanced SIMD three vector instructions. +//===----------------------------------------------------------------------===// -// The immediate is optional for the DCPS instructions, defaulting to 0. 
-def : InstAlias<"dcps1", (DCPS1i 0)>; -def : InstAlias<"dcps2", (DCPS2i 0)>; -def : InstAlias<"dcps3", (DCPS3i 0)>; +defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; +defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; +defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; + +// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the +// instruction expects the addend first, while the fma intrinsic puts it last. +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. 
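// Illustrative note (not part of the patch): a sketch of how these FMA forms
// are expected to map onto FMLA/FMLS, assuming the usual AArch64 semantics
// where the destination vector is also the accumulator:
//   fma( a,  b, acc) -> fmla acc, a, b   (FMLA TriOpFrag above)
//   fma( a, -b, acc) -> fmls acc, a, b   (middle operand negated, FMLS TriOpFrag)
//   fma(-a,  b, acc) -> fmls acc, a, b   (LHS negated, the def : Pat entries below)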
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), + (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; + +defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; +defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; +defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; +defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; +defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; +defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; +defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", 
int_aarch64_neon_uqsub>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; + +defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; +defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", + BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; +defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", + TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; +defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; +defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", + BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; +defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; + +def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; +def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; + +def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; +def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; + +def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # + "|cmls.8b\t$dst, $src1, $src2}", + (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # + "|cmls.16b\t$dst, $src1, $src2}", + (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # + "|cmls.4h\t$dst, $src1, $src2}", + (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # + "|cmls.8h\t$dst, $src1, $src2}", + (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # + "|cmls.2s\t$dst, $src1, $src2}", + (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # + "|cmls.4s\t$dst, $src1, $src2}", + (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # + "|cmls.2d\t$dst, $src1, $src2}", + (CMHSv2i64 
V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlo.8b\t$dst, $src1, $src2}", + (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlo.16b\t$dst, $src1, $src2}", + (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlo.4h\t$dst, $src1, $src2}", + (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlo.8h\t$dst, $src1, $src2}", + (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlo.2s\t$dst, $src1, $src2}", + (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlo.4s\t$dst, $src1, $src2}", + (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlo.2d\t$dst, $src1, $src2}", + (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # + "|cmle.8b\t$dst, $src1, $src2}", + (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # + "|cmle.16b\t$dst, $src1, $src2}", + (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # + "|cmle.4h\t$dst, $src1, $src2}", + (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # + "|cmle.8h\t$dst, $src1, $src2}", + (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # + "|cmle.2s\t$dst, $src1, $src2}", + (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # + "|cmle.4s\t$dst, $src1, $src2}", + (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # + "|cmle.2d\t$dst, $src1, $src2}", + (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlt.8b\t$dst, $src1, $src2}", + (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlt.16b\t$dst, $src1, $src2}", + (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlt.4h\t$dst, $src1, $src2}", + (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlt.8h\t$dst, $src1, $src2}", + (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlt.2s\t$dst, $src1, $src2}", + (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlt.4s\t$dst, $src1, $src2}", + (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlt.2d\t$dst, $src1, $src2}", + (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # + "|fcmle.2s\t$dst, $src1, $src2}", + (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmle.4s\t$dst, $src1, $src2}", + (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmle.2d\t$dst, $src1, $src2}", + (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # + 
"|fcmlt.2s\t$dst, $src1, $src2}", + (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmlt.4s\t$dst, $src1, $src2}", + (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmlt.2d\t$dst, $src1, $src2}", + (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # + "|facle.2s\t$dst, $src1, $src2}", + (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # + "|facle.4s\t$dst, $src1, $src2}", + (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # + "|facle.2d\t$dst, $src1, $src2}", + (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # + "|faclt.2s\t$dst, $src1, $src2}", + (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # + "|faclt.4s\t$dst, $src1, $src2}", + (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # + "|faclt.2d\t$dst, $src1, $src2}", + (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; //===----------------------------------------------------------------------===// -// Extract (immediate) +// Advanced SIMD three scalar instructions. //===----------------------------------------------------------------------===// -// Contains: EXTR + alias ROR - -def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0, - (outs GPR32:$Rd), - (ins GPR32:$Rn, GPR32:$Rm, bitfield32_imm:$LSB), - "extr\t$Rd, $Rn, $Rm, $LSB", - [(set i32:$Rd, - (A64Extr i32:$Rn, i32:$Rm, imm:$LSB))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1, - (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB), - "extr\t$Rd, $Rn, $Rm, $LSB", - [(set i64:$Rd, - (A64Extr i64:$Rn, i64:$Rm, imm:$LSB))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - -def : InstAlias<"ror $Rd, $Rs, $LSB", - (EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>; -def : InstAlias<"ror $Rd, $Rs, $LSB", - (EXTRxxxi GPR64:$Rd, GPR64:$Rs, GPR64:$Rs, bitfield64_imm:$LSB)>; - -def : Pat<(rotr i32:$Rn, bitfield32_imm:$LSB), - (EXTRwwwi $Rn, $Rn, bitfield32_imm:$LSB)>; -def : Pat<(rotr i64:$Rn, bitfield64_imm:$LSB), - (EXTRxxxi $Rn, $Rn, bitfield64_imm:$LSB)>; + +defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; +defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FABD64 FPR64:$Rn, FPR64:$Rm)>; +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", + int_aarch64_neon_facge>; +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", + int_aarch64_neon_facgt>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", 
int_aarch64_neon_fmulx>; +defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>; +defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; +defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; +defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; + +def : InstAlias<"cmls $dst, $src1, $src2", + (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmle $dst, $src1, $src2", + (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlo $dst, $src1, $src2", + (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlt $dst, $src1, $src2", + (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; //===----------------------------------------------------------------------===// -// Floating-point compare instructions +// Advanced SIMD three scalar instructions (mixed operands). 
//===----------------------------------------------------------------------===// -// Contains: FCMP, FCMPE +defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", + int_aarch64_neon_sqdmulls_scalar>; +defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; +defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; + +def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; -def fpzero_asmoperand : AsmOperandClass { - let Name = "FPZero"; - let ParserMethod = "ParseFPImmOperand"; - let DiagnosticType = "FPZero"; -} +//===----------------------------------------------------------------------===// +// Advanced SIMD two scalar instructions. +//===----------------------------------------------------------------------===// -def fpz32 : Operand, - ComplexPattern { - let ParserMatchClass = fpzero_asmoperand; - let PrintMethod = "printFPZeroOperand"; - let DecoderMethod = "DecodeFPZeroOperand"; -} +defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; +defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; +defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; +defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; +defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", + int_aarch64_neon_suqadd>; +defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", 
int_aarch64_neon_scalar_uqxtn>; +defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", + int_aarch64_neon_usqadd>; + +def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), + (FCVTASv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), + (FCVTAUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))), + (FCVTMSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))), + (FCVTMUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))), + (FCVTNSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))), + (FCVTNUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), + (FCVTPSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), + (FCVTPUv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), + (FRECPXv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), + (FRECPXv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// Here are the patterns for 8 and 16-bits to float. +// 8-bits -> float. +multiclass UIntToFPROLoadPat { + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub))>; + + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub))>; +} + +defm : UIntToFPROLoadPat; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> float. +defm : UIntToFPROLoadPat; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits are handled in target specific dag combine: +// performIntToFpCombine. +// 64-bits integer to 32-bits floating point, not possible with +// UCVTF on floating point registers (both source and destination +// must have the same size). + +// Here are the patterns for 8, 16, 32, and 64-bits to double. +// 8-bits -> double. 
+defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> double. +defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits -> double. +defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. -def fpz64 : Operand, - ComplexPattern { - let ParserMatchClass = fpzero_asmoperand; - let PrintMethod = "printFPZeroOperand"; - let DecoderMethod = "DecodeFPZeroOperand"; -} +//===----------------------------------------------------------------------===// +// Advanced SIMD three different-sized vector instructions. +//===----------------------------------------------------------------------===// -def fpz64movi : Operand, - ComplexPattern { - let ParserMatchClass = fpzero_asmoperand; - let PrintMethod = "printFPZeroOperand"; - let DecoderMethod = "DecodeFPZeroOperand"; -} - -multiclass A64I_fpcmpSignal type, bit imm, dag ins, dag pattern> { - def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0}, - (outs), ins, "fcmp\t$Rn, $Rm", [pattern], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Defs = [NZCV]; - } - - def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0}, - (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Defs = [NZCV]; - } -} - -defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm), - (set NZCV, (A64cmp f32:$Rn, f32:$Rm))>; -defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm), - (set NZCV, (A64cmp f64:$Rn, f64:$Rm))>; - -// What would be Rm should be written as 0; note that even though it's called -// "$Rm" here to fit in with the InstrFormats, it's actually an immediate. 
-defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Rm), - (set NZCV, (A64cmp f32:$Rn, fpz32:$Rm))>; - -defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Rm), - (set NZCV, (A64cmp f64:$Rn, fpz64:$Rm))>; - - -//===----------------------------------------------------------------------===// -// Floating-point conditional compare instructions -//===----------------------------------------------------------------------===// -// Contains: FCCMP, FCCMPE - -class A64I_fpccmpImpl type, bit op, RegisterClass FPR, string asmop> - : A64I_fpccmp<0b0, 0b0, type, op, - (outs), - (ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond), - !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"), - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Defs = [NZCV]; -} - -def FCCMPss : A64I_fpccmpImpl<0b00, 0b0, FPR32, "fccmp">; -def FCCMPEss : A64I_fpccmpImpl<0b00, 0b1, FPR32, "fccmpe">; -def FCCMPdd : A64I_fpccmpImpl<0b01, 0b0, FPR64, "fccmp">; -def FCCMPEdd : A64I_fpccmpImpl<0b01, 0b1, FPR64, "fccmpe">; - -//===----------------------------------------------------------------------===// -// Floating-point conditional select instructions -//===----------------------------------------------------------------------===// -// Contains: FCSEL - -let Uses = [NZCV] in { - def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd), - (ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond), - "fcsel\t$Rd, $Rn, $Rm, $Cond", - [(set f32:$Rd, - (simple_select f32:$Rn, f32:$Rm))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - - def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd), - (ins FPR64:$Rn, FPR64:$Rm, cond_code_op:$Cond), - "fcsel\t$Rd, $Rn, $Rm, $Cond", - [(set f64:$Rd, - (simple_select f64:$Rn, f64:$Rm))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; -} - -//===----------------------------------------------------------------------===// -// Floating-point data-processing (1 source) -//===----------------------------------------------------------------------===// -// Contains: FMOV, FABS, FNEG, FSQRT, FCVT, FRINT[NPMZAXI]. - -def FPNoUnop : PatFrag<(ops node:$val), (fneg node:$val), - [{ (void)N; return false; }]>; - -// First we do the fairly trivial bunch with uniform "OP s, s" and "OP d, d" -// syntax. Default to no pattern because most are odd enough not to have one. 
-multiclass A64I_fpdp1sizes opcode, string asmstr, - SDPatternOperator opnode = FPNoUnop> { - def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn), - !strconcat(asmstr, "\t$Rd, $Rn"), - [(set f32:$Rd, (opnode f32:$Rn))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn), - !strconcat(asmstr, "\t$Rd, $Rn"), - [(set f64:$Rd, (opnode f64:$Rn))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FMOV : A64I_fpdp1sizes<0b000000, "fmov">; -defm FABS : A64I_fpdp1sizes<0b000001, "fabs", fabs>; -defm FNEG : A64I_fpdp1sizes<0b000010, "fneg", fneg>; -let SchedRW = [WriteFPSqrt, ReadFPSqrt] in { - defm FSQRT : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>; -} - -defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">; -defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>; -defm FRINTM : A64I_fpdp1sizes<0b001010, "frintm", ffloor>; -defm FRINTZ : A64I_fpdp1sizes<0b001011, "frintz", ftrunc>; -defm FRINTA : A64I_fpdp1sizes<0b001100, "frinta">; -defm FRINTX : A64I_fpdp1sizes<0b001110, "frintx", frint>; -defm FRINTI : A64I_fpdp1sizes<0b001111, "frinti", fnearbyint>; - -// The FCVT instrucitons have different source and destination register-types, -// but the fields are uniform everywhere a D-register (say) crops up. Package -// this information in a Record. -class FCVTRegType fld, ValueType vt> { - RegisterClass Class = rc; - ValueType VT = vt; - bit t1 = fld{1}; - bit t0 = fld{0}; -} - -def FCVT16 : FCVTRegType; -def FCVT32 : FCVTRegType; -def FCVT64 : FCVTRegType; - -class A64I_fpdp1_fcvt - : A64I_fpdp1<0b0, 0b0, {SrcReg.t1, SrcReg.t0}, - {0,0,0,1, DestReg.t1, DestReg.t0}, - (outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn), - "fcvt\t$Rd, $Rn", - [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - -def FCVTds : A64I_fpdp1_fcvt; -def FCVThs : A64I_fpdp1_fcvt; -def FCVTsd : A64I_fpdp1_fcvt; -def FCVThd : A64I_fpdp1_fcvt; -def FCVTsh : A64I_fpdp1_fcvt; -def FCVTdh : A64I_fpdp1_fcvt; - - -//===----------------------------------------------------------------------===// -// Floating-point data-processing (2 sources) instructions -//===----------------------------------------------------------------------===// -// Contains: FMUL, FDIV, FADD, FSUB, FMAX, FMIN, FMAXNM, FMINNM, FNMUL - -def FPNoBinop : PatFrag<(ops node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs), - [{ (void)N; return false; }]>; - -multiclass A64I_fpdp2sizes opcode, string asmstr, - SDPatternOperator opnode> { - def sss : A64I_fpdp2<0b0, 0b0, 0b00, opcode, - (outs FPR32:$Rd), - (ins FPR32:$Rn, FPR32:$Rm), - !strconcat(asmstr, "\t$Rd, $Rn, $Rm"), - [(set f32:$Rd, (opnode f32:$Rn, f32:$Rm))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode, - (outs FPR64:$Rd), - (ins FPR64:$Rn, FPR64:$Rm), - !strconcat(asmstr, "\t$Rd, $Rn, $Rm"), - [(set f64:$Rd, (opnode f64:$Rn, f64:$Rm))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; -} - -let isCommutable = 1 in { - let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { - defm FMUL : A64I_fpdp2sizes<0b0000, "fmul", fmul>; - } - defm FADD : A64I_fpdp2sizes<0b0010, "fadd", fadd>; - - // No patterns for these. 
- defm FMAX : A64I_fpdp2sizes<0b0100, "fmax", FPNoBinop>; - defm FMIN : A64I_fpdp2sizes<0b0101, "fmin", FPNoBinop>; - defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>; - defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>; - - let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { - defm FNMUL : A64I_fpdp2sizes<0b1000, "fnmul", - PatFrag<(ops node:$lhs, node:$rhs), - (fneg (fmul node:$lhs, node:$rhs))> >; - } -} - -let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in { - defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>; -} -defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>; - -//===----------------------------------------------------------------------===// -// Floating-point data-processing (3 sources) instructions -//===----------------------------------------------------------------------===// -// Contains: FMADD, FMSUB, FNMADD, FNMSUB - -def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra), - (fma (fneg node:$Rn), node:$Rm, node:$Ra)>; -def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra), - (fma node:$Rn, node:$Rm, (fneg node:$Ra))>; -def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra), - (fma (fneg node:$Rn), node:$Rm, (fneg node:$Ra))>; - -class A64I_fpdp3Impl type, bit o1, bit o0, SDPatternOperator fmakind> - : A64I_fpdp3<0b0, 0b0, type, o1, o0, (outs FPR:$Rd), - (ins FPR:$Rn, FPR:$Rm, FPR:$Ra), - !strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"), - [(set VT:$Rd, (fmakind VT:$Rn, VT:$Rm, VT:$Ra))], - NoItinerary>, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]>; - -def FMADDssss : A64I_fpdp3Impl<"fmadd", FPR32, f32, 0b00, 0b0, 0b0, fma>; -def FMSUBssss : A64I_fpdp3Impl<"fmsub", FPR32, f32, 0b00, 0b0, 0b1, fmsub>; -def FNMADDssss : A64I_fpdp3Impl<"fnmadd", FPR32, f32, 0b00, 0b1, 0b0, fnmadd>; -def FNMSUBssss : A64I_fpdp3Impl<"fnmsub", FPR32, f32, 0b00, 0b1, 0b1, fnmsub>; - -def FMADDdddd : A64I_fpdp3Impl<"fmadd", FPR64, f64, 0b01, 0b0, 0b0, fma>; -def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>; -def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>; -def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>; - -// Extra patterns for when we're allowed to optimise separate multiplication and -// addition. 
-let Predicates = [HasFPARMv8, UseFusedMAC] in { -def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))), - (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))), - (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))), - (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -def : Pat<(f32 (fsub (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)), - (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))), - (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))), - (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))), - (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -def : Pat<(f64 (fsub (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)), - (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -} - - -//===----------------------------------------------------------------------===// -// Floating-point <-> fixed-point conversion instructions -//===----------------------------------------------------------------------===// -// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF - -// #1-#32 allowed, encoded as "64 - -def fixedpos_asmoperand_i32 : AsmOperandClass { - let Name = "CVTFixedPos32"; - let RenderMethod = "addCVTFixedPosOperands"; - let PredicateMethod = "isCVTFixedPos<32>"; - let DiagnosticType = "CVTFixedPos32"; -} - -// Also encoded as "64 - " but #1-#64 allowed. -def fixedpos_asmoperand_i64 : AsmOperandClass { - let Name = "CVTFixedPos64"; - let RenderMethod = "addCVTFixedPosOperands"; - let PredicateMethod = "isCVTFixedPos<64>"; - let DiagnosticType = "CVTFixedPos64"; -} - -// We need the cartesian product of f32/f64 i32/i64 operands for -// conversions: -// + Selection needs to use operands of correct floating type -// + Assembly parsing and decoding depend on integer width -class cvtfix_i32_op - : Operand, - ComplexPattern", [fpimm]> { - let ParserMatchClass = fixedpos_asmoperand_i32; - let DecoderMethod = "DecodeCVT32FixedPosOperand"; - let PrintMethod = "printCVTFixedPosOperand"; -} - -class cvtfix_i64_op - : Operand, - ComplexPattern", [fpimm]> { - let ParserMatchClass = fixedpos_asmoperand_i64; - let PrintMethod = "printCVTFixedPosOperand"; -} - -// Because of the proliferation of weird operands, it's not really -// worth going for a multiclass here. Oh well. 
- -class A64I_fptofix type, bits<3> opcode, - RegisterClass GPR, RegisterClass FPR, - ValueType DstTy, ValueType SrcTy, - Operand scale_op, string asmop, SDNode cvtop> - : A64I_fpfixed, - Sched<[WriteFPALU, ReadFPALU]>; - -def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, i32, f32, - cvtfix_i32_op, "fcvtzs", fp_to_sint>; -def FCVTZSxsi : A64I_fptofix<0b1, 0b00, 0b000, GPR64, FPR32, i64, f32, - cvtfix_i64_op, "fcvtzs", fp_to_sint>; -def FCVTZUwsi : A64I_fptofix<0b0, 0b00, 0b001, GPR32, FPR32, i32, f32, - cvtfix_i32_op, "fcvtzu", fp_to_uint>; -def FCVTZUxsi : A64I_fptofix<0b1, 0b00, 0b001, GPR64, FPR32, i64, f32, - cvtfix_i64_op, "fcvtzu", fp_to_uint>; - -def FCVTZSwdi : A64I_fptofix<0b0, 0b01, 0b000, GPR32, FPR64, i32, f64, - cvtfix_i32_op, "fcvtzs", fp_to_sint>; -def FCVTZSxdi : A64I_fptofix<0b1, 0b01, 0b000, GPR64, FPR64, i64, f64, - cvtfix_i64_op, "fcvtzs", fp_to_sint>; -def FCVTZUwdi : A64I_fptofix<0b0, 0b01, 0b001, GPR32, FPR64, i32, f64, - cvtfix_i32_op, "fcvtzu", fp_to_uint>; -def FCVTZUxdi : A64I_fptofix<0b1, 0b01, 0b001, GPR64, FPR64, i64, f64, - cvtfix_i64_op, "fcvtzu", fp_to_uint>; - - -class A64I_fixtofp type, bits<3> opcode, - RegisterClass FPR, RegisterClass GPR, - ValueType DstTy, ValueType SrcTy, - Operand scale_op, string asmop, SDNode cvtop> - : A64I_fpfixed, - Sched<[WriteFPALU, ReadFPALU]>; - -def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, f32, i32, - cvtfix_i32_op, "scvtf", sint_to_fp>; -def SCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b010, FPR32, GPR64, f32, i64, - cvtfix_i64_op, "scvtf", sint_to_fp>; -def UCVTFswi : A64I_fixtofp<0b0, 0b00, 0b011, FPR32, GPR32, f32, i32, - cvtfix_i32_op, "ucvtf", uint_to_fp>; -def UCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b011, FPR32, GPR64, f32, i64, - cvtfix_i64_op, "ucvtf", uint_to_fp>; -def SCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b010, FPR64, GPR32, f64, i32, - cvtfix_i32_op, "scvtf", sint_to_fp>; -def SCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b010, FPR64, GPR64, f64, i64, - cvtfix_i64_op, "scvtf", sint_to_fp>; -def UCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b011, FPR64, GPR32, f64, i32, - cvtfix_i32_op, "ucvtf", uint_to_fp>; -def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, f64, i64, - cvtfix_i64_op, "ucvtf", uint_to_fp>; - -//===----------------------------------------------------------------------===// -// Floating-point <-> integer conversion instructions -//===----------------------------------------------------------------------===// -// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF - -class A64I_fpintI type, bits<2> rmode, bits<3> opcode, - RegisterClass DestPR, RegisterClass SrcPR, string asmop> - : A64I_fpint, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass A64I_fptointRM rmode, bit o2, string asmop> { - def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0}, - GPR32, FPR32, asmop # "s">; - def Sxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 0}, - GPR64, FPR32, asmop # "s">; - def Uws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 1}, - GPR32, FPR32, asmop # "u">; - def Uxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 1}, - GPR64, FPR32, asmop # "u">; - - def Swd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 0}, - GPR32, FPR64, asmop # "s">; - def Sxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 0}, - GPR64, FPR64, asmop # "s">; - def Uwd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 1}, - GPR32, FPR64, asmop # "u">; - def Uxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 1}, - GPR64, FPR64, asmop # "u">; -} - -defm FCVTN : A64I_fptointRM<0b00, 0b0, "fcvtn">; -defm FCVTP : A64I_fptointRM<0b01, 0b0, "fcvtp">; -defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">; -defm 
FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">; -defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">; - -let Predicates = [HasFPARMv8] in { -def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>; -def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>; -def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>; -def : Pat<(i64 (fp_to_uint f32:$Rn)), (FCVTZUxs $Rn)>; -def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>; -def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>; -def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>; -def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>; -} - -multiclass A64I_inttofp { - def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>; - def CVTFsx : A64I_fpintI<0b1, 0b00, 0b00, {0, 1, o0}, FPR32, GPR64, asmop>; - def CVTFdw : A64I_fpintI<0b0, 0b01, 0b00, {0, 1, o0}, FPR64, GPR32, asmop>; - def CVTFdx : A64I_fpintI<0b1, 0b01, 0b00, {0, 1, o0}, FPR64, GPR64, asmop>; -} - -defm S : A64I_inttofp<0b0, "scvtf">; -defm U : A64I_inttofp<0b1, "ucvtf">; - -let Predicates = [HasFPARMv8] in { -def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>; -def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>; -def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>; -def : Pat<(f64 (sint_to_fp i64:$Rn)), (SCVTFdx $Rn)>; -def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>; -def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>; -def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>; -def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>; -} - -def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">; -def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">; -def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">; -def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">; - -let Predicates = [HasFPARMv8] in { -def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>; -def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>; -def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>; -def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>; -} - -def lane1_asmoperand : AsmOperandClass { - let Name = "Lane1"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "Lane1"; -} - -def lane1 : Operand { - let ParserMatchClass = lane1_asmoperand; - let PrintMethod = "printBareImmOperand"; -} - -let DecoderMethod = "DecodeFMOVLaneInstruction" in { - def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110, - (outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane), - "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111, - (outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane), - "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -let Predicates = [HasFPARMv8] in { -def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]", - (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>; - -def : InstAlias<"fmov $Rd.2d[$Lane], $Rn", - (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>; -} - -//===----------------------------------------------------------------------===// -// Floating-point immediate instructions -//===----------------------------------------------------------------------===// -// Contains: FMOV - -def fpimm_asmoperand : AsmOperandClass { - let Name = "FMOVImm"; - let ParserMethod = "ParseFPImmOperand"; - let DiagnosticType = "FPImm"; -} - -// The MCOperand for these instructions are the encoded 8-bit values. 
-def SDXF_fpimm : SDNodeXFormgetValueAPF(), Imm8); - return CurDAG->getTargetConstant(Imm8, MVT::i32); +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", + int_aarch64_neon_sabd>; +defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", + int_aarch64_neon_sabd>; +defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; +defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", + BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; +defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; +defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", + int_aarch64_neon_sqdmull>; +defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", + BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; +defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", + BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; +defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", + int_aarch64_neon_uabd>; +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", + BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; +defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", + BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; +defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; +defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", + BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; +defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", + BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; + +// Patterns for 64-bit pmull +def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), + (PMULLv1i64 V64:$Rn, V64:$Rm)>; +def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), + (vector_extract (v2i64 V128:$Rm), (i64 1))), + (PMULLv2i64 V128:$Rn, V128:$Rm)>; + +// CodeGen patterns for addhn and subhn instructions, which can actually be +// written in LLVM IR without too much difficulty. 
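// Editorial illustration (not part of the patch; value names are ours): the
// v8i8 ADDHN pattern below corresponds to IR along the lines of
//   %sum = add <8 x i16> %a, %b
//   %hi  = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8,
//                                i16 8, i16 8, i16 8, i16 8>
//   %res = trunc <8 x i16> %hi to <8 x i8>
// i.e. add, logical shift right by half the element width, then truncate;
// the splat shift is what appears as the AArch64vlshr node in these patterns.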
+ +// ADDHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), + (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 8))))), + (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +// SUBHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), + (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 8))))), + (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector instruction. +//---------------------------------------------------------------------------- + +defm EXT : SIMDBitwiseExtract<"ext">; + +def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; + +// We use EXT to handle extract_subvector to copy the upper 64-bits of a +// 128-bit vector. 
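// Editorial note (not part of the patch; value names are ours): at the IR
// level this case typically comes from a shufflevector that takes the high
// half of a 128-bit vector, e.g.
//   %hi = shufflevector <16 x i8> %v, <16 x i8> undef,
//                       <8 x i32> <i32 8, i32 9, i32 10, i32 11,
//                                  i32 12, i32 13, i32 14, i32 15>
// which reaches instruction selection as the extract_subvector node matched
// below.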
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>; +defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>; +defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>; +defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; +defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; +defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX instructions +//---------------------------------------------------------------------------- + +defm TBL : SIMDTableLookup< 0, "tbl">; +defm TBX : SIMDTableLookupTied<1, "tbx">; + +def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBLv16i8One V128:$Ri, V128:$Rn)>; + +def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd), + (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), + (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY instruction +//---------------------------------------------------------------------------- + +defm CPY : SIMDScalarCPY<"cpy">; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; +defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; +defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; +defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; +def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))), + (FMAXNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))), + (FMAXNMPv2i64p V128:$Rn)>; +def : Pat<(f32 
(int_aarch64_neon_fmaxv (v2f32 V64:$Rn))), + (FMAXPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))), + (FMAXPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))), + (FMINNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))), + (FMINNMPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))), + (FMINPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))), + (FMINPv2i64p V128:$Rn)>; + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; +def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; +def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; +def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; +def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; +def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; +def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; + +def DUPv2i64lane : SIMDDup64FromElement; +def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; +def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; +def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; +def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; +def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; +def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; + +def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), + (v2f32 (DUPv2i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))), + (v4f32 (DUPv4i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), + (v2f64 (DUPv2i64lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), + (i64 0)))>; + +def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), + (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; + +// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane +// instruction even if the types don't match: we just have to remap the lane +// carefully. N.b. this trick only applies to truncations. 
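// Editorial worked example (not part of the patch): duplicating the i32
// truncation of 64-bit lane 1 of a v2i64 reads the low 32 bits of that lane,
// which is 32-bit lane 2*1 = 2 of the same register viewed as v4i32; the
// VecIndex_x2/_x4/_x8 transforms below scale the lane index by the
// corresponding element-width ratio.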
+def VecIndex_x2 : SDNodeXFormgetTargetConstant(2 * N->getZExtValue(), MVT::i64); }]>; - -class fmov_operand - : Operand, - PatLeaf<(FT fpimm), [{ return A64Imms::isFPImm(N->getValueAPF()); }], - SDXF_fpimm> { - let PrintMethod = "printFPImmOperand"; - let ParserMatchClass = fpimm_asmoperand; -} - -def fmov32_operand : fmov_operand; -def fmov64_operand : fmov_operand; - -class A64I_fpimm_impl type, RegisterClass Reg, ValueType VT, - Operand fmov_operand> - : A64I_fpimm<0b0, 0b0, type, 0b00000, - (outs Reg:$Rd), - (ins fmov_operand:$Imm8), - "fmov\t$Rd, $Imm8", - [(set VT:$Rd, fmov_operand:$Imm8)], - NoItinerary>, - Sched<[WriteFPALU]>; - -def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>; -def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>; - -//===----------------------------------------------------------------------===// -// Load-register (literal) instructions -//===----------------------------------------------------------------------===// -// Contains: LDR, LDRSW, PRFM - -def ldrlit_label_asmoperand : AsmOperandClass { - let Name = "LoadLitLabel"; - let RenderMethod = "addLabelOperands<19, 4>"; - let DiagnosticType = "Label"; -} - -def ldrlit_label : Operand { - let EncoderMethod = "getLoadLitLabelOpValue"; - - // This label is a 19-bit offset from PC, scaled by the instruction-width: 4. - let PrintMethod = "printLabelOperand<19, 4>"; - let ParserMatchClass = ldrlit_label_asmoperand; - let OperandType = "OPERAND_PCREL"; -} - -// Various instructions take an immediate value (which can always be used), -// where some numbers have a symbolic name to make things easier. These operands -// and the associated functions abstract away the differences. -multiclass namedimm { - def _asmoperand : AsmOperandClass { - let Name = "NamedImm" # prefix; - let PredicateMethod = "isUImm"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "ParseNamedImmOperand<" # mapper # ">"; - let DiagnosticType = "NamedImm_" # prefix; - } - - def _op : Operand { - let ParserMatchClass = !cast(prefix # "_asmoperand"); - let PrintMethod = "printNamedImmOperand<" # mapper # ">"; - let DecoderMethod = "DecodeNamedImmOperand<" # mapper # ">"; - } -} - -defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">; - -class A64I_LDRlitSimple opc, bit v, RegisterClass OutReg, - list patterns = []> - : A64I_LDRlit, - Sched<[WriteLd]>; - -let mayLoad = 1 in { - def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>; - def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>; -} - -let Predicates = [HasFPARMv8] in { -def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>; -def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>; -} - -let mayLoad = 1 in { - let Predicates = [HasFPARMv8] in { - def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>; - } - - def LDRSWx_lit : A64I_LDRlit<0b10, 0b0, - (outs GPR64:$Rt), - (ins ldrlit_label:$Imm19), - "ldrsw\t$Rt, $Imm19", - [], NoItinerary>, - Sched<[WriteLd]>; - - def PRFM_lit : A64I_LDRlit<0b11, 0b0, - (outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19), - "prfm\t$Rt, $Imm19", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]>; -} - -//===----------------------------------------------------------------------===// -// Load-store exclusive instructions -//===----------------------------------------------------------------------===// -// Contains: STXRB, STXRH, STXR, LDXRB, LDXRH, LDXR. 
STXP, LDXP, STLXRB, -// STLXRH, STLXR, LDAXRB, LDAXRH, LDAXR, STLXP, LDAXP, STLRB, -// STLRH, STLR, LDARB, LDARH, LDAR - -// Since these instructions have the undefined register bits set to 1 in -// their canonical form, we need a post encoder method to set those bits -// to 1 when encoding these instructions. We do this using the -// fixLoadStoreExclusive function. This function has template parameters: -// -// fixLoadStoreExclusive -// -// hasRs indicates that the instruction uses the Rs field, so we won't set -// it to 1 (and the same for Rt2). We don't need template parameters for -// the other register fiels since Rt and Rn are always used. - -// This operand parses a GPR64xsp register, followed by an optional immediate -// #0. -def GPR64xsp0_asmoperand : AsmOperandClass { - let Name = "GPR64xsp0"; - let PredicateMethod = "isWrappedReg"; - let RenderMethod = "addRegOperands"; - let ParserMethod = "ParseLSXAddressOperand"; - // Diagnostics are provided by ParserMethod -} - -def GPR64xsp0 : RegisterOperand { - let ParserMatchClass = GPR64xsp0_asmoperand; -} - -//===---------------------------------- -// Store-exclusive (releasing & normal) -//===---------------------------------- - -class A64I_SRexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_stn { - let mayStore = 1; - let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; - let Constraints = "@earlyclobber $Rs"; -} - -multiclass A64I_SRex opcode, string prefix> { - def _byte: A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"), - (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _hword: A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"), - (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [],NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _word: A64I_SRexs_impl<0b10, opcode, asmstr, - (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _dword: A64I_SRexs_impl<0b11, opcode, asmstr, - (outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; -} - -defm STXR : A64I_SRex<"stxr", 0b000, "STXR">; -defm STLXR : A64I_SRex<"stlxr", 0b001, "STLXR">; - -//===---------------------------------- -// Loads -//===---------------------------------- - -class A64I_LRexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_tn { - let mayLoad = 1; - let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; -} - -multiclass A64I_LRex opcode> { - def _byte: A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"), - (outs GPR32:$Rt), (ins GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd]>; - - def _hword: A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"), - (outs GPR32:$Rt), (ins GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd]>; - - def _word: A64I_LRexs_impl<0b10, opcode, asmstr, - (outs GPR32:$Rt), (ins GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd]>; - - def _dword: A64I_LRexs_impl<0b11, opcode, asmstr, - (outs GPR64:$Rt), (ins GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd]>; -} - -defm LDXR : A64I_LRex<"ldxr", 0b000>; -defm LDAXR : A64I_LRex<"ldaxr", 0b001>; -defm LDAR : A64I_LRex<"ldar", 0b101>; - -class acquiring_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Acquire || Ordering == SequentiallyConsistent; +def VecIndex_x4 : 
SDNodeXFormgetTargetConstant(4 * N->getZExtValue(), MVT::i64); }]>; - -def atomic_load_acquire_8 : acquiring_load; -def atomic_load_acquire_16 : acquiring_load; -def atomic_load_acquire_32 : acquiring_load; -def atomic_load_acquire_64 : acquiring_load; - -def : Pat<(atomic_load_acquire_8 i64:$Rn), (LDAR_byte $Rn)>; -def : Pat<(atomic_load_acquire_16 i64:$Rn), (LDAR_hword $Rn)>; -def : Pat<(atomic_load_acquire_32 i64:$Rn), (LDAR_word $Rn)>; -def : Pat<(atomic_load_acquire_64 i64:$Rn), (LDAR_dword $Rn)>; - -//===---------------------------------- -// Store-release (no exclusivity) -//===---------------------------------- - -class A64I_SLexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_tn { - let mayStore = 1; - let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; -} - -class releasing_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Release || Ordering == SequentiallyConsistent; +def VecIndex_x8 : SDNodeXFormgetTargetConstant(8 * N->getZExtValue(), MVT::i64); }]>; -def atomic_store_release_8 : releasing_store; -def atomic_store_release_16 : releasing_store; -def atomic_store_release_32 : releasing_store; -def atomic_store_release_64 : releasing_store; - -multiclass A64I_SLex opcode, string prefix> { - def _byte: A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"), - (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_8 i64:$Rn, i32:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _hword: A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"), - (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_16 i64:$Rn, i32:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _word: A64I_SLexs_impl<0b10, opcode, asmstr, - (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_32 i64:$Rn, i32:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _dword: A64I_SLexs_impl<0b11, opcode, asmstr, - (outs), (ins GPR64:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_64 i64:$Rn, i64:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; -} - -defm STLR : A64I_SLex<"stlr", 0b101, "STLR">; - -//===---------------------------------- -// Store-exclusive pair (releasing & normal) -//===---------------------------------- - -class A64I_SPexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_stt2n { - let mayStore = 1; -} - - -multiclass A64I_SPex opcode> { - def _word: A64I_SPexs_impl<0b10, opcode, asmstr, (outs), - (ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2, - GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; - - def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs), - (ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2, - GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; -} - -defm STXP : A64I_SPex<"stxp", 0b010>; -defm STLXP : A64I_SPex<"stlxp", 0b011>; - -//===---------------------------------- -// Load-exclusive pair (acquiring & normal) -//===---------------------------------- - -class A64I_LPexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_tt2n { - let mayLoad = 1; - let DecoderMethod = "DecodeLoadPairExclusiveInstruction"; - let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; -} - -multiclass A64I_LPex opcode> { - def _word: A64I_LPexs_impl<0b10, opcode, asmstr, - (outs GPR32:$Rt, GPR32:$Rt2), - (ins 
GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]>; - - def _dword: A64I_LPexs_impl<0b11, opcode, asmstr, - (outs GPR64:$Rt, GPR64:$Rt2), - (ins GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]>; -} - -defm LDXP : A64I_LPex<"ldxp", 0b010>; -defm LDAXP : A64I_LPex<"ldaxp", 0b011>; +multiclass DUPWithTruncPats { + def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn), + imm:$idx)))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + + def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn), + imm:$idx)))), + (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; +} + +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; + +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; + +multiclass DUPWithTrunci64Pats { + def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), + imm:$idx))))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + + def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), + imm:$idx))))), + (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; +} + +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; + +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; + +// SMOV and UMOV definitions, with some extra patterns for convenience +defm SMOV : SMov; +defm UMOV : UMov; + +def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8), + (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), + (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; + +// Extracting i8 or i16 elements will have the zero-extend transformed to +// an 'and' mask by type legalization since neither i8 nor i16 are legal types +// for AArch64. Match these patterns here since UMOV already zeroes out the high +// bits of the destination register. 
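As a rough source-level illustration of the lane extracts handled by the SMOV/UMOV definitions above and by the and-mask patterns just below (a sketch only; it assumes an AArch64 toolchain providing <arm_neon.h>, and the function names are invented for the example):

  #include <arm_neon.h>
  #include <cstdint>

  // Sign-extended lane read: expected to select SMOV (smov w0, v0.b[3]).
  int32_t lane_sext(int8x16_t v) { return vgetq_lane_s8(v, 3); }

  // Zero-extended lane read: type legalization turns the zext into an
  // 'and' with 0xff, which the UMOV patterns below fold away
  // (umov w0, v0.b[3]), since UMOV already clears the high bits.
  uint32_t lane_zext(uint8x16_t v) { return vgetq_lane_u8(v, 3); }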
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), + (i32 0xff)), + (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), + (i32 0xffff)), + (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; + +defm INS : SIMDIns; + +def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (i64 FPR64:$Rn), dsub))>; + +def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; + +def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi32lane + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0)), + dsub)>; +def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (INSvi32lane + V128:$Rn, VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0))>; +def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), + (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), + (INSvi64lane + V128:$Rn, VectorIndexD:$imm, + (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), + (i64 0))>; + +// Copy an element at a constant index in one vector into a constant indexed +// element of another. 
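The element-copy patterns described here correspond, at the source level, to a get-lane feeding a set-lane. A minimal sketch (assuming <arm_neon.h>; the function name is invented) of code that the vcopy_lane intrinsic patterns and the Neon_INS_elt_pattern multiclass below are meant to catch:

  #include <arm_neon.h>

  // Copy lane 2 of 'src' into lane 1 of 'dst'. The extract feeding an
  // insert is matched below as a single element-to-element INS
  // (ins/mov v0.s[1], v1.s[2]).
  int32x4_t copy_lane(int32x4_t dst, int32x4_t src) {
    return vsetq_lane_s32(vgetq_lane_s32(src, 2), dst, 1);
  }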
+// FIXME refactor to a shared class/dev parameterized on vector type, vector +// index type and INS extension +def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane + (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), + VectorIndexB:$idx2)), + (v16i8 (INSvi8lane + V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) + )>; +def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane + (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), + VectorIndexH:$idx2)), + (v8i16 (INSvi16lane + V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) + )>; +def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane + (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), + VectorIndexS:$idx2)), + (v4i32 (INSvi32lane + V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) + )>; +def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane + (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), + VectorIndexD:$idx2)), + (v2i64 (INSvi64lane + V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) + )>; + +multiclass Neon_INS_elt_pattern { + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; + + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), + imm:$Immd, V128:$Rn, imm:$Immn), + dsub)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), + dsub)>; +} + +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; + + +// Floating point vector extractions are codegen'd as either a sequence of +// subregister extractions, possibly fed by an INS if the lane number is +// anything other than zero. +def : Pat<(vector_extract (v2f64 V128:$Rn), 0), + (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), 0), + (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; +def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), + (f64 (EXTRACT_SUBREG + (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexD:$idx), + dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), + (f32 (EXTRACT_SUBREG + (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexS:$idx), + ssub))>; + +// All concat_vectors operations are canonicalised to act on i64 vectors for +// AArch64. In the general case we need an instruction, which had just as well be +// INS. 
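At the source level the concat_vectors node typically comes from combining two 64-bit halves. A small sketch (assuming <arm_neon.h>) of what the ConcatPat class below matches:

  #include <arm_neon.h>

  // concat_vectors of two 64-bit halves: the low half is placed with a
  // subregister copy and the high half with roughly 'ins v0.d[1], v1.d[0]'
  // per ConcatPat below. If 'hi' were undef, only the subregister copy
  // would remain (ConcatUndefPat).
  int32x4_t combine_halves(int32x2_t lo, int32x2_t hi) {
    return vcombine_s32(lo, hi);
  }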
+class ConcatPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), + (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; + +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; + +// If the high lanes are undef, though, we can just ignore them: +class ConcatUndefPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; + +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; +defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; +defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; +defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; +defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; +defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; +defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; +defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; + +multiclass SIMDAcrossLanesSignedIntrinsic { +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it + def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 
(EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; +} + +multiclass SIMDAcrossLanesUnsignedIntrinsic { +// If there is a masking operation keeping only what has been actually +// generated, consume it. + def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; + +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; + +} + +multiclass SIMDAcrossLanesSignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +multiclass SIMDAcrossLanesUnsignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + 
(!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_aarch64_neon_saddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>; +def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>; +def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>; +def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>; +def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; +defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; + +// The vaddlv_s32 intrinsic gets mapped to SADDLP. +def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (SADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; +// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
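For reference, a source-level sketch (assuming <arm_neon.h>) of the across-lanes reductions these multiclasses cover, including the two-lane special cases that are handled by ADDP above and UADDLP below rather than by a true across-lanes instruction:

  #include <arm_neon.h>
  #include <cstdint>

  // Full across-lanes reduction: selects ADDV (addv b0, v0.16b).
  uint8_t sum16(uint8x16_t v) { return vaddvq_u8(v); }

  // Two-lane case: there is no 2s form of ADDV, so this becomes
  // addp v0.2s, v0.2s, v0.2s followed by a read of lane 0, as the
  // special-case pattern above describes.
  uint32_t sum2(uint32x2_t v) { return vaddv_u32(v); }

  // Widening two-lane case: mapped to UADDLP by the pattern below.
  uint64_t suml2(uint32x2_t v) { return vaddlv_u32(v); }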
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (UADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; + +//------------------------------------------------------------------------------ +// AdvSIMD modified immediate instructions +//------------------------------------------------------------------------------ + +// AdvSIMD BIC +defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>; +// AdvSIMD ORR +defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>; + +def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +// AdvSIMD FMOV +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, + "fmov", ".2d", + [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, + "fmov", ".2s", + [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, + "fmov", ".4s", + [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; + +// AdvSIMD MOVI + +// EDIT byte mask: scalar +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", + [(set FPR64:$Rd, simdimmtype10:$imm8)]>; +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 here. +def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), + (MOVID imm0_255:$shift)>; + +def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; + +def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; + +// EDIT byte mask: 2d + +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 in the pattern +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, + simdimmtype10, + "movi", ".2d", + [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; + + +// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
+// Complexity is added to break a tie with a plain MOVI. +let AddedComplexity = 1 in { +def : Pat<(f32 fpimm0), + (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, + Requires<[HasZCZ]>; +def : Pat<(f64 fpimm0), + (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, + Requires<[HasZCZ]>; +} + +def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; + +def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; + +def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; + +def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +// Per byte: 8b & 16b +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, + "movi", ".8b", + [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, + "movi", ".16b", + [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; + +// AdvSIMD MVNI + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; + +def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : 
InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let neverHasSideEffects = 1 in { + defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; +} + +// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the +// instruction expects the addend first, while the intrinsic expects it last. + +// On the other hand, there are quite a few valid combinatorial options due to +// the commutativity of multiplication and the fact that (-x) * y = x * (-y). +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + +multiclass FMLSIndexedAfterNegPatterns { + // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. 
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar + // (DUPLANE from 64-bit would be trivial). + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 (fneg V128:$Rm)), + VectorIndexD:$idx))), + (FMLSv2i64_indexed + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 (fneg FPR64Op:$Rm))))), + (FMLSv2i64_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 (fneg V64:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexS:$idx)>; +} + +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + +defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; + +def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv2i32_indexed V64:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv4i32_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), + (FMULv2i64_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), + (i64 0))>; + +defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; +defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, 
node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", + int_aarch64_neon_smull>; +defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; +defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", + int_aarch64_neon_umull>; + +// A scalar sqdmull with the second operand being a vector lane can be +// handled directly with the indexed instruction encoding. +def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (vector_extract (v4i32 V128:$Vm), + VectorIndexS:$idx)), + (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; +defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +// Codegen patterns for the above. We don't put these directly on the +// instructions because TableGen's type inference can't handle the truth. +// Having the same base pattern for fp <--> int totally freaks it out. +def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), + (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), + (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), + (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), + (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; + +defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; +defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; +defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : 
SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; +defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))>>; +defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, + (AArch64vashr node:$MHS, node:$RHS))>>; +defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))>>; +defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, + (AArch64vlshr node:$MHS, node:$RHS))>>; + +//---------------------------------------------------------------------------- +// AdvSIMD vector shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; +defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; +defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", + int_aarch64_neon_vcvtfxs2fp>; +defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", + int_aarch64_neon_rshrn>; +defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; +defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", + BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; +def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftL64:$imm))), + (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; +defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; +def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; +defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))> >; +defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", + BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>; + +defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, 
node:$RHS))>>; +defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", + int_aarch64_neon_vcvtfxu2fp>; +defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))> >; +defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", + BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>; +defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + +// SHRN patterns for when a logical right shift was used instead of arithmetic +// (the immediate guarantees no sign bits actually end up in the result so it +// doesn't matter). +def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), + (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), + (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), + (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; + +def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), + (trunc (AArch64vlshr (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)))), + (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), + (trunc (AArch64vlshr (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)))), + (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), + (trunc (AArch64vlshr (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)))), + (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + +// Vector sign and zero extensions are implemented with SSHLL and USSHLL. +// Anyexts are implemented as zexts. +def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +// Also match an extend from the upper half of a 128 bit source register. 
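A source-level sketch (assuming <arm_neon.h>) of the widening conversions covered here; the first function maps to SSHLL #0 (printed as sxtl via the aliases further down), while the second exercises the upper-half patterns that follow:

  #include <arm_neon.h>

  // Whole-register widen: sext v8i8 -> v8i16, i.e. sshll v0.8h, v0.8b, #0.
  int16x8_t widen_lo(int8x8_t v) { return vmovl_s8(v); }

  // Widen the upper half of a 128-bit source: the extract_subvector at
  // index 8 plus sext is matched below as sshll2 v0.8h, v0.16b, #0.
  int16x8_t widen_hi(int8x16_t v) { return vmovl_high_s8(v); }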
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (SSHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (SSHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (SSHLLv4i32_shift V128:$Rn, (i32 0))>; + +// Vector shift sxtl aliases +def : InstAlias<"sxtl.8h $dst, $src1", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.8h, $src1.8b", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.4s $dst, $src1", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.4s, $src1.4h", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.2d $dst, $src1", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.2d, $src1.2s", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift sxtl2 aliases +def : InstAlias<"sxtl2.8h $dst, $src1", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.8h, $src1.16b", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.4s $dst, $src1", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.4s, $src1.8h", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.2d $dst, $src1", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.2d, $src1.4s", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// Vector shift uxtl aliases +def : InstAlias<"uxtl.8h $dst, $src1", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.8h, $src1.8b", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.4s $dst, $src1", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.4s, $src1.4h", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.2d $dst, $src1", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.2d, $src1.2s", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift uxtl2 aliases +def : InstAlias<"uxtl2.8h $dst, $src1", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.8h, $src1.16b", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.4s $dst, $src1", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.4s, $src1.8h", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.2d $dst, $src1", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.2d, $src1.4s", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// These patterns are more complex because floating point loads do not +// support sign extension. 
+// The sign extension has to be explicitly added and is only supported for +// one step: byte-to-half, half-to-word, word-to-doubleword. +// SCVTF GPR -> FPR is 9 cycles. +// SCVTF FPR -> FPR is 4 cyclces. +// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles. +// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR +// and still being faster. +// However, this is not good for code size. +// 8-bits -> float. 2 sizes step-up. +class SExtLoadi8CVTf32Pat + : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))), + (SCVTFv1i32 (f32 (EXTRACT_SUBREG + (SSHLLv4i16_shift + (f64 + (EXTRACT_SUBREG + (SSHLLv8i8_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + bsub), + 0), + dsub)), + 0), + ssub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), + (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; +def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext), + (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>; +def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset), + (LDURBi GPR64sp:$Rn, simm9:$offset)>; + +// 16-bits -> float. 1 size step-up. +class SExtLoadi16CVTf32Pat + : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))), + (SCVTFv1i32 (f32 (EXTRACT_SUBREG + (SSHLLv4i16_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + hsub), + 0), + ssub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), + (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; +def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext), + (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>; +def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset), + (LDURHi GPR64sp:$Rn, simm9:$offset)>; + +// 32-bits to 32-bits are handled in target specific dag combine: +// performIntToFpCombine. +// 64-bits integer to 32-bits floating point, not possible with +// SCVTF on floating point registers (both source and destination +// must have the same size). + +// Here are the patterns for 8, 16, 32, and 64-bits to double. +// 8-bits -> double. 3 size step-up: give up. +// 16-bits -> double. 2 size step. +class SExtLoadi16CVTf64Pat + : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))), + (SCVTFv1i64 (f64 (EXTRACT_SUBREG + (SSHLLv2i32_shift + (f64 + (EXTRACT_SUBREG + (SSHLLv4i16_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + hsub), + 0), + dsub)), + 0), + dsub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), + (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; +def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext), + (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>; +def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset), + (LDURHi GPR64sp:$Rn, simm9:$offset)>; +// 32-bits -> double. 1 size step-up. 
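As a source-level sketch of what this family of patterns (including the SExtLoadi32CVTf64Pat class below) is optimizing, namely keeping the integer load and its sign extension on the FP/SIMD unit instead of paying for a GPR-to-FPR SCVTF (plain C++; the exact code generated depends on the addressing mode and on the NotForCodeSize predicate):

  #include <cstdint>

  // Outside of size-optimized builds, the expected lowering is roughly:
  //   ldr   s0, [x0]           ; 32-bit load straight into an FP register
  //   sshll v0.2d, v0.2s, #0   ; one-step sign extension on the SIMD unit
  //   scvtf d0, d0             ; FPR -> FPR convert
  double load_convert(const int32_t *p) { return static_cast<double>(*p); }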
+class SExtLoadi32CVTf64Pat + : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), + (SCVTFv1i64 (f64 (EXTRACT_SUBREG + (SSHLLv2i32_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + ssub), + 0), + dsub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; +def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; +def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; +def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + + +//---------------------------------------------------------------------------- +// AdvSIMD Load-Store Structure +//---------------------------------------------------------------------------- +defm LD1 : SIMDLd1Multiple<"ld1">; +defm LD2 : SIMDLd2Multiple<"ld2">; +defm LD3 : SIMDLd3Multiple<"ld3">; +defm LD4 : SIMDLd4Multiple<"ld4">; + +defm ST1 : SIMDSt1Multiple<"st1">; +defm ST2 : SIMDSt2Multiple<"st2">; +defm ST3 : SIMDSt3Multiple<"st3">; +defm ST4 : SIMDSt4Multiple<"st4">; + +class Ld1Pat + : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; + +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; + +class St1Pat + : Pat<(store ty:$Vt, GPR64sp:$Rn), + (INST ty:$Vt, GPR64sp:$Rn)>; + +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; + +//--- +// Single-element +//--- + +defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; +defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; +defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; +defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; +defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; +defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; +defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; +defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; +defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; +defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; +defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; +defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; +defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; +defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; +defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; +defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; +defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; +defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; +defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; +} + +def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv8b GPR64sp:$Rn)>; +def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv16b GPR64sp:$Rn)>; +def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : 
Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; +// Grab the floating point version too +def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; + +class Ld1Lane128Pat + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; + +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; + +class Ld1Lane64Pat + : Pat<(vector_insert (VTy VecListOne64:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (EXTRACT_SUBREG + (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), + VecIndex:$idx, GPR64sp:$Rn), + dsub)>; + +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; + + +defm LD1 : SIMDLdSt1SingleAliases<"ld1">; +defm LD2 : SIMDLdSt2SingleAliases<"ld2">; +defm LD3 : SIMDLdSt3SingleAliases<"ld3">; +defm LD4 : SIMDLdSt4SingleAliases<"ld4">; + +// Stores +defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; +defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; +defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; +defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; + +let AddedComplexity = 15 in +class St1Lane128Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; + +let AddedComplexity = 15 in +class St1Lane64Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; + +multiclass St1LanePost64Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; + +multiclass St1LanePost128Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 VecListOne128:$Vt, 
VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; + +let mayStore = 1, neverHasSideEffects = 1 in { +defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; +defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; +defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; +defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>; +defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>; +defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>; +defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>; +defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>; +defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>; +defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>; +defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>; +defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>; +} + +defm ST1 : SIMDLdSt1SingleAliases<"st1">; +defm ST2 : SIMDLdSt2SingleAliases<"st2">; +defm ST3 : SIMDLdSt3SingleAliases<"st3">; +defm ST4 : SIMDLdSt4SingleAliases<"st4">; + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>; +def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>; +def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>; +def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>; + +def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>; +def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>; +def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>; +def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>; +def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>; +def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>; +def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>; + +def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>; +def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>; +def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>; + +//---------------------------------------------------------------------------- +// Compiler-pseudos +//---------------------------------------------------------------------------- +// FIXME: Like for X86, these should go in their own separate .td file. + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may +// be copying from a truncate. But any other 32-bit operation will zero-extend +// up to 64 bits. +// FIXME: X86 also checks for CMOV here. Do we need something similar? 
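// A minimal C sketch of the implicit zero-extension property the comment above
// relies on; the AArch64 sequences noted in the comments are expectations about
// typical codegen, not output taken from this patch.

#include <stdint.h>

/* The 32-bit add writes w0 and implicitly clears the upper 32 bits of x0,
 * so widening the result to uint64_t should need no extra instruction
 * (the zext folds into a SUBREG_TO_REG).                                 */
uint64_t widen_after_add(uint32_t a, uint32_t b) {
  uint32_t sum = a + b;          /* add w0, w0, w1 */
  return sum;                    /* no explicit zero-extension expected */
}

/* Sign extension cannot be elided and should become a signed bitfield
 * move (SBFM, normally printed as sxtw).                                 */
int64_t widen_signed(int32_t a) {
  return a;                      /* sxtw x0, w0 */
}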
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
-//===----------------------------------------------------------------------===//
-// Load-store register (unscaled immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDURB, LDURH, LDRUSB, LDRUSH, LDRUSW, STUR, STURB, STURH and PRFUM
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREG into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use an unsigned bitfield move
+// instruction (UBFM) on the enclosing super-reg.
+def : Pat<(i64 (zext GPR32:$src)),
+          (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                   (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                   (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of the
+// original value which is to be sign extended. E.g. we support shifts up to
+// bitwidth-1 bits.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; + +def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; + +def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), + (i64 imm0_31:$imm), 31)>; +} // AddedComplexity = 20 + +// To truncate, we can simply extract from a subregister. +def : Pat<(i32 (trunc GPR64sp:$src)), + (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; + +// __builtin_trap() uses the BRK instruction on AArch64. +def : Pat<(trap), (BRK 1)>; + +// Conversions within AdvSIMD types in the same register size are free. +// But because we need a consistent lane ordering, in big endian many +// conversions require one or more REV instructions. // -// and +// Consider a simple memory load followed by a bitconvert then a store. +// v0 = load v2i32 +// v1 = BITCAST v2i32 v0 to v4i16 +// store v4i16 v2 // -//===----------------------------------------------------------------------===// -// Load-store register (register offset) instructions -//===----------------------------------------------------------------------===// -// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store register (unsigned immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM +// In big endian mode every memory access has an implicit byte swap. LDR and +// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that +// is, they treat the vector as a sequence of elements to be byte-swapped. +// The two pairs of instructions are fundamentally incompatible. We've decided +// to use LD1/ST1 only to simplify compiler implementation. // -// and -// -//===----------------------------------------------------------------------===// -// Load-store register (immediate post-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW +// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes +// the original code sequence: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = BITCAST v2i32 v1 to v4i16 +// v3 = REV v4i16 v2 (implicit) +// store v4i16 v3 // -// and +// But this is now broken - the value stored is different to the value loaded +// due to lane reordering. To fix this, on every BITCAST we must perform two +// other REVs: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = REV v2i32 +// v3 = BITCAST v2i32 v2 to v4i16 +// v4 = REV v4i16 +// v5 = REV v4i16 v4 (implicit) +// store v4i16 v5 // -//===----------------------------------------------------------------------===// -// Load-store register (immediate pre-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW - -// Note that patterns are much later on in a completely separate section (they -// need ADRPxi to be defined). 
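// A small C illustration of the sext + sra combining implemented by the
// patterns above; the "sbfx" spelling in the comment is an assumption about
// how the matched SBFM would normally be printed.

#include <stdint.h>

/* Sign-extending the low byte and then shifting right by 3 keeps bits
 * [7:3], so the whole expression can match a single signed bitfield
 * extract (roughly sbfx w0, w0, #3, #5) instead of sxtb followed by asr. */
int32_t sext_then_sra(int32_t x) {
  return (int32_t)(int8_t)x >> 3;
}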
- -//===------------------------------- -// 1. Various operands needed -//===------------------------------- - -//===------------------------------- -// 1.1 Unsigned 12-bit immediate operands -//===------------------------------- -// The addressing mode for these instructions consists of an unsigned 12-bit -// immediate which is scaled by the size of the memory access. +// This means an extra two instructions, but actually in most cases the two REV +// instructions can be combined into one. For example: +// (REV64_2s (REV64_4h X)) === (REV32_4h X) // -// We represent this in the MC layer by two operands: -// 1. A base register. -// 2. A 12-bit immediate: not multiplied by access size, so "LDR x0,[x0,#8]" -// would have '1' in this field. -// This means that separate functions are needed for converting representations -// which *are* aware of the intended access size. - -// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to -// know the access size via some means. An isolated operand does not have this -// information unless told from here, which means we need separate tablegen -// Operands for each access size. This multiclass takes care of instantiating -// the correct template functions in the rest of the backend. - -//===------------------------------- -// 1.1 Unsigned 12-bit immediate operands -//===------------------------------- - -multiclass offsets_uimm12 { - def uimm12_asmoperand : AsmOperandClass { - let Name = "OffsetUImm12_" # MemSize; - let PredicateMethod = "isOffsetUImm12<" # MemSize # ">"; - let RenderMethod = "addOffsetUImm12Operands<" # MemSize # ">"; - let DiagnosticType = "LoadStoreUImm12_" # MemSize; - } - - // Pattern is really no more than an ImmLeaf, but predicated on MemSize which - // complicates things beyond TableGen's ken. - def uimm12 : Operand, - ComplexPattern"> { - let ParserMatchClass - = !cast(prefix # uimm12_asmoperand); - - let PrintMethod = "printOffsetUImm12Operand<" # MemSize # ">"; - let EncoderMethod = "getOffsetUImm12OpValue<" # MemSize # ">"; - } -} - -defm byte_ : offsets_uimm12<1, "byte_">; -defm hword_ : offsets_uimm12<2, "hword_">; -defm word_ : offsets_uimm12<4, "word_">; -defm dword_ : offsets_uimm12<8, "dword_">; -defm qword_ : offsets_uimm12<16, "qword_">; - -//===------------------------------- -// 1.1 Signed 9-bit immediate operands -//===------------------------------- - -// The MCInst is expected to store the bit-wise encoding of the value, -// which amounts to lopping off the extended sign bits. -def SDXF_simm9 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0x1ff, MVT::i32); -}]>; - -def simm9_asmoperand : AsmOperandClass { - let Name = "SImm9"; - let PredicateMethod = "isSImm<9>"; - let RenderMethod = "addSImmOperands<9>"; - let DiagnosticType = "LoadStoreSImm9"; -} - -def simm9 : Operand, - ImmLeaf= -0x100 && Imm <= 0xff; }], - SDXF_simm9> { - let PrintMethod = "printOffsetSImm9Operand"; - let ParserMatchClass = simm9_asmoperand; -} - - -//===------------------------------- -// 1.3 Register offset extensions -//===------------------------------- - -// The assembly-syntax for these addressing-modes is: -// [, {, {}}] +// There is also no 128-bit REV instruction. This must be synthesized with an +// EXT instruction. // -// The essential semantics are: -// + is a shift: # or #0 -// + can be W or X. -// + If is W, can be UXTW or SXTW -// + If is X, can be LSL or SXTX +// Most bitconverts require some sort of conversion. 
The only exceptions are: +// a) Identity conversions - vNfX <-> vNiX +// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX // -// The trickiest of those constraints is that Rm can be either GPR32 or GPR64, -// which will need separate instructions for LLVM type-consistency. We'll also -// need separate operands, of course. -multiclass regexts { - def regext_asmoperand : AsmOperandClass { - let Name = "AddrRegExtend_" # MemSize # "_" # Rm; - let PredicateMethod = "isAddrRegExtend<" # MemSize # "," # RmSize # ">"; - let RenderMethod = "addAddrRegExtendOperands<" # MemSize # ">"; - let DiagnosticType = "LoadStoreExtend" # RmSize # "_" # MemSize; - } - - def regext : Operand { - let PrintMethod - = "printAddrRegExtendOperand<" # MemSize # ", " # RmSize # ">"; - - let DecoderMethod = "DecodeAddrRegExtendOperand"; - let ParserMatchClass - = !cast(prefix # regext_asmoperand); - } -} - -multiclass regexts_wx { - // Rm is an X-register if LSL or SXTX are specified as the shift. - defm Xm_ : regexts; - - // Rm is a W-register if UXTW or SXTW are specified as the shift. - defm Wm_ : regexts; -} - -defm byte_ : regexts_wx<1, "byte_">; -defm hword_ : regexts_wx<2, "hword_">; -defm word_ : regexts_wx<4, "word_">; -defm dword_ : regexts_wx<8, "dword_">; -defm qword_ : regexts_wx<16, "qword_">; - - -//===------------------------------ -// 2. The instructions themselves. -//===------------------------------ - -// We have the following instructions to implement: -// | | B | H | W | X | -// |-----------------+-------+-------+-------+--------| -// | unsigned str | STRB | STRH | STR | STR | -// | unsigned ldr | LDRB | LDRH | LDR | LDR | -// | signed ldr to W | LDRSB | LDRSH | - | - | -// | signed ldr to X | LDRSB | LDRSH | LDRSW | (PRFM) | - -// This will instantiate the LDR/STR instructions you'd expect to use for an -// unsigned datatype (first two rows above) or floating-point register, which is -// reasonably uniform across all access sizes. - - -//===------------------------------ -// 2.1 Regular instructions -//===------------------------------ - -// This class covers the basic unsigned or irrelevantly-signed loads and stores, -// to general-purpose and floating-point registers. - -class AddrParams { - Operand uimm12 = !cast(prefix # "_uimm12"); - - Operand regextWm = !cast(prefix # "_Wm_regext"); - Operand regextXm = !cast(prefix # "_Xm_regext"); -} - -def byte_addrparams : AddrParams<"byte">; -def hword_addrparams : AddrParams<"hword">; -def word_addrparams : AddrParams<"word">; -def dword_addrparams : AddrParams<"dword">; -def qword_addrparams : AddrParams<"qword">; - -multiclass A64I_LDRSTR_unsigned size, bit v, - bit high_opc, string asmsuffix, - RegisterClass GPR, AddrParams params> { - // Unsigned immediate - def _STR : A64I_LSunsigimm, - Sched<[WriteSt, ReadSt, ReadSt]> { - let mayStore = 1; - } - def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - def _LDR : A64I_LSunsigimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - // Register offset (four of these: load/store and Wm/Xm). 
- let mayLoad = 1 in { - def _Wm_RegOffset_LDR : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def _Xm_RegOffset_LDR : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - } - def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - let mayStore = 1 in { - def _Wm_RegOffset_STR : A64I_LSregoff, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; - - def _Xm_RegOffset_STR : A64I_LSregoff, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; - } - def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - // Unaligned immediate - def _STUR : A64I_LSunalimm, - Sched<[WriteSt, ReadSt, ReadSt]> { - let mayStore = 1; - } - def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_STUR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - def _LDUR : A64I_LSunalimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_LDUR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - // Post-indexed - def _PostInd_STR : A64I_LSpostind, - Sched<[WriteSt, ReadSt, ReadSt]> { - let Constraints = "$Rn = $Rn_wb"; - let mayStore = 1; - - // Decoder only needed for unpredictability checking (FIXME). - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def _PostInd_LDR : A64I_LSpostind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - // Pre-indexed - def _PreInd_STR : A64I_LSpreind, - Sched<[WriteSt, ReadSt, ReadSt]> { - let Constraints = "$Rn = $Rn_wb"; - let mayStore = 1; - - // Decoder only needed for unpredictability checking (FIXME). - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def _PreInd_LDR : A64I_LSpreind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - -} - -// STRB/LDRB: First define the instructions -defm LS8 - : A64I_LDRSTR_unsigned<"LS8", 0b00, 0b0, 0b0, "b", GPR32, byte_addrparams>; - -// STRH/LDRH -defm LS16 - : A64I_LDRSTR_unsigned<"LS16", 0b01, 0b0, 0b0, "h", GPR32, hword_addrparams>; - - -// STR/LDR to/from a W register -defm LS32 - : A64I_LDRSTR_unsigned<"LS32", 0b10, 0b0, 0b0, "", GPR32, word_addrparams>; - -// STR/LDR to/from an X register -defm LS64 - : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>; - -let Predicates = [HasFPARMv8] in { -// STR/LDR to/from a B register -defm LSFP8 - : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>; - -// STR/LDR to/from an H register -defm LSFP16 - : A64I_LDRSTR_unsigned<"LSFP16", 0b01, 0b1, 0b0, "", FPR16, hword_addrparams>; - -// STR/LDR to/from an S register -defm LSFP32 - : A64I_LDRSTR_unsigned<"LSFP32", 0b10, 0b1, 0b0, "", FPR32, word_addrparams>; -// STR/LDR to/from a D register -defm LSFP64 - : A64I_LDRSTR_unsigned<"LSFP64", 0b11, 0b1, 0b0, "", FPR64, dword_addrparams>; -// STR/LDR to/from a Q register -defm LSFP128 - : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128, - qword_addrparams>; -} - -//===------------------------------ -// 2.3 Signed loads -//===------------------------------ - -// Byte and half-word signed loads can both go into either an X or a W register, -// so it's worth factoring out. Signed word loads don't fit because there is no -// W version. 
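// A C sketch of the sign-extending loads discussed in the comment above; the
// commented instruction choices are the expected selections, not verified
// output of either backend.

#include <stdint.h>

int32_t load_s8_to_w(const int8_t *p)   { return *p; }  /* ldrsb w0, [x0] */
int64_t load_s8_to_x(const int8_t *p)   { return *p; }  /* ldrsb x0, [x0] */
int64_t load_s16_to_x(const int16_t *p) { return *p; }  /* ldrsh x0, [x0] */
int64_t load_s32_to_x(const int32_t *p) { return *p; }  /* ldrsw x0, [x0] */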
-multiclass A64I_LDR_signed size, string asmopcode, AddrParams params, - string prefix> { - // Unsigned offset - def w : A64I_LSunsigimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # w) GPR32:$Rt, GPR64xsp:$Rn, 0)>; - - def x : A64I_LSunsigimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # x) GPR64:$Rt, GPR64xsp:$Rn, 0)>; - - // Register offset - let mayLoad = 1 in { - def w_Wm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def w_Xm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def x_Wm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def x_Xm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - } - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "x_Xm_RegOffset") GPR64:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - - let mayLoad = 1 in { - // Unaligned offset - def w_U : A64I_LSunalimm, - Sched<[WriteLd, ReadLd]>; - - def x_U : A64I_LSunalimm, - Sched<[WriteLd, ReadLd]>; - - - // Post-indexed - def w_PostInd : A64I_LSpostind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def x_PostInd : A64I_LSpostind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - // Pre-indexed - def w_PreInd : A64I_LSpreind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def x_PreInd : A64I_LSpreind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - } // let mayLoad = 1 -} - -// LDRSB -defm LDRSB : A64I_LDR_signed<0b00, "b", byte_addrparams, "LDRSB">; -// LDRSH -defm LDRSH : A64I_LDR_signed<0b01, "h", hword_addrparams, "LDRSH">; - -// LDRSW: load a 32-bit register, sign-extending to 64-bits. 
-def LDRSWx - : A64I_LSunsigimm<0b10, 0b0, 0b10, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, word_uimm12:$UImm12), - "ldrsw\t$Rt, [$Rn, $UImm12]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; -} -def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -let mayLoad = 1 in { - def LDRSWx_Wm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b0, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext), - "ldrsw\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext), - "ldrsw\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd, ReadLd]>; -} -def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]", - (LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>; - - -def LDURSWx - : A64I_LSunalimm<0b10, 0b0, 0b10, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldursw\t$Rt, [$Rn, $SImm9]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; -} -def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -def LDRSWx_PostInd - : A64I_LSpostind<0b10, 0b0, 0b10, - (outs GPR64:$Rt, GPR64xsp:$Rn_wb), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldrsw\t$Rt, [$Rn], $SImm9", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; -} - -def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10, - (outs GPR64:$Rt, GPR64xsp:$Rn_wb), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldrsw\t$Rt, [$Rn, $SImm9]!", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; -} - -//===------------------------------ -// 2.4 Prefetch operations -//===------------------------------ - -def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12), - "prfm\t$Rt, [$Rn, $UImm12]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]> { - let mayLoad = 1; -} -def : InstAlias<"prfm $Rt, [$Rn]", - (PRFM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>; - -let mayLoad = 1 in { - def PRFM_Wm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b0, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, - GPR32:$Rm, dword_Wm_regext:$Ext), - "prfm\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]>; - def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, dword_Xm_regext:$Ext), - "prfm\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]>; -} - -def : InstAlias<"prfm $Rt, [$Rn, $Rm]", - (PRFM_Xm_RegOffset prefetch_op:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - -def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9), - "prfum\t$Rt, [$Rn, $SImm9]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]> { - let mayLoad = 1; -} -def : InstAlias<"prfum $Rt, [$Rn]", - (PRFUM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>; - -//===----------------------------------------------------------------------===// -// Load-store register (unprivileged) instructions -//===----------------------------------------------------------------------===// -// Contains: LDTRB, LDTRH, LDTRSB, LDTRSH, LDTRSW, STTR, STTRB and STTRH - -// These instructions very much mirror the "unscaled immediate" loads, but since -// there are no floating-point 
variants we need to split them out into their own -// section to avoid instantiation of "ldtr d0, [sp]" etc. - -multiclass A64I_LDTRSTTR size, string asmsuffix, RegisterClass GPR, - string prefix> { - def _UnPriv_STR : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]> { - let mayStore = 1; - } - - def : InstAlias<"sttr" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_UnPriv_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - def _UnPriv_LDR : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - - def : InstAlias<"ldtr" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_UnPriv_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - -} - -// STTRB/LDTRB: First define the instructions -defm LS8 : A64I_LDTRSTTR<0b00, "b", GPR32, "LS8">; - -// STTRH/LDTRH -defm LS16 : A64I_LDTRSTTR<0b01, "h", GPR32, "LS16">; - -// STTR/LDTR to/from a W register -defm LS32 : A64I_LDTRSTTR<0b10, "", GPR32, "LS32">; - -// STTR/LDTR to/from an X register -defm LS64 : A64I_LDTRSTTR<0b11, "", GPR64, "LS64">; - -// Now a class for the signed instructions that can go to either 32 or 64 -// bits... -multiclass A64I_LDTR_signed size, string asmopcode, string prefix> { - let mayLoad = 1 in { - def w : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]>; - - def x : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]>; - } - - def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # "w") GPR32:$Rt, GPR64xsp:$Rn, 0)>; - - def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # "x") GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -} - -// LDTRSB -defm LDTRSB : A64I_LDTR_signed<0b00, "b", "LDTRSB">; -// LDTRSH -defm LDTRSH : A64I_LDTR_signed<0b01, "h", "LDTRSH">; - -// And finally LDTRSW which only goes to 64 bits. -def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldtrsw\t$Rt, [$Rn, $SImm9]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; -} -def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -//===----------------------------------------------------------------------===// -// Load-store register pair (offset) instructions -//===----------------------------------------------------------------------===// -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store register pair (post-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STP, LDP, LDPSW -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store register pair (pre-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STP, LDP, LDPSW -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store non-temporal register pair (offset) instructions -//===----------------------------------------------------------------------===// -// Contains: STNP, LDNP - - -// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to -// know the access size via some means. An isolated operand does not have this -// information unless told from here, which means we need separate tablegen -// Operands for each access size. This multiclass takes care of instantiating -// the correct template functions in the rest of the backend. 
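// A C sketch of the offset scaling described above for the load/store pair
// instructions; whether the two loads are actually merged into an ldp is up
// to a later pass, so the sequence in the comment is an assumption.

#include <stdint.h>

/* If the two adjacent loads are combined, the #16 byte offset is encoded
 * as the 7-bit signed value 2, because pair offsets are scaled by the
 * access size (8 bytes here):  ldp x8, x9, [x0, #16]                     */
int64_t sum_adjacent(const int64_t *p) {
  return p[2] + p[3];
}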
- -multiclass offsets_simm7 { - // The bare signed 7-bit immediate is used in post-indexed instructions, but - // because of the scaling performed a generic "simm7" operand isn't - // appropriate here either. - def simm7_asmoperand : AsmOperandClass { - let Name = "SImm7_Scaled" # MemSize; - let PredicateMethod = "isSImm7Scaled<" # MemSize # ">"; - let RenderMethod = "addSImm7ScaledOperands<" # MemSize # ">"; - let DiagnosticType = "LoadStoreSImm7_" # MemSize; - } - - def simm7 : Operand { - let PrintMethod = "printSImm7ScaledOperand<" # MemSize # ">"; - let ParserMatchClass = !cast(prefix # "simm7_asmoperand"); - } -} - -defm word_ : offsets_simm7<"4", "word_">; -defm dword_ : offsets_simm7<"8", "dword_">; -defm qword_ : offsets_simm7<"16", "qword_">; - -multiclass A64I_LSPsimple opc, bit v, RegisterClass SomeReg, - Operand simm7, string prefix> { - def _STR : A64I_LSPoffset, - Sched<[WriteLd, ReadLd]> { - let mayStore = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"stp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_STR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - - def _LDR : A64I_LSPoffset, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"ldp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_LDR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - - def _PostInd_STR : A64I_LSPpostind, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> { - let mayStore = 1; - let Constraints = "$Rn = $Rn_wb"; - - // Decoder only needed for unpredictability checking (FIXME). - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _PostInd_LDR : A64I_LSPpostind, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _PreInd_STR : A64I_LSPpreind, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> { - let mayStore = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _PreInd_LDR : A64I_LSPpreind, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _NonTemp_STR : A64I_LSPnontemp, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> { - let mayStore = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"stnp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_NonTemp_STR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - - def _NonTemp_LDR : A64I_LSPnontemp, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"ldnp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_NonTemp_LDR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - -} - - -defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">; -defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">; - -let Predicates = [HasFPARMv8] in { -defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">; -defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">; -defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7, - "LSFPPair128">; -} - - -def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1, - (outs GPR64:$Rt, GPR64:$Rt2), - (ins GPR64xsp:$Rn, word_simm7:$SImm7), - "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; -} -def : InstAlias<"ldpsw $Rt, $Rt2, [$Rn]", 
- (LDPSWx GPR64:$Rt, GPR64:$Rt2, GPR64xsp:$Rn, 0)>; - -def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1, - (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb), - (ins GPR64xsp:$Rn, word_simm7:$SImm7), - "ldpsw\t$Rt, $Rt2, [$Rn], $SImm7", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; -} - -def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1, - (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb), - (ins GPR64xsp:$Rn, word_simm7:$SImm7), - "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; -} - -//===----------------------------------------------------------------------===// -// Logical (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: AND, ORR, EOR, ANDS, + aliases TST, MOV - -multiclass logical_imm_operands { - def _asmoperand : AsmOperandClass { - let Name = "LogicalImm" # note # size; - let PredicateMethod = "isLogicalImm" # note # "<" # size # ">"; - let RenderMethod = "addLogicalImmOperands<" # size # ">"; - let DiagnosticType = "LogicalSecondSource"; - } - - def _operand - : Operand, ComplexPattern { - let ParserMatchClass = !cast(prefix # "_asmoperand"); - let PrintMethod = "printLogicalImmOperand<" # size # ">"; - let DecoderMethod = "DecodeLogicalImmOperand<" # size # ">"; - } -} - -defm logical_imm32 : logical_imm_operands<"logical_imm32", "", 32, i32>; -defm logical_imm64 : logical_imm_operands<"logical_imm64", "", 64, i64>; - -// The mov versions only differ in assembly parsing, where they -// exclude values representable with either MOVZ or MOVN. -defm logical_imm32_mov - : logical_imm_operands<"logical_imm32_mov", "MOV", 32, i32>; -defm logical_imm64_mov - : logical_imm_operands<"logical_imm64_mov", "MOV", 64, i64>; - - -multiclass A64I_logimmSizes opc, string asmop, SDNode opnode> { - def wwi : A64I_logicalimm<0b0, opc, (outs GPR32wsp:$Rd), - (ins GPR32:$Rn, logical_imm32_operand:$Imm), - !strconcat(asmop, "\t$Rd, $Rn, $Imm"), - [(set i32:$Rd, - (opnode i32:$Rn, logical_imm32_operand:$Imm))], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; - - def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd), - (ins GPR64:$Rn, logical_imm64_operand:$Imm), - !strconcat(asmop, "\t$Rd, $Rn, $Imm"), - [(set i64:$Rd, - (opnode i64:$Rn, logical_imm64_operand:$Imm))], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; -} - -defm AND : A64I_logimmSizes<0b00, "and", and>; -defm ORR : A64I_logimmSizes<0b01, "orr", or>; -defm EOR : A64I_logimmSizes<0b10, "eor", xor>; - -let Defs = [NZCV] in { - def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd), - (ins GPR32:$Rn, logical_imm32_operand:$Imm), - "ands\t$Rd, $Rn, $Imm", - [], NoItinerary>, - Sched<[WriteALU, ReadALU]>; - - def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd), - (ins GPR64:$Rn, logical_imm64_operand:$Imm), - "ands\t$Rd, $Rn, $Imm", - [], NoItinerary>, - Sched<[WriteALU, ReadALU]>; -} - -def : InstAlias<"tst $Rn, $Imm", - (ANDSwwi WZR, GPR32:$Rn, logical_imm32_operand:$Imm)>; -def : InstAlias<"tst $Rn, $Imm", - (ANDSxxi XZR, GPR64:$Rn, logical_imm64_operand:$Imm)>; -// FIXME: these sometimes are canonical. 
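// A C sketch of the "logical immediate" encoding that the operands defined
// above accept; the instruction comments are expected selections, not
// verified output.

#include <stdint.h>

/* 0xff00ff00ff00ff00 is a repeating bit pattern, so it is a valid logical
 * immediate and the AND needs no separate constant materialization:
 *   and x0, x0, #0xff00ff00ff00ff00
 * A non-repeating constant would instead be built with movz/movk first.  */
uint64_t mask_high_bytes(uint64_t x) {
  return x & 0xff00ff00ff00ff00ULL;
}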
-def : InstAlias<"mov $Rd, $Imm", - (ORRwwi GPR32wsp:$Rd, WZR, logical_imm32_mov_operand:$Imm), 0>; -def : InstAlias<"mov $Rd, $Imm", - (ORRxxi GPR64xsp:$Rd, XZR, logical_imm64_mov_operand:$Imm), 0>; - -//===----------------------------------------------------------------------===// -// Logical (shifted register) instructions -//===----------------------------------------------------------------------===// -// Contains: AND, BIC, ORR, ORN, EOR, EON, ANDS, BICS + aliases TST, MVN, MOV - -// Operand for optimizing (icmp (and LHS, RHS), 0, SomeCode). In theory "ANDS" -// behaves differently for unsigned comparisons, so we defensively only allow -// signed or n/a as the operand. In practice "unsigned greater than 0" is "not -// equal to 0" and LLVM gives us this. -def signed_cond : PatLeaf<(cond), [{ - return !isUnsignedIntSetCC(N->get()); -}]>; - - -// These instructions share their "shift" operands with add/sub (shifted -// register instructions). They are defined there. - -// N.b. the commutable parameter is just !N. It will be first against the wall -// when the revolution comes. -multiclass logical_shifts opc, - bit N, bit commutable, - string asmop, SDPatternOperator opfrag, ValueType ty, - RegisterClass GPR, list defs> { - let isCommutable = commutable, Defs = defs in { - def _lsl : A64I_logicalshift("lsl_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _lsr : A64I_logicalshift("lsr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _asr : A64I_logicalshift("asr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _ror : A64I_logicalshift("ror_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (rotr ty:$Rm, - !cast("ror_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } - - def _noshift - : InstAlias(prefix # "_lsl") GPR:$Rd, GPR:$Rn, - GPR:$Rm, 0)>; - - def : Pat<(opfrag ty:$Rn, ty:$Rm), - (!cast(prefix # "_lsl") $Rn, $Rm, 0)>; -} - -multiclass logical_sizes opc, bit N, bit commutable, - string asmop, SDPatternOperator opfrag, - list defs> { - defm xxx : logical_shifts; - defm www : logical_shifts; -} - - -defm AND : logical_sizes<"AND", 0b00, 0b0, 0b1, "and", and, []>; -defm ORR : logical_sizes<"ORR", 0b01, 0b0, 0b1, "orr", or, []>; -defm EOR : logical_sizes<"EOR", 0b10, 0b0, 0b1, "eor", xor, []>; -defm ANDS : logical_sizes<"ANDS", 0b11, 0b0, 0b1, "ands", - PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), - [{ (void)N; return false; }]>, - [NZCV]>; - -defm BIC : logical_sizes<"BIC", 0b00, 0b1, 0b0, "bic", - PatFrag<(ops node:$lhs, node:$rhs), - (and node:$lhs, (not node:$rhs))>, []>; -defm ORN : logical_sizes<"ORN", 0b01, 0b1, 0b0, "orn", - PatFrag<(ops node:$lhs, node:$rhs), - (or node:$lhs, (not node:$rhs))>, []>; -defm EON : logical_sizes<"EON", 0b10, 0b1, 0b0, "eon", - PatFrag<(ops node:$lhs, node:$rhs), - (xor node:$lhs, (not node:$rhs))>, []>; -defm BICS : logical_sizes<"BICS", 0b11, 0b1, 0b0, "bics", - PatFrag<(ops node:$lhs, node:$rhs), - (and node:$lhs, (not 
node:$rhs)), - [{ (void)N; return false; }]>, - [NZCV]>; - -multiclass tst_shifts { - let isCommutable = 1, Rd = 0b11111, Defs = [NZCV] in { - def _lsl : A64I_logicalshift("lsl_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - - def _lsr : A64I_logicalshift("lsr_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _asr : A64I_logicalshift("asr_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _ror : A64I_logicalshift("ror_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (rotr ty:$Rm, - !cast("ror_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } - - def _noshift : InstAlias<"tst $Rn, $Rm", - (!cast(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; - - def : Pat<(A64setcc (and ty:$Rn, ty:$Rm), 0, signed_cond), - (!cast(prefix # "_lsl") $Rn, $Rm, 0)>; -} - -defm TSTxx : tst_shifts<"TSTxx", 0b1, i64, GPR64>; -defm TSTww : tst_shifts<"TSTww", 0b0, i32, GPR32>; - - -multiclass mvn_shifts { - let isCommutable = 0, Rn = 0b11111 in { - def _lsl : A64I_logicalshift("lsl_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - - def _lsr : A64I_logicalshift("lsr_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _asr : A64I_logicalshift("asr_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _ror : A64I_logicalshift("ror_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (rotr ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } - - def _noshift : InstAlias<"mvn $Rn, $Rm", - (!cast(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; - - def : Pat<(not ty:$Rm), - (!cast(prefix # "_lsl") $Rm, 0)>; -} - -defm MVNxx : mvn_shifts<"MVNxx", 0b1, i64, GPR64>; -defm MVNww : mvn_shifts<"MVNww", 0b0, i32, GPR32>; - -def MOVxx :InstAlias<"mov $Rd, $Rm", (ORRxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>; -def MOVww :InstAlias<"mov $Rd, $Rm", (ORRwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>; - -//===----------------------------------------------------------------------===// -// Move wide (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: MOVN, MOVZ, MOVK + MOV aliases - -// A wide variety of different relocations are needed for variants of these -// instructions, so it turns out that we need a different operand for all of -// them. 
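// A C sketch of how the move-wide instructions described above are used to
// materialize a constant; the exact movz/movk ordering in the comment is an
// assumption about typical codegen.

#include <stdint.h>

/* A full 64-bit constant is built 16 bits at a time, roughly:
 *   movz x0, #0xcdef
 *   movk x0, #0x90ab, lsl #16
 *   movk x0, #0x5678, lsl #32
 *   movk x0, #0x1234, lsl #48                                            */
uint64_t big_constant(void) {
  return 0x1234567890abcdefULL;
}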
-multiclass movw_operands { - def _imm_asmoperand : AsmOperandClass { - let Name = instname # width # "Shifted" # shift; - let PredicateMethod = "is" # instname # width # "Imm"; - let RenderMethod = "addMoveWideImmOperands"; - let ParserMethod = "ParseImmWithLSLOperand"; - let DiagnosticType = "MOVWUImm16"; - } - - def _imm : Operand { - let ParserMatchClass = !cast(prefix # "_imm_asmoperand"); - let PrintMethod = "printMoveWideImmOperand"; - let EncoderMethod = "getMoveWideImmOpValue"; - let DecoderMethod = "DecodeMoveWideImmOperand<" # width # ">"; - - let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift); - } -} - -defm movn32 : movw_operands<"movn32", "MOVN", 32>; -defm movn64 : movw_operands<"movn64", "MOVN", 64>; -defm movz32 : movw_operands<"movz32", "MOVZ", 32>; -defm movz64 : movw_operands<"movz64", "MOVZ", 64>; -defm movk32 : movw_operands<"movk32", "MOVK", 32>; -defm movk64 : movw_operands<"movk64", "MOVK", 64>; - -multiclass A64I_movwSizes opc, string asmop, dag ins32bit, - dag ins64bit> { - - def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit, - !strconcat(asmop, "\t$Rd, $FullImm"), - [], NoItinerary>, - Sched<[WriteALU]> { - bits<18> FullImm; - let UImm16 = FullImm{15-0}; - let Shift = FullImm{17-16}; - } - - def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit, - !strconcat(asmop, "\t$Rd, $FullImm"), - [], NoItinerary>, - Sched<[WriteALU]> { - bits<18> FullImm; - let UImm16 = FullImm{15-0}; - let Shift = FullImm{17-16}; - } -} - -let isMoveImm = 1, isReMaterializable = 1, - isAsCheapAsAMove = 1, hasSideEffects = 0 in { - defm MOVN : A64I_movwSizes<0b00, "movn", - (ins movn32_imm:$FullImm), - (ins movn64_imm:$FullImm)>; - - // Some relocations are able to convert between a MOVZ and a MOVN. If these - // are applied the instruction must be emitted with the corresponding bits as - // 0, which means a MOVZ needs to override that bit from the default. - let PostEncoderMethod = "fixMOVZ" in - defm MOVZ : A64I_movwSizes<0b10, "movz", - (ins movz32_imm:$FullImm), - (ins movz64_imm:$FullImm)>; -} - -let Constraints = "$src = $Rd", - SchedRW = [WriteALU, ReadALU] in -defm MOVK : A64I_movwSizes<0b11, "movk", - (ins GPR32:$src, movk32_imm:$FullImm), - (ins GPR64:$src, movk64_imm:$FullImm)>; - - -// And now the "MOV" aliases. These also need their own operands because what -// they accept is completely different to what the base instructions accept. -multiclass movalias_operand { - def _asmoperand : AsmOperandClass { - let Name = basename # width # "MovAlias"; - let PredicateMethod - = "isMoveWideMovAlias<" # width # ", A64Imms::" # immpredicate # ">"; - let RenderMethod - = "addMoveWideMovAliasOperands<" # width # ", " - # "A64Imms::" # immpredicate # ">"; - } - - def _movimm : Operand { - let ParserMatchClass = !cast(prefix # "_asmoperand"); - - let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift); - } -} - -defm movz32 : movalias_operand<"movz32", "MOVZ", "isMOVZImm", 32>; -defm movz64 : movalias_operand<"movz64", "MOVZ", "isMOVZImm", 64>; -defm movn32 : movalias_operand<"movn32", "MOVN", "isOnlyMOVNImm", 32>; -defm movn64 : movalias_operand<"movn64", "MOVN", "isOnlyMOVNImm", 64>; - -// FIXME: these are officially canonical aliases, but TableGen is too limited to -// print them at the moment. I believe in this case an "AliasPredicate" method -// will need to be implemented. to allow it, as well as the more generally -// useful handling of non-register, non-constant operands. 
-class movalias - : InstAlias<"mov $Rd, $FullImm", (INST GPR:$Rd, operand:$FullImm), 0>; - -def : movalias; -def : movalias; -def : movalias; -def : movalias; - -def movw_addressref_g0 : ComplexPattern">; -def movw_addressref_g1 : ComplexPattern">; -def movw_addressref_g2 : ComplexPattern">; -def movw_addressref_g3 : ComplexPattern">; - -def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2, - movw_addressref_g1:$G1, movw_addressref_g0:$G0), - (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref_g3:$G3), - movw_addressref_g2:$G2), - movw_addressref_g1:$G1), - movw_addressref_g0:$G0)>; - -//===----------------------------------------------------------------------===// -// PC-relative addressing instructions -//===----------------------------------------------------------------------===// -// Contains: ADR, ADRP - -def adr_label : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 21-bit offset from PC, unscaled - let PrintMethod = "printLabelOperand<21, 1>"; - let ParserMatchClass = label_asmoperand<21, 1>; - let OperandType = "OPERAND_PCREL"; -} - -def adrp_label_asmoperand : AsmOperandClass { - let Name = "AdrpLabel"; - let RenderMethod = "addLabelOperands<21, 4096>"; - let DiagnosticType = "Label"; -} - -def adrp_label : Operand { - let EncoderMethod = "getAdrpLabelOpValue"; - - // This label is a 21-bit offset from PC, scaled by the page-size: 4096. - let PrintMethod = "printLabelOperand<21, 4096>"; - let ParserMatchClass = adrp_label_asmoperand; - let OperandType = "OPERAND_PCREL"; -} - -let hasSideEffects = 0 in { - def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label), - "adr\t$Rd, $Label", [], NoItinerary>, - Sched<[WriteALUs]>; - - def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label), - "adrp\t$Rd, $Label", [], NoItinerary>, - Sched<[WriteALUs]>; -} - -//===----------------------------------------------------------------------===// -// System instructions -//===----------------------------------------------------------------------===// -// Contains: HINT, CLREX, DSB, DMB, ISB, MSR, SYS, SYSL, MRS -// + aliases IC, DC, AT, TLBI, NOP, YIELD, WFE, WFI, SEV, SEVL - -// Op1 and Op2 fields are sometimes simple 3-bit unsigned immediate values. -def uimm3_asmoperand : AsmOperandClass { - let Name = "UImm3"; - let PredicateMethod = "isUImm<3>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm3"; -} - -def uimm3 : Operand { - let ParserMatchClass = uimm3_asmoperand; -} - -// The HINT alias can accept a simple unsigned 7-bit immediate. -def uimm7_asmoperand : AsmOperandClass { - let Name = "UImm7"; - let PredicateMethod = "isUImm<7>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm7"; -} - -def uimm7 : Operand { - let ParserMatchClass = uimm7_asmoperand; -} - -// Multiclass namedimm is defined with the prefetch operands. Most of these fit -// into the NamedImmMapper scheme well: they either accept a named operand or -// any immediate under a particular value (which may be 0, implying no immediate -// is allowed). 
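// A C11 sketch for the barrier operands handled here; the selection of a full
// fence to "dmb ish" matches the atomic_fence pattern that appears a little
// further below, and the exact barrier flavour can depend on the target
// configuration.

#include <stdatomic.h>

/* A sequentially consistent fence is expected to become "dmb ish",
 * ordering memory accesses across the inner shareable domain.            */
void full_fence(void) {
  atomic_thread_fence(memory_order_seq_cst);
}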
-defm dbarrier : namedimm<"dbarrier", "A64DB::DBarrierMapper">; -defm isb : namedimm<"isb", "A64ISB::ISBMapper">; -defm ic : namedimm<"ic", "A64IC::ICMapper">; -defm dc : namedimm<"dc", "A64DC::DCMapper">; -defm at : namedimm<"at", "A64AT::ATMapper">; -defm tlbi : namedimm<"tlbi", "A64TLBI::TLBIMapper">; - -// However, MRS and MSR are more complicated for a few reasons: -// * There are ~1000 generic names S3____ which have an -// implementation-defined effect -// * Most registers are shared, but some are read-only or write-only. -// * There is a variant of MSR which accepts the same register name (SPSel), -// but which would have a different encoding. - -// In principle these could be resolved in with more complicated subclasses of -// NamedImmMapper, however that imposes an overhead on other "named -// immediates". Both in concrete terms with virtual tables and in unnecessary -// abstraction. - -// The solution adopted here is to take the MRS/MSR Mappers out of the usual -// hierarchy (they're not derived from NamedImmMapper) and to add logic for -// their special situation. -def mrs_asmoperand : AsmOperandClass { - let Name = "MRS"; - let ParserMethod = "ParseSysRegOperand"; - let DiagnosticType = "MRS"; -} - -def mrs_op : Operand { - let ParserMatchClass = mrs_asmoperand; - let PrintMethod = "printMRSOperand"; - let DecoderMethod = "DecodeMRSOperand"; -} - -def msr_asmoperand : AsmOperandClass { - let Name = "MSRWithReg"; - - // Note that SPSel is valid for both this and the pstate operands, but with - // different immediate encodings. This is why these operands provide a string - // AArch64Operand rather than an immediate. The overlap is small enough that - // it could be resolved with hackery now, but who can say in future? - let ParserMethod = "ParseSysRegOperand"; - let DiagnosticType = "MSR"; -} - -def msr_op : Operand { - let ParserMatchClass = msr_asmoperand; - let PrintMethod = "printMSROperand"; - let DecoderMethod = "DecodeMSROperand"; -} - -def pstate_asmoperand : AsmOperandClass { - let Name = "MSRPState"; - // See comment above about parser. - let ParserMethod = "ParseSysRegOperand"; - let DiagnosticType = "MSR"; -} - -def pstate_op : Operand { - let ParserMatchClass = pstate_asmoperand; - let PrintMethod = "printNamedImmOperand"; - let DecoderMethod = "DecodeNamedImmOperand"; -} - -// When is specified, an assembler should accept something like "C4", not -// the usual "#4" immediate. -def CRx_asmoperand : AsmOperandClass { - let Name = "CRx"; - let PredicateMethod = "isUImm<4>"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "ParseCRxOperand"; - // Diagnostics are handled in all cases by ParseCRxOperand. -} - -def CRx : Operand { - let ParserMatchClass = CRx_asmoperand; - let PrintMethod = "printCRxOperand"; -} - - -// Finally, we can start defining the instructions. - -// HINT is straightforward, with a few aliases. -def HINTi : A64I_system<0b0, (outs), (ins uimm7:$UImm7), "hint\t$UImm7", - [], NoItinerary> { - bits<7> UImm7; - let CRm = UImm7{6-3}; - let Op2 = UImm7{2-0}; - - let Op0 = 0b00; - let Op1 = 0b011; - let CRn = 0b0010; - let Rt = 0b11111; -} - -def : InstAlias<"nop", (HINTi 0)>; -def : InstAlias<"yield", (HINTi 1)>; -def : InstAlias<"wfe", (HINTi 2)>; -def : InstAlias<"wfi", (HINTi 3)>; -def : InstAlias<"sev", (HINTi 4)>; -def : InstAlias<"sevl", (HINTi 5)>; - -// Quite a few instructions then follow a similar pattern of fixing common -// fields in the bitpattern, we'll define a helper-class for them. 
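// A small sketch of the HINT aliases defined above; it assumes GNU-style
// inline assembly and only shows that the alias and the raw hint encoding are
// interchangeable spellings.

/* "yield" is the alias for "hint #1" defined above, commonly used in
 * spin-wait loops; an assembler should accept either spelling.           */
void cpu_relax(void) {
  __asm__ volatile("yield" ::: "memory");
}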
-class simple_sys op0, bits<3> op1, bits<4> crn, bits<3> op2, - Operand operand, string asmop> - : A64I_system<0b0, (outs), (ins operand:$CRm), !strconcat(asmop, "\t$CRm"), - [], NoItinerary> { - let Op0 = op0; - let Op1 = op1; - let CRn = crn; - let Op2 = op2; - let Rt = 0b11111; -} - - -def CLREXi : simple_sys<0b00, 0b011, 0b0011, 0b010, uimm4, "clrex">; -def DSBi : simple_sys<0b00, 0b011, 0b0011, 0b100, dbarrier_op, "dsb">; -def DMBi : simple_sys<0b00, 0b011, 0b0011, 0b101, dbarrier_op, "dmb">; -def ISBi : simple_sys<0b00, 0b011, 0b0011, 0b110, isb_op, "isb">; - -def : InstAlias<"clrex", (CLREXi 0b1111)>; -def : InstAlias<"isb", (ISBi 0b1111)>; - -// (DMBi 0xb) is a "DMB ISH" instruciton, appropriate for Linux SMP -// configurations at least. -def : Pat<(atomic_fence imm, imm), (DMBi 0xb)>; - -// Any SYS bitpattern can be represented with a complex and opaque "SYS" -// instruction. -def SYSiccix : A64I_system<0b0, (outs), - (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, - uimm3:$Op2, GPR64:$Rt), - "sys\t$Op1, $CRn, $CRm, $Op2, $Rt", - [], NoItinerary> { - let Op0 = 0b01; -} - -// You can skip the Xt argument whether it makes sense or not for the generic -// SYS instruction. -def : InstAlias<"sys $Op1, $CRn, $CRm, $Op2", - (SYSiccix uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2, XZR)>; - - -// But many have aliases, which obviously don't fit into -class SYSalias - : A64I_system<0b0, (outs), ins, asmstring, [], NoItinerary> { - let isAsmParserOnly = 1; - - bits<14> SysOp; - let Op0 = 0b01; - let Op1 = SysOp{13-11}; - let CRn = SysOp{10-7}; - let CRm = SysOp{6-3}; - let Op2 = SysOp{2-0}; -} - -def ICix : SYSalias<(ins ic_op:$SysOp, GPR64:$Rt), "ic\t$SysOp, $Rt">; - -def ICi : SYSalias<(ins ic_op:$SysOp), "ic\t$SysOp"> { - let Rt = 0b11111; -} - -def DCix : SYSalias<(ins dc_op:$SysOp, GPR64:$Rt), "dc\t$SysOp, $Rt">; -def ATix : SYSalias<(ins at_op:$SysOp, GPR64:$Rt), "at\t$SysOp, $Rt">; - -def TLBIix : SYSalias<(ins tlbi_op:$SysOp, GPR64:$Rt), "tlbi\t$SysOp, $Rt">; - -def TLBIi : SYSalias<(ins tlbi_op:$SysOp), "tlbi\t$SysOp"> { - let Rt = 0b11111; -} - - -def SYSLxicci : A64I_system<0b1, (outs GPR64:$Rt), - (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2), - "sysl\t$Rt, $Op1, $CRn, $CRm, $Op2", - [], NoItinerary> { - let Op0 = 0b01; -} - -// The instructions themselves are rather simple for MSR and MRS. -def MSRix : A64I_system<0b0, (outs), (ins msr_op:$SysReg, GPR64:$Rt), - "msr\t$SysReg, $Rt", [], NoItinerary> { - bits<16> SysReg; - let Op0 = SysReg{15-14}; - let Op1 = SysReg{13-11}; - let CRn = SysReg{10-7}; - let CRm = SysReg{6-3}; - let Op2 = SysReg{2-0}; -} - -def MRSxi : A64I_system<0b1, (outs GPR64:$Rt), (ins mrs_op:$SysReg), - "mrs\t$Rt, $SysReg", [], NoItinerary> { - bits<16> SysReg; - let Op0 = SysReg{15-14}; - let Op1 = SysReg{13-11}; - let CRn = SysReg{10-7}; - let CRm = SysReg{6-3}; - let Op2 = SysReg{2-0}; -} - -def MSRii : A64I_system<0b0, (outs), (ins pstate_op:$PState, uimm4:$CRm), - "msr\t$PState, $CRm", [], NoItinerary> { - bits<6> PState; - - let Op0 = 0b00; - let Op1 = PState{5-3}; - let CRn = 0b0100; - let Op2 = PState{2-0}; - let Rt = 0b11111; -} - -//===----------------------------------------------------------------------===// -// Test & branch (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: TBZ, TBNZ - -// The bit to test is a simple unsigned 6-bit immediate in the X-register -// versions. 
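// A C sketch for the test-and-branch instructions this section defines; the
// commented tbz form is the expected selection, not verified output.

#include <stdint.h>

extern void handle_flag(void);

/* Testing a single bit and branching on it can be selected as a single
 * test-bit-and-branch, roughly "tbz w0, #4, <past the call>", instead of
 * an and/tst followed by a conditional branch.                           */
void check_bit4(uint32_t flags) {
  if (flags & (1u << 4))
    handle_flag();
}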
-def uimm6 : Operand { - let ParserMatchClass = uimm6_asmoperand; -} - -def label_wid14_scal4_asmoperand : label_asmoperand<14, 4>; - -def tbimm_target : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 14-bit offset from PC, scaled by the instruction-width: 4. - let PrintMethod = "printLabelOperand<14, 4>"; - let ParserMatchClass = label_wid14_scal4_asmoperand; - - let OperandType = "OPERAND_PCREL"; -} - -def A64eq : ImmLeaf; -def A64ne : ImmLeaf; - -// These instructions correspond to patterns involving "and" with a power of -// two, which we need to be able to select. -def tstb64_pat : ComplexPattern">; -def tstb32_pat : ComplexPattern">; - -let isBranch = 1, isTerminator = 1 in { - def TBZxii : A64I_TBimm<0b0, (outs), - (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label), - "tbz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0), - A64eq, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]>; - - def TBNZxii : A64I_TBimm<0b1, (outs), - (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label), - "tbnz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0), - A64ne, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]>; - - - // Note, these instructions overlap with the above 64-bit patterns. This is - // intentional, "tbz x3, #1, somewhere" and "tbz w3, #1, somewhere" would both - // do the same thing and are both permitted assembly. They also both have - // sensible DAG patterns. - def TBZwii : A64I_TBimm<0b0, (outs), - (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label), - "tbz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0), - A64eq, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]> { - let Imm{5} = 0b0; - } - - def TBNZwii : A64I_TBimm<0b1, (outs), - (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label), - "tbnz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0), - A64ne, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]> { - let Imm{5} = 0b0; - } -} - -//===----------------------------------------------------------------------===// -// Unconditional branch (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: B, BL - -def label_wid26_scal4_asmoperand : label_asmoperand<26, 4>; - -def bimm_target : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 26-bit offset from PC, scaled by the instruction-width: 4. - let PrintMethod = "printLabelOperand<26, 4>"; - let ParserMatchClass = label_wid26_scal4_asmoperand; - - let OperandType = "OPERAND_PCREL"; -} - -def blimm_target : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 26-bit offset from PC, scaled by the instruction-width: 4. 
- let PrintMethod = "printLabelOperand<26, 4>"; - let ParserMatchClass = label_wid26_scal4_asmoperand; - - let OperandType = "OPERAND_PCREL"; -} - -class A64I_BimmImpl patterns, Operand lbl_type> - : A64I_Bimm, - Sched<[WriteBr]>; - -let isBranch = 1 in { - def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> { - let isTerminator = 1; - let isBarrier = 1; - } - - let SchedRW = [WriteBrL] in { - def BLimm : A64I_BimmImpl<0b1, "bl", - [(AArch64Call tglobaladdr:$Label)], blimm_target> { - let isCall = 1; - let Defs = [X30]; - } - } -} - -def : Pat<(AArch64Call texternalsym:$Label), (BLimm texternalsym:$Label)>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (register) instructions -//===----------------------------------------------------------------------===// -// Contains: BR, BLR, RET, ERET, DRP. - -// Most of the notional opcode fields in the A64I_Breg format are fixed in A64 -// at the moment. -class A64I_BregImpl opc, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin = NoItinerary> - : A64I_Breg, - Sched<[WriteBr]> { - let isBranch = 1; - let isIndirectBranch = 1; -} - -// Note that these are not marked isCall or isReturn because as far as LLVM is -// concerned they're not. "ret" is just another jump unless it has been selected -// by LLVM as the function's return. - -let isBranch = 1 in { - def BRx : A64I_BregImpl<0b0000,(outs), (ins GPR64:$Rn), - "br\t$Rn", [(brind i64:$Rn)]> { - let isBarrier = 1; - let isTerminator = 1; - } - - let SchedRW = [WriteBrL] in { - def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn), - "blr\t$Rn", [(AArch64Call i64:$Rn)]> { - let isBarrier = 0; - let isCall = 1; - let Defs = [X30]; - } - } - - def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn), - "ret\t$Rn", []> { - let isBarrier = 1; - let isTerminator = 1; - let isReturn = 1; - } - - // Create a separate pseudo-instruction for codegen to use so that we don't - // flag x30 as used in every function. It'll be restored before the RET by the - // epilogue if it's legitimately used. - def RET : A64PseudoExpand<(outs), (ins), [(A64ret)], (RETx (ops X30))> { - let isTerminator = 1; - let isBarrier = 1; - let isReturn = 1; - } - - def ERET : A64I_BregImpl<0b0100, (outs), (ins), "eret", []> { - let Rn = 0b11111; - let isBarrier = 1; - let isTerminator = 1; - let isReturn = 1; - } - - def DRPS : A64I_BregImpl<0b0101, (outs), (ins), "drps", []> { - let Rn = 0b11111; - let isBarrier = 1; - } -} - -def RETAlias : InstAlias<"ret", (RETx X30)>; - - -//===----------------------------------------------------------------------===// -// Address generation patterns -//===----------------------------------------------------------------------===// - -// Primary method of address generation for the small/absolute memory model is -// an ADRP/ADR pair: -// ADRP x0, some_variable -// ADD x0, x0, #:lo12:some_variable -// -// The load/store elision of the ADD is accomplished when selecting -// addressing-modes. This just mops up the cases where that doesn't work and we -// really need an address in some register. - -// This wrapper applies a LO12 modifier to the address. Otherwise we could just -// use the same address. 
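The ADRP/ADD sequence described above splits an address into its 4 KiB page and a low 12-bit remainder: ADRP materializes the page (PC-relatively) and the #:lo12: ADD supplies the rest. A rough C++ sketch of that split, with made-up helper names, purely to illustrate the arithmetic:

#include <cstdint>

struct AdrpAddParts {
  uint64_t page;  // what ADRP yields: the address with bits [11:0] cleared
  uint64_t lo12;  // what the #:lo12: ADD immediate contributes
};

// Splitting and recombining: page + lo12 == symbolAddress.
AdrpAddParts splitForAdrpAdd(uint64_t symbolAddress) {
  return { symbolAddress & ~uint64_t(0xFFF), symbolAddress & uint64_t(0xFFF) };
}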
- -class ADRP_ADD - : Pat<(Wrapper addrop:$Hi, addrop:$Lo12, (i32 imm)), - (ADDxxi_lsl0_s (ADRPxi addrop:$Hi), addrop:$Lo12)>; - -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; - -//===----------------------------------------------------------------------===// -// GOT access patterns -//===----------------------------------------------------------------------===// - -class GOTLoadSmall - : Pat<(A64GOTLoad (A64WrapperSmall addrfrag:$Hi, addrfrag:$Lo12, 8)), - (LS64_LDR (ADRPxi addrfrag:$Hi), addrfrag:$Lo12)>; - -def : GOTLoadSmall; -def : GOTLoadSmall; -def : GOTLoadSmall; - -//===----------------------------------------------------------------------===// -// Tail call handling -//===----------------------------------------------------------------------===// - -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [XSP] in { - def TC_RETURNdi - : PseudoInst<(outs), (ins i64imm:$dst, i32imm:$FPDiff), - [(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff))]>; - - def TC_RETURNxi - : PseudoInst<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), - [(AArch64tcret i64:$dst, (i32 timm:$FPDiff))]>; -} - -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - Uses = [XSP] in { - def TAIL_Bimm : A64PseudoExpand<(outs), (ins bimm_target:$Label), [], - (Bimm bimm_target:$Label)>; - - def TAIL_BRx : A64PseudoExpand<(outs), (ins tcGPR64:$Rd), [], - (BRx GPR64:$Rd)>; -} - +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), + (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +} +def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; + +def : 
Pat<(f32 (bitconvert (i32 GPR32:$Xn))), + (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; +def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), + (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; +def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), + (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; + +let Predicates = [IsLE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), + (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), + (v2i32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), + (v4i16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 
FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), + (f64 (REV64v8i8 FPR64:$src))>; +} +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), + (v1f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), + (v2f32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 
FPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), + (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), (i32 8)))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (v2f64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), + (v2f64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), + (v4f32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (v2i64 (EXTv16i8 FPR128:$src, 
+ FPR128:$src, (i32 8)))>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), + (v2i64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), + (i32 8)))>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), + (v4i32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), + (v8i16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), + (i32 8)))>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v2f64 
FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +} + +def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; + +// A 64-bit subvector insert to the first 128-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + +// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 +// or v2f32. +def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), + (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), + (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; +def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), + (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; + // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, + // so we match on v4f32 here, not v2f32. This will also catch adding + // the low two lanes of a true v4f32 vector. +def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), + (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; + +// Scalar 64-bit shifts in FPR64 registers. +def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; + +// Tail call return handling. These are all compiler pseudo-instructions, +// so no encoding information or anything like that. 
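Before the tail-call pseudo-instructions introduced below, a note on the big-endian bitconvert patterns earlier in this hunk: with big-endian lane numbering, reinterpreting a register at a different element size requires reversing the elements inside each 64-bit doubleword, which is what the REV64 forms used above provide (EXT then handles the two halves of a 128-bit register). A rough C++ sketch of those doubleword reversals, for illustration only:

#include <cstdint>

// REV64 with 32-bit elements: swap the two words inside the doubleword.
uint64_t rev64_2x32(uint64_t d) {
  return (d << 32) | (d >> 32);
}

// REV64 with 16-bit elements: reverse the four halfwords inside the doubleword.
uint64_t rev64_4x16(uint64_t d) {
  uint64_t e0 = (d >>  0) & 0xFFFF, e1 = (d >> 16) & 0xFFFF;
  uint64_t e2 = (d >> 32) & 0xFFFF, e3 = (d >> 48) & 0xFFFF;
  return (e0 << 48) | (e1 << 32) | (e2 << 16) | e3;
}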
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; +} + +def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), + (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; +def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), + (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), - (TC_RETURNdi texternalsym:$dst, imm:$FPDiff)>; - -//===----------------------------------------------------------------------===// -// Thread local storage -//===----------------------------------------------------------------------===// - -// This is a pseudo-instruction representing the ".tlsdesccall" directive in -// assembly. Its effect is to insert an R_AARCH64_TLSDESC_CALL relocation at the -// current location. It should always be immediately followed by a BLR -// instruction, and is intended solely for relaxation by the linker. - -def : Pat<(A64threadpointer), (MRSxi 0xde82)>; - -def TLSDESCCALL : PseudoInst<(outs), (ins i64imm:$Lbl), []> { - let hasSideEffects = 1; -} - -def TLSDESC_BLRx : PseudoInst<(outs), (ins GPR64:$Rn, i64imm:$Var), - [(A64tlsdesc_blr i64:$Rn, tglobaltlsaddr:$Var)]> { - let isCall = 1; - let Defs = [X30]; -} - -def : Pat<(A64tlsdesc_blr i64:$Rn, texternalsym:$Var), - (TLSDESC_BLRx $Rn, texternalsym:$Var)>; - -//===----------------------------------------------------------------------===// -// Bitfield patterns -//===----------------------------------------------------------------------===// - -def bfi32_lsb_to_immr : SDNodeXFormgetTargetConstant((32 - N->getZExtValue()) % 32, MVT::i64); -}]>; - -def bfi64_lsb_to_immr : SDNodeXFormgetTargetConstant((64 - N->getZExtValue()) % 64, MVT::i64); -}]>; - -def bfi_width_to_imms : SDNodeXFormgetTargetConstant(N->getZExtValue() - 1, MVT::i64); -}]>; - - -// The simpler patterns deal with cases where no AND mask is actually needed -// (either all bits are used or the low 32 bits are used). -let AddedComplexity = 10 in { - -def : Pat<(A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS), - (BFIxxii $src, $Rn, - (bfi64_lsb_to_immr (i64 imm:$ImmR)), - (bfi_width_to_imms (i64 imm:$ImmS)))>; - -def : Pat<(A64Bfi i32:$src, i32:$Rn, imm:$ImmR, imm:$ImmS), - (BFIwwii $src, $Rn, - (bfi32_lsb_to_immr (i64 imm:$ImmR)), - (bfi_width_to_imms (i64 imm:$ImmS)))>; - - -def : Pat<(and (A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS), - (i64 4294967295)), - (SUBREG_TO_REG (i64 0), - (BFIwwii (EXTRACT_SUBREG $src, sub_32), - (EXTRACT_SUBREG $Rn, sub_32), - (bfi32_lsb_to_immr (i64 imm:$ImmR)), - (bfi_width_to_imms (i64 imm:$ImmS))), - sub_32)>; - -} - -//===----------------------------------------------------------------------===// -// Miscellaneous patterns -//===----------------------------------------------------------------------===// - -// Truncation from 64 to 32-bits just involves renaming your register. -def : Pat<(i32 (trunc i64:$val)), (EXTRACT_SUBREG $val, sub_32)>; - -// Similarly, extension where we don't care about the high bits is -// just a rename. -def : Pat<(i64 (anyext i32:$val)), - (INSERT_SUBREG (IMPLICIT_DEF), $val, sub_32)>; - -// SELECT instructions providing f128 types need to be handled by a -// pseudo-instruction since the eventual code will need to introduce basic -// blocks and control flow. 
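As the comment above notes, an f128 select cannot be expressed as a single conditional-select instruction and has to be expanded with real control flow, which is why F128CSEL below is a pseudo with a custom inserter. A hedged C++ sketch of the source shape involved, assuming a target where long double is the IEEE binary128 type (as on AArch64 Linux):

// Selecting between two f128 values; with no f128 csel available, codegen
// expands the pseudo into a compare-and-branch diamond.
long double pick(bool cond, long double a, long double b) {
  return cond ? a : b;
}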
-def F128CSEL : PseudoInst<(outs FPR128:$Rd), - (ins FPR128:$Rn, FPR128:$Rm, cond_code_op:$Cond), - [(set f128:$Rd, (simple_select f128:$Rn, f128:$Rm))]> { - let Uses = [NZCV]; - let usesCustomInserter = 1; -} - -//===----------------------------------------------------------------------===// -// Load/store patterns -//===----------------------------------------------------------------------===// - -// There are lots of patterns here, because we need to allow at least three -// parameters to vary independently. -// 1. Instruction: "ldrb w9, [sp]", "ldrh w9, [sp]", ... -// 2. LLVM source: zextloadi8, anyextloadi8, ... -// 3. Address-generation: A64Wrapper, (add BASE, OFFSET), ... -// -// The biggest problem turns out to be the address-generation variable. At the -// point of instantiation we need to produce two DAGs, one for the pattern and -// one for the instruction. Doing this at the lowest level of classes doesn't -// work. -// -// Consider the simple uimm12 addressing mode, and the desire to match both (add -// GPR64xsp:$Rn, uimm12:$Offset) and GPR64xsp:$Rn, particularly on the -// instruction side. We'd need to insert either "GPR64xsp" and "uimm12" or -// "GPR64xsp" and "0" into an unknown dag. !subst is not capable of this -// operation, and PatFrags are for selection not output. -// -// As a result, the address-generation patterns are the final -// instantiations. However, we do still need to vary the operand for the address -// further down (At the point we're deciding A64WrapperSmall, we don't know -// the memory width of the operation). - -//===------------------------------ -// 1. Basic infrastructural defs -//===------------------------------ - -// First, some simple classes for !foreach and !subst to use: -class Decls { - dag pattern; -} - -def decls : Decls; -def ALIGN; -def INST; -def OFFSET; -def SHIFT; - -// You can't use !subst on an actual immediate, but you *can* use it on an -// operand record that happens to match a single immediate. So we do. -def imm_eq0 : ImmLeaf; -def imm_eq1 : ImmLeaf; -def imm_eq2 : ImmLeaf; -def imm_eq3 : ImmLeaf; -def imm_eq4 : ImmLeaf; - -// If the low bits of a pointer are known to be 0 then an "or" is just as good -// as addition for computing an offset. This fragment forwards that check for -// TableGen's use. -def add_like_or : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs), -[{ - return CurDAG->isBaseWithConstantOffset(SDValue(N, 0)); -}]>; - -// Load/store (unsigned immediate) operations with relocations against global -// symbols (for lo12) are only valid if those symbols have correct alignment -// (since the immediate offset is divided by the access scale, it can't have a -// remainder). -// -// The guaranteed alignment is provided as part of the WrapperSmall -// operation, and checked against one of these. -def any_align : ImmLeaf; -def min_align2 : ImmLeaf= 2; }]>; -def min_align4 : ImmLeaf= 4; }]>; -def min_align8 : ImmLeaf= 8; }]>; -def min_align16 : ImmLeaf= 16; }]>; - -// "Normal" load/store instructions can be used on atomic operations, provided -// the ordering parameter is at most "monotonic". Anything above that needs -// special handling with acquire/release instructions. 
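The comment above draws the line at "monotonic" ordering: anything at or below it can use the ordinary load/store instructions selected in this section, while stronger orderings go through the acquire/release instructions handled elsewhere. A small C++ sketch of that distinction (the assembly remarks are the usual expectation, not a guarantee for every subtarget):

#include <atomic>

int load_relaxed(const std::atomic<int> &a) {
  return a.load(std::memory_order_relaxed);  // monotonic: a plain ldr is enough
}

int load_acquire(const std::atomic<int> &a) {
  return a.load(std::memory_order_acquire);  // needs ldar; not covered by these patterns
}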
-class simple_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - return cast(N)->getOrdering() <= Monotonic; -}]>; - -def atomic_load_simple_i8 : simple_load; -def atomic_load_simple_i16 : simple_load; -def atomic_load_simple_i32 : simple_load; -def atomic_load_simple_i64 : simple_load; - -class simple_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - return cast(N)->getOrdering() <= Monotonic; -}]>; - -def atomic_store_simple_i8 : simple_store; -def atomic_store_simple_i16 : simple_store; -def atomic_store_simple_i32 : simple_store; -def atomic_store_simple_i64 : simple_store; - -//===------------------------------ -// 2. UImm12 and SImm9 -//===------------------------------ - -// These instructions have two operands providing the address so they can be -// treated similarly for most purposes. - -//===------------------------------ -// 2.1 Base patterns covering extend/truncate semantics -//===------------------------------ - -// Atomic patterns can be shared between integer operations of all sizes, a -// quick multiclass here allows reuse. -multiclass ls_atomic_pats { - def : Pat<(!cast("atomic_load_simple_" # sty) address), - (LOAD Base, Offset)>; - - def : Pat<(!cast("atomic_store_simple_" # sty) address, transty:$Rt), - (STORE $Rt, Base, Offset)>; -} - -// Instructions accessing a memory chunk smaller than a register (or, in a -// pinch, the same size) have a characteristic set of patterns they want to -// match: extending loads and truncating stores. This class deals with the -// sign-neutral version of those patterns. -// -// It will be instantiated across multiple addressing-modes. -multiclass ls_small_pats - : ls_atomic_pats { - def : Pat<(!cast(zextload # sty) address), (LOAD Base, Offset)>; - - def : Pat<(!cast(extload # sty) address), (LOAD Base, Offset)>; - - // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit - // register was actually set. - def : Pat<(i64 (!cast(zextload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>; - - def : Pat<(i64 (!cast(extload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>; - - def : Pat<(!cast(truncstore # sty) i32:$Rt, address), - (STORE $Rt, Base, Offset)>; - - // For truncating store from 64-bits, we have to manually tell LLVM to - // ignore the high bits of the x register. - def : Pat<(!cast(truncstore # sty) i64:$Rt, address), - (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>; -} - -// Next come patterns for sign-extending loads. -multiclass load_signed_pats { - def : Pat<(i32 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "w" # U) Base, Offset)>; - - def : Pat<(i64 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "x" # U) Base, Offset)>; - -} - -// and finally "natural-width" loads and stores come next. -multiclass ls_neutral_pats { - def : Pat<(sty (load address)), (LOAD Base, Offset)>; - def : Pat<(store sty:$Rt, address), (STORE $Rt, Base, Offset)>; -} - -// Integer operations also get atomic instructions to select for. -multiclass ls_int_neutral_pats - : ls_neutral_pats, - ls_atomic_pats; - -//===------------------------------ -// 2.2. 
Addressing-mode instantiations -//===------------------------------ - -multiclass uimm12_pats { - defm : ls_small_pats; - defm : ls_small_pats; - defm : ls_small_pats; - - defm : ls_int_neutral_pats; - - defm : ls_int_neutral_pats; - - defm : ls_neutral_pats; - - defm : ls_neutral_pats; - - defm : ls_neutral_pats; - - defm : ls_neutral_pats; - - defm : load_signed_pats<"B", "", Base, - !foreach(decls.pattern, Offset, - !subst(OFFSET, byte_uimm12, decls.pattern)), - !foreach(decls.pattern, address, - !subst(OFFSET, byte_uimm12, - !subst(ALIGN, any_align, decls.pattern))), - i8>; - - defm : load_signed_pats<"H", "", Base, - !foreach(decls.pattern, Offset, - !subst(OFFSET, hword_uimm12, decls.pattern)), - !foreach(decls.pattern, address, - !subst(OFFSET, hword_uimm12, - !subst(ALIGN, min_align2, decls.pattern))), - i16>; - - def : Pat<(sextloadi32 !foreach(decls.pattern, address, - !subst(OFFSET, word_uimm12, - !subst(ALIGN, min_align4, decls.pattern)))), - (LDRSWx Base, !foreach(decls.pattern, Offset, - !subst(OFFSET, word_uimm12, decls.pattern)))>; -} - -// Straightforward patterns of last resort: a pointer with or without an -// appropriate offset. -defm : uimm12_pats<(i64 i64:$Rn), (i64 i64:$Rn), (i64 0)>; -defm : uimm12_pats<(add i64:$Rn, OFFSET:$UImm12), - (i64 i64:$Rn), (i64 OFFSET:$UImm12)>; - -// The offset could be hidden behind an "or", of course: -defm : uimm12_pats<(add_like_or i64:$Rn, OFFSET:$UImm12), - (i64 i64:$Rn), (i64 OFFSET:$UImm12)>; - -// Global addresses under the small-absolute model should use these -// instructions. There are ELF relocations specifically for it. -defm : uimm12_pats<(A64WrapperSmall tglobaladdr:$Hi, tglobaladdr:$Lo12, ALIGN), - (ADRPxi tglobaladdr:$Hi), (i64 tglobaladdr:$Lo12)>; - -defm : uimm12_pats<(A64WrapperSmall tglobaltlsaddr:$Hi, tglobaltlsaddr:$Lo12, - ALIGN), - (ADRPxi tglobaltlsaddr:$Hi), (i64 tglobaltlsaddr:$Lo12)>; - -// External symbols that make it this far should also get standard relocations. -defm : uimm12_pats<(A64WrapperSmall texternalsym:$Hi, texternalsym:$Lo12, - ALIGN), - (ADRPxi texternalsym:$Hi), (i64 texternalsym:$Lo12)>; - -defm : uimm12_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN), - (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>; - -// We also want to use uimm12 instructions for local variables at the moment. -def tframeindex_XFORM : SDNodeXForm(N)->getIndex(); - return CurDAG->getTargetFrameIndex(FI, MVT::i64); -}]>; - -defm : uimm12_pats<(i64 frameindex:$Rn), - (tframeindex_XFORM tframeindex:$Rn), (i64 0)>; - -// These can be much simpler than uimm12 because we don't to change the operand -// type (e.g. LDURB and LDURH take the same operands). 
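To make the contrast concrete: the uimm12 forms above encode a non-negative offset that is a multiple of the access size, up to 4095 accesses' worth, while the simm9 (LDUR/STUR) forms below take any byte offset in [-256, 255]. A small C++ sketch of those encodability checks, with made-up helper names:

#include <cstdint>

// Scaled unsigned 12-bit offset, as used by LDR/STR with an immediate.
bool fitsScaledUImm12(int64_t offset, int64_t accessBytes) {
  return offset >= 0 && offset % accessBytes == 0 && offset / accessBytes < 4096;
}

// Unscaled signed 9-bit offset, as used by LDUR/STUR.
bool fitsUnscaledSImm9(int64_t offset) {
  return offset >= -256 && offset <= 255;
}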
-multiclass simm9_pats { - defm : ls_small_pats; - defm : ls_small_pats; - - defm : ls_int_neutral_pats; - defm : ls_int_neutral_pats; - - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - - def : Pat<(i64 (zextloadi32 address)), - (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>; - - def : Pat<(truncstorei32 i64:$Rt, address), - (LS32_STUR (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>; - - defm : load_signed_pats<"B", "_U", Base, Offset, address, i8>; - defm : load_signed_pats<"H", "_U", Base, Offset, address, i16>; - def : Pat<(sextloadi32 address), (LDURSWx Base, Offset)>; -} - -defm : simm9_pats<(add i64:$Rn, simm9:$SImm9), - (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>; - -defm : simm9_pats<(add_like_or i64:$Rn, simm9:$SImm9), - (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>; - - -//===------------------------------ -// 3. Register offset patterns -//===------------------------------ - -// Atomic patterns can be shared between integer operations of all sizes, a -// quick multiclass here allows reuse. -multiclass ro_atomic_pats { - def : Pat<(!cast("atomic_load_simple_" # sty) address), - (LOAD Base, Offset, Extend)>; - - def : Pat<(!cast("atomic_store_simple_" # sty) address, transty:$Rt), - (STORE $Rt, Base, Offset, Extend)>; -} - -// The register offset instructions take three operands giving the instruction, -// and have an annoying split between instructions where Rm is 32-bit and -// 64-bit. So we need a special hierarchy to describe them. Other than that the -// same operations should be supported as for simm9 and uimm12 addressing. - -multiclass ro_small_pats - : ro_atomic_pats { - def : Pat<(!cast(zextload # sty) address), - (LOAD Base, Offset, Extend)>; - - def : Pat<(!cast(extload # sty) address), - (LOAD Base, Offset, Extend)>; - - // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit - // register was actually set. - def : Pat<(i64 (!cast(zextload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>; - - def : Pat<(i64 (!cast(extload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>; - - def : Pat<(!cast(truncstore # sty) i32:$Rt, address), - (STORE $Rt, Base, Offset, Extend)>; - - // For truncating store from 64-bits, we have to manually tell LLVM to - // ignore the high bits of the x register. - def : Pat<(!cast(truncstore # sty) i64:$Rt, address), - (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset, Extend)>; - -} - -// Next come patterns for sign-extending loads. -multiclass ro_signed_pats { - def : Pat<(i32 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "w_" # Rm # "_RegOffset") - Base, Offset, Extend)>; - - def : Pat<(i64 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "x_" # Rm # "_RegOffset") - Base, Offset, Extend)>; -} - -// and finally "natural-width" loads and stores come next. 
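The *_small_pats and *_signed_pats multiclasses above exist to cover extending loads and truncating stores, i.e. memory accesses narrower than the register they feed or drain. A brief C++ sketch of those source shapes (the instruction names in the comments are the usual selections, given for orientation only):

#include <cstdint>

uint64_t widen_unsigned(const uint8_t *p) {
  return *p;  // zero-extending byte load, e.g. ldrb
}

int64_t widen_signed(const int16_t *p) {
  return *p;  // sign-extending halfword load, e.g. ldrsh
}

void narrow_store(uint8_t *p, uint64_t v) {
  *p = static_cast<uint8_t>(v);  // truncating store of the low byte, e.g. strb
}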
-multiclass ro_neutral_pats { - def : Pat<(sty (load address)), (LOAD Base, Offset, Extend)>; - def : Pat<(store sty:$Rt, address), - (STORE $Rt, Base, Offset, Extend)>; -} - -multiclass ro_int_neutral_pats - : ro_neutral_pats, - ro_atomic_pats; - -multiclass regoff_pats { - defm : ro_small_pats("LS8_" # Rm # "_RegOffset_LDR"), - !cast("LS8_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq0, decls.pattern)), - i8>; - defm : ro_small_pats("LS16_" # Rm # "_RegOffset_LDR"), - !cast("LS16_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq1, decls.pattern)), - i16>; - defm : ro_small_pats("LS32_" # Rm # "_RegOffset_LDR"), - !cast("LS32_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern)), - i32>; - - defm : ro_int_neutral_pats< - !cast("LS32_" # Rm # "_RegOffset_LDR"), - !cast("LS32_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern)), - i32>; - - defm : ro_int_neutral_pats< - !cast("LS64_" # Rm # "_RegOffset_LDR"), - !cast("LS64_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq3, decls.pattern)), - i64>; - - defm : ro_neutral_pats("LSFP16_" # Rm # "_RegOffset_LDR"), - !cast("LSFP16_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq1, decls.pattern)), - f16>; - - defm : ro_neutral_pats("LSFP32_" # Rm # "_RegOffset_LDR"), - !cast("LSFP32_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern)), - f32>; - - defm : ro_neutral_pats("LSFP64_" # Rm # "_RegOffset_LDR"), - !cast("LSFP64_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq3, decls.pattern)), - f64>; - - defm : ro_neutral_pats("LSFP128_" # Rm # "_RegOffset_LDR"), - !cast("LSFP128_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq4, decls.pattern)), - f128>; - - defm : ro_signed_pats<"B", Rm, Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq0, decls.pattern)), - i8>; - - defm : ro_signed_pats<"H", Rm, Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq1, decls.pattern)), - i16>; - - def : Pat<(sextloadi32 !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern))), - (!cast("LDRSWx_" # Rm # "_RegOffset") - Base, Offset, Extend)>; -} - - -// Finally we're in a position to tell LLVM exactly what addresses are reachable -// using register-offset instructions. Essentially a base plus a possibly -// extended, possibly shifted (by access size) offset. 
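The register-offset instantiations that follow describe effective addresses of the form base plus an optionally extended, optionally shifted index, where the shift is the log2 of the access size. A rough C++ sketch of the three shapes involved, illustrating the address arithmetic only:

#include <cstdint>

// "sxtw #shift": 32-bit index, sign extended, scaled by the access size.
uint64_t ea_sxtw_shifted(uint64_t base, int32_t index, unsigned accessShift) {
  return base + (uint64_t(int64_t(index)) << accessShift);
}

// "uxtw": 32-bit index, zero extended, unscaled.
uint64_t ea_uxtw(uint64_t base, uint32_t index) {
  return base + index;
}

// "lsl #shift": 64-bit index scaled by the access size.
uint64_t ea_xreg_shifted(uint64_t base, uint64_t index, unsigned accessShift) {
  return base + (index << accessShift);
}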
- -defm : regoff_pats<"Wm", (add i64:$Rn, (sext i32:$Rm)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 6)>; - -defm : regoff_pats<"Wm", (add i64:$Rn, (shl (sext i32:$Rm), SHIFT)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 7)>; - -defm : regoff_pats<"Wm", (add i64:$Rn, (zext i32:$Rm)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 2)>; - -defm : regoff_pats<"Wm", (add i64:$Rn, (shl (zext i32:$Rm), SHIFT)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 3)>; - -defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm), - (i64 i64:$Rn), (i64 i64:$Rm), (i64 2)>; - -defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)), - (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) Support -// + (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; -include "AArch64InstrNEON.td" +include "AArch64InstrAtomics.td" diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td deleted file mode 100644 index 01a59a1a6a8b..000000000000 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ /dev/null @@ -1,9474 +0,0 @@ -//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the AArch64 NEON instruction set. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// NEON-specific DAG Nodes. -//===----------------------------------------------------------------------===// - -// (outs Result), (ins Imm, OpCmode) -def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>; - -def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>; - -def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>; - -// (outs Result), (ins Imm) -def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1, - [SDTCisVec<0>, SDTCisVT<1, i32>]>>; - -// (outs Result), (ins LHS, RHS, CondCode) -def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; - -// (outs Result), (ins LHS, 0/0.0 constant, CondCode) -def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisVec<1>]>>; - -// (outs Result), (ins LHS, RHS) -def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2, - [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; - -def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; -def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; -def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; - -def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>; -def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>; -def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>; -def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>; -def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>; -def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>; - -def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; -def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>; -def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>; -def Neon_rev16 : 
SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>; -def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1, - [SDTCisVec<0>]>>; -def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2, - [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>; -def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>; - -//===----------------------------------------------------------------------===// -// Addressing-mode instantiations -//===----------------------------------------------------------------------===// - -multiclass ls_64_pats { -defm : ls_neutral_pats; -} - -multiclass ls_128_pats { -defm : ls_neutral_pats; -} - -multiclass uimm12_neon_pats { - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; -} - -defm : uimm12_neon_pats<(A64WrapperSmall - tconstpool:$Hi, tconstpool:$Lo12, ALIGN), - (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>; - -//===----------------------------------------------------------------------===// -// Multiclasses -//===----------------------------------------------------------------------===// - -multiclass NeonI_3VSame_B_sizes size, bits<5> opcode, - string asmop, SDPatternOperator opnode8B, - SDPatternOperator opnode16B, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _8B : NeonI_3VSame<0b0, u, size, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", - [(set (v8i8 VPR64:$Rd), - (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _16B : NeonI_3VSame<0b1, u, size, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", - [(set (v16i8 VPR128:$Rd), - (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } - -} - -multiclass NeonI_3VSame_HS_sizes opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _4H : NeonI_3VSame<0b0, u, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h", - [(set (v4i16 VPR64:$Rd), - (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _8H : NeonI_3VSame<0b1, u, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h", - [(set (v8i16 VPR128:$Rd), - (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _2S : NeonI_3VSame<0b0, u, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _4S : NeonI_3VSame<0b1, u, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} -multiclass NeonI_3VSame_BHS_sizes opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> - : NeonI_3VSame_HS_sizes { - 
let isCommutable = Commutable in { - def _8B : NeonI_3VSame<0b0, u, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", - [(set (v8i8 VPR64:$Rd), - (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _16B : NeonI_3VSame<0b1, u, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", - [(set (v16i8 VPR128:$Rd), - (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -multiclass NeonI_3VSame_BHSD_sizes opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> - : NeonI_3VSame_BHS_sizes { - let isCommutable = Commutable in { - def _2D : NeonI_3VSame<0b1, u, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", - [(set (v2i64 VPR128:$Rd), - (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types, -// but Result types can be integer or floating point types. -multiclass NeonI_3VSame_SD_sizes opcode, - string asmop, SDPatternOperator opnode, - ValueType ResTy2S, ValueType ResTy4S, - ValueType ResTy2D, bit Commutable = 0> { - let isCommutable = Commutable in { - def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", - [(set (ResTy2S VPR64:$Rd), - (ResTy2S (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", - [(set (ResTy4S VPR128:$Rd), - (ResTy4S (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", - [(set (ResTy2D VPR128:$Rd), - (ResTy2D (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -//===----------------------------------------------------------------------===// -// Instruction Definitions -//===----------------------------------------------------------------------===// - -// Vector Arithmetic Instructions - -// Vector Add (Integer and Floating-Point) - -defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>; -defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, - v2f32, v4f32, v2f64, 1>; - -// Patterns to match add of v1i8/v1i16/v1i32 types -def : Pat<(v1i8 (add FPR8:$Rn, FPR8:$Rm)), - (EXTRACT_SUBREG - (ADDvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (add FPR16:$Rn, FPR16:$Rm)), - (EXTRACT_SUBREG - (ADDvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (add FPR32:$Rn, FPR32:$Rm)), - (EXTRACT_SUBREG - (ADDvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Vector Sub (Integer and Floating-Point) - -defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>; -defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, - v2f32, 
v4f32, v2f64, 0>; - -// Patterns to match sub of v1i8/v1i16/v1i32 types -def : Pat<(v1i8 (sub FPR8:$Rn, FPR8:$Rm)), - (EXTRACT_SUBREG - (SUBvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (sub FPR16:$Rn, FPR16:$Rm)), - (EXTRACT_SUBREG - (SUBvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (sub FPR32:$Rn, FPR32:$Rm)), - (EXTRACT_SUBREG - (SUBvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Vector Multiply (Integer and Floating-Point) - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>; -defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, - v2f32, v4f32, v2f64, 1>; -} - -// Patterns to match mul of v1i8/v1i16/v1i32 types -def : Pat<(v1i8 (mul FPR8:$Rn, FPR8:$Rm)), - (EXTRACT_SUBREG - (MULvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (mul FPR16:$Rn, FPR16:$Rm)), - (EXTRACT_SUBREG - (MULvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (mul FPR32:$Rn, FPR32:$Rm)), - (EXTRACT_SUBREG - (MULvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Vector Multiply (Polynomial) - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul", - int_arm_neon_vmulp, int_arm_neon_vmulp, 1>; -} - -// Vector Multiply-accumulate and Multiply-subtract (Integer) - -// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and -// two operands constraints. 
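The Neon_mla and Neon_mls fragments defined just below describe a destructive, lane-wise multiply-accumulate, which is why the instruction class ties $src to $Rd as the comment above explains. A scalar-loop C++ sketch of that per-lane behaviour, for illustration only:

#include <cstddef>
#include <cstdint>

void mla_lanes(int32_t *acc, const int32_t *n, const int32_t *m, size_t lanes) {
  for (size_t i = 0; i < lanes; ++i)
    acc[i] += n[i] * m[i];  // MLA: Rd = Rd + Rn * Rm, lane by lane
}

void mls_lanes(int32_t *acc, const int32_t *n, const int32_t *m, size_t lanes) {
  for (size_t i = 0; i < lanes; ++i)
    acc[i] -= n[i] * m[i];  // MLS: Rd = Rd - Rn * Rm, lane by lane
}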
-class NeonI_3VSame_Constraint_impl size, - bits<5> opcode, SDPatternOperator opnode> - : NeonI_3VSame, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (add node:$Ra, (mul node:$Rn, node:$Rm))>; - -def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (sub node:$Ra, (mul node:$Rn, node:$Rm))>; - - -let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in { -def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8, - 0b0, 0b0, 0b00, 0b10010, Neon_mla>; -def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8, - 0b1, 0b0, 0b00, 0b10010, Neon_mla>; -def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16, - 0b0, 0b0, 0b01, 0b10010, Neon_mla>; -def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16, - 0b1, 0b0, 0b01, 0b10010, Neon_mla>; -def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32, - 0b0, 0b0, 0b10, 0b10010, Neon_mla>; -def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32, - 0b1, 0b0, 0b10, 0b10010, Neon_mla>; - -def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b00, 0b10010, Neon_mls>; -def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b00, 0b10010, Neon_mls>; -def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16, - 0b0, 0b1, 0b01, 0b10010, Neon_mls>; -def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16, - 0b1, 0b1, 0b01, 0b10010, Neon_mls>; -def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32, - 0b0, 0b1, 0b10, 0b10010, Neon_mls>; -def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32, - 0b1, 0b1, 0b10, 0b10010, Neon_mls>; -} - -// Vector Multiply-accumulate and Multiply-subtract (Floating Point) - -def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (fadd node:$Ra, (fmul_su node:$Rn, node:$Rm))>; - -def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (fsub node:$Ra, (fmul_su node:$Rn, node:$Rm))>; - -let Predicates = [HasNEON, UseFusedMAC], - SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in { -def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32, - 0b0, 0b0, 0b00, 0b11001, Neon_fmla>; -def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32, - 0b1, 0b0, 0b00, 0b11001, Neon_fmla>; -def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64, - 0b1, 0b0, 0b01, 0b11001, Neon_fmla>; - -def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32, - 0b0, 0b0, 0b10, 0b11001, Neon_fmls>; -def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32, - 0b1, 0b0, 0b10, 0b11001, Neon_fmls>; -def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64, - 0b1, 0b0, 0b11, 0b11001, Neon_fmls>; -} - -// We're also allowed to match the fma instruction regardless of compile -// options. 
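The distinction the fma patterns below draw: an explicit fma node always asks for a single-rounding fused multiply-add, so it may use the FMLA/FMLS family regardless of options, whereas fusing a separate fmul and fadd is only done under the UseFusedMAC predicate above. A scalar C++ sketch of the two source shapes (scalar only to keep it short; the patterns here are the vector forms):

#include <cmath>

double fused(double acc, double n, double m) {
  return std::fma(n, m, acc);  // explicit single-rounding FMA; always fusable
}

double maybe_fused(double acc, double n, double m) {
  return acc + n * m;          // separate mul+add; fusing needs permission
}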
-def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)), - (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; -def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), - (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; -def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), - (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; - -def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)), - (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; -def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), - (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; -def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), - (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; - -// Vector Divide (Floating-Point) - -let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in { -defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, - v2f32, v4f32, v2f64, 0>; -} - -// Vector Bitwise Operations - -// Vector Bitwise AND - -defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>; - -// Vector Bitwise Exclusive OR - -defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>; - -// Vector Bitwise OR - -defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>; - -// ORR disassembled as MOV if Vn==Vm - -// Vector Move - register -// Alias for ORR if Vn=Vm. -def : NeonInstAlias<"mov $Rd.8b, $Rn.8b", - (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn)>; -def : NeonInstAlias<"mov $Rd.16b, $Rn.16b", - (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn)>; - -// The MOVI instruction takes two immediate operands. The first is the -// immediate encoding, while the second is the cmode. A cmode of 14, or -// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC. -def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>; -def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>; - -def Neon_not8B : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>; -def Neon_not16B : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>; - -def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm), - (or node:$Rn, (Neon_not8B node:$Rm))>; - -def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm), - (or node:$Rn, (Neon_not16B node:$Rm))>; - -def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm), - (and node:$Rn, (Neon_not8B node:$Rm))>; - -def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm), - (and node:$Rn, (Neon_not16B node:$Rm))>; - - -// Vector Bitwise OR NOT - register - -defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn", - Neon_orn8B, Neon_orn16B, 0>; - -// Vector Bitwise Bit Clear (AND NOT) - register - -defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic", - Neon_bic8B, Neon_bic16B, 0>; - -multiclass Neon_bitwise2V_patterns { - def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$Rn, VPR128:$Rm)>; -} - -// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; -defm : 
Neon_bitwise2V_patterns; - -// Vector Bitwise Select -def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b01, 0b00011, vselect>; - -def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b01, 0b00011, vselect>; - -multiclass Neon_bitwise3V_patterns { - // Disassociate type from instruction definition - def : Pat<(v8i8 (opnode (v8i8 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (opnode (v8i16 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2f64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4f32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - - // Allow to match BSL instruction pattern with non-constant operand - def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - - // Allow to match llvm.arm.* intrinsics. 
- def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src), - (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src), - (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src), - (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src), - (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src), - (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src), - (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src), - (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src), - (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src), - (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src), - (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src), - (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src), - (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; -} - -// Additional patterns for bitwise instruction BSL -defm: Neon_bitwise3V_patterns; - -def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm), - (vselect node:$src, node:$Rn, node:$Rm), - [{ (void)N; return false; }]>; - -// Vector Bitwise Insert if True - -def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>; -def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>; - -// Vector Bitwise Insert if False - -def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>; -def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>; - -// Vector Absolute Difference and Accumulate (Signed, Unsigned) - -def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>; -def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>; - -// Vector Absolute Difference and Accumulate (Unsigned) -def UABAvvv_8B : NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b00, 0b01111, Neon_uaba>; -def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b00, 0b01111, Neon_uaba>; -def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16, - 0b0, 0b1, 0b01, 0b01111, Neon_uaba>; -def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16, - 0b1, 0b1, 0b01, 0b01111, Neon_uaba>; -def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32, - 0b0, 0b1, 0b10, 0b01111, Neon_uaba>; -def UABAvvv_4S : 
NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32, - 0b1, 0b1, 0b10, 0b01111, Neon_uaba>; - -// Vector Absolute Difference and Accumulate (Signed) -def SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8, - 0b0, 0b0, 0b00, 0b01111, Neon_saba>; -def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8, - 0b1, 0b0, 0b00, 0b01111, Neon_saba>; -def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16, - 0b0, 0b0, 0b01, 0b01111, Neon_saba>; -def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16, - 0b1, 0b0, 0b01, 0b01111, Neon_saba>; -def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32, - 0b0, 0b0, 0b10, 0b01111, Neon_saba>; -def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32, - 0b1, 0b0, 0b10, 0b01111, Neon_saba>; - - -// Vector Absolute Difference (Signed, Unsigned) -defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>; -defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>; - -// Vector Absolute Difference (Floating Point) -defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd", - int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>; - -// Vector Reciprocal Step (Floating Point) -defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps", - int_arm_neon_vrecps, - v2f32, v4f32, v2f64, 0>; - -// Vector Reciprocal Square Root Step (Floating Point) -defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", - int_arm_neon_vrsqrts, - v2f32, v4f32, v2f64, 0>; - -// Vector Comparisons - -def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETEQ)>; -def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETUGE)>; -def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETGE)>; -def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETUGT)>; -def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETGT)>; - -// NeonI_compare_aliases class: swaps register operands to implement -// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed. -class NeonI_compare_aliases - : NeonInstAlias; - -// Vector Comparisons (Integer) - -// Vector Compare Mask Equal (Integer) -let isCommutable =1 in { -defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>; -} - -// Vector Compare Mask Higher or Same (Unsigned Integer) -defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>; - -// Vector Compare Mask Greater Than or Equal (Integer) -defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>; - -// Vector Compare Mask Higher (Unsigned Integer) -defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>; - -// Vector Compare Mask Greater Than (Integer) -defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>; - -// Vector Compare Mask Bitwise Test (Integer) -defm CMTSTvvv: NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>; - -// Vector Compare Mask Less or Same (Unsigned Integer) -// CMLS is alias for CMHS with operands reversed. 
-def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>;
-def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>;
-def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>;
-def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>;
-def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>;
-def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>;
-def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>;
-
-// Vector Compare Mask Less Than or Equal (Integer)
-// CMLE is alias for CMGE with operands reversed.
-def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>;
-def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>;
-def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>;
-def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>;
-def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>;
-def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>;
-def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>;
-
-// Vector Compare Mask Lower (Unsigned Integer)
-// CMLO is alias for CMHI with operands reversed.
-def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>;
-def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>;
-def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>;
-def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>;
-def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>;
-def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>;
-def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>;
-
-// Vector Compare Mask Less Than (Integer)
-// CMLT is alias for CMGT with operands reversed.
-def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>; -def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>; -def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>; -def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>; -def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>; -def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>; -def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>; - - -def neon_uimm0_asmoperand : AsmOperandClass -{ - let Name = "UImm0"; - let PredicateMethod = "isUImm<0>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm0 : Operand, ImmLeaf { - let ParserMatchClass = neon_uimm0_asmoperand; - let PrintMethod = "printNeonUImm0Operand"; - -} - -multiclass NeonI_cmpz_sizes opcode, string asmop, CondCode CC> -{ - def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.8b, $Rn.8b, $Imm", - [(set (v8i8 VPR64:$Rd), - (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.16b, $Rn.16b, $Imm", - [(set (v16i8 VPR128:$Rd), - (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.4h, $Rn.4h, $Imm", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.8h, $Rn.8h, $Imm", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.2s, $Rn.2s, $Imm", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.4s, $Rn.4s, $Imm", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.2d, $Rn.2d, $Imm", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -// Vector Compare Mask Equal to Zero (Integer) -defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>; - -// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) -defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>; - -// Vector Compare Mask Greater Than Zero (Signed Integer) -defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>; - -// Vector Compare Mask Less Than or Equal To Zero (Signed Integer) -defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>; - -// Vector Compare Mask Less Than Zero (Signed Integer) -defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>; - -// Vector Comparisons (Floating Point) - -// Vector Compare 
Mask Equal (Floating Point) -let isCommutable =1 in { -defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq, - v2i32, v4i32, v2i64, 0>; -} - -// Vector Compare Mask Greater Than Or Equal (Floating Point) -defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge, - v2i32, v4i32, v2i64, 0>; - -// Vector Compare Mask Greater Than (Floating Point) -defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt, - v2i32, v4i32, v2i64, 0>; - -// Vector Compare Mask Less Than Or Equal (Floating Point) -// FCMLE is alias for FCMGE with operands reversed. -def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>; -def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>; -def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>; - -// Vector Compare Mask Less Than (Floating Point) -// FCMLT is alias for FCMGT with operands reversed. -def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>; -def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>; -def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>; - -def fpzero_izero_asmoperand : AsmOperandClass { - let Name = "FPZeroIZero"; - let ParserMethod = "ParseFPImm0AndImm0Operand"; - let DiagnosticType = "FPZero"; -} - -def fpzz32 : Operand, - ComplexPattern { - let ParserMatchClass = fpzero_izero_asmoperand; - let PrintMethod = "printFPZeroOperand"; - let DecoderMethod = "DecodeFPZeroOperand"; -} - -multiclass NeonI_fpcmpz_sizes opcode, - string asmop, CondCode CC> -{ - def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, fpzz32:$FPImm), - asmop # "\t$Rd.2s, $Rn.2s, $FPImm", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpzz32:$FPImm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm), - asmop # "\t$Rd.4s, $Rn.4s, $FPImm", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm), - asmop # "\t$Rd.2d, $Rn.2d, $FPImm", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -// Vector Compare Mask Equal to Zero (Floating Point) -defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>; - -// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) -defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>; - -// Vector Compare Mask Greater Than Zero (Floating Point) -defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>; - -// Vector Compare Mask Less Than or Equal To Zero (Floating Point) -defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>; - -// Vector Compare Mask Less Than Zero (Floating Point) -defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>; - -// Vector Absolute Comparisons (Floating Point) - -// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point) -defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge", - int_arm_neon_vacge, - v2i32, v4i32, v2i64, 0>; - -// Vector Absolute Compare Mask Greater Than (Floating Point) -defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt", - 
int_arm_neon_vacgt, - v2i32, v4i32, v2i64, 0>; - -// Vector Absolute Compare Mask Less Than Or Equal (Floating Point) -// FACLE is alias for FACGE with operands reversed. -def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>; -def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>; -def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>; - -// Vector Absolute Compare Mask Less Than (Floating Point) -// FACLT is alias for FACGT with operands reversed. -def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>; -def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>; -def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>; - -// Vector halving add (Integer Signed, Unsigned) -defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd", - int_arm_neon_vhadds, 1>; -defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd", - int_arm_neon_vhaddu, 1>; - -// Vector halving sub (Integer Signed, Unsigned) -defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub", - int_arm_neon_vhsubs, 0>; -defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub", - int_arm_neon_vhsubu, 0>; - -// Vector rouding halving add (Integer Signed, Unsigned) -defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd", - int_arm_neon_vrhadds, 1>; -defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd", - int_arm_neon_vrhaddu, 1>; - -// Vector Saturating add (Integer Signed, Unsigned) -defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd", - int_arm_neon_vqadds, 1>; -defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd", - int_arm_neon_vqaddu, 1>; - -// Vector Saturating sub (Integer Signed, Unsigned) -defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub", - int_arm_neon_vqsubs, 1>; -defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub", - int_arm_neon_vqsubu, 1>; - -// Vector Shift Left (Signed and Unsigned Integer) -defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl", - int_arm_neon_vshifts, 1>; -defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl", - int_arm_neon_vshiftu, 1>; - -// Vector Saturating Shift Left (Signed and Unsigned Integer) -defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl", - int_arm_neon_vqshifts, 1>; -defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl", - int_arm_neon_vqshiftu, 1>; - -// Vector Rouding Shift Left (Signed and Unsigned Integer) -defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl", - int_arm_neon_vrshifts, 1>; -defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl", - int_arm_neon_vrshiftu, 1>; - -// Vector Saturating Rouding Shift Left (Signed and Unsigned Integer) -defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl", - int_arm_neon_vqrshifts, 1>; -defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl", - int_arm_neon_vqrshiftu, 1>; - -// Vector Maximum (Signed and Unsigned Integer) -defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>; -defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>; - -// Vector Minimum (Signed and Unsigned Integer) -defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>; -defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>; - -// Vector Maximum (Floating Point) -defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax", - int_arm_neon_vmaxs, - v2f32, v4f32, v2f64, 1>; - -// Vector Minimum 
(Floating Point) -defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin", - int_arm_neon_vmins, - v2f32, v4f32, v2f64, 1>; - -// Vector maxNum (Floating Point) - prefer a number over a quiet NaN) -defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm", - int_aarch64_neon_vmaxnm, - v2f32, v4f32, v2f64, 1>; - -// Vector minNum (Floating Point) - prefer a number over a quiet NaN) -defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm", - int_aarch64_neon_vminnm, - v2f32, v4f32, v2f64, 1>; - -// Vector Maximum Pairwise (Signed and Unsigned Integer) -defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>; -defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>; - -// Vector Minimum Pairwise (Signed and Unsigned Integer) -defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>; -defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>; - -// Vector Maximum Pairwise (Floating Point) -defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp", - int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>; - -// Vector Minimum Pairwise (Floating Point) -defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp", - int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>; - -// Vector maxNum Pairwise (Floating Point) - prefer a number over a quiet NaN) -defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp", - int_aarch64_neon_vpmaxnm, - v2f32, v4f32, v2f64, 1>; - -// Vector minNum Pairwise (Floating Point) - prefer a number over a quiet NaN) -defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp", - int_aarch64_neon_vpminnm, - v2f32, v4f32, v2f64, 1>; - -// Vector Addition Pairwise (Integer) -defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>; - -// Vector Addition Pairwise (Floating Point) -defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp", - int_arm_neon_vpadd, - v2f32, v4f32, v2f64, 1>; - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -// Vector Saturating Doubling Multiply High -defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh", - int_arm_neon_vqdmulh, 1>; - -// Vector Saturating Rouding Doubling Multiply High -defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh", - int_arm_neon_vqrdmulh, 1>; - -// Vector Multiply Extended (Floating Point) -defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx", - int_aarch64_neon_vmulx, - v2f32, v4f32, v2f64, 1>; -} - -// Patterns to match llvm.aarch64.* intrinsic for -// ADDP, SMINP, UMINP, SMAXP, UMAXP having i32 as output -class Neon_VectorPair_v2i32_pattern - : Pat<(v1i32 (opnode (v2i32 VPR64:$Rn))), - (EXTRACT_SUBREG - (v2i32 (INST (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rn))), - sub_32)>; - -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; - -// Vector Immediate Instructions - -multiclass neon_mov_imm_shift_asmoperands -{ - def _asmoperand : AsmOperandClass - { - let Name = "NeonMovImmShift" # PREFIX; - let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands"; - let PredicateMethod = "isNeonMovImmShift" # PREFIX; - } -} - -// Definition of vector immediates shift operands - -// The selectable use-cases extract the shift operation -// information from the OpCmode fields encoded in the immediate. 
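As a rough standalone illustration of what that extraction amounts to (this sketch is not the A64Imms::decodeNeonModShiftImm helper called in the XForm below; it just writes out the standard AdvSIMD modified-immediate table in C++, with names invented here): a cmode of 0xx? selects a 32-bit LSL of 0/8/16/24, 10x? a 16-bit LSL of 0/8, 110? an MSL of 8/16 with ones shifted in, and 1110/1111 carry no shift operator at all.

#include <cassert>
#include <cstdint>

// Result of decoding the 4-bit cmode (OpCmode) field of a modified immediate.
struct NeonModShift {
  bool HasShift;      // false for cmode 1110/1111 (plain MOVI / FMOV forms)
  unsigned ShiftImm;  // shift amount in bits
  bool ShiftOnesIn;   // true for MSL (ones shifted in), false for LSL
};

// Illustrative decoder, assuming the standard AdvSIMD modified-immediate table.
static NeonModShift decodeModShift(uint32_t OpCmode) {
  OpCmode &= 0xf;
  if ((OpCmode & 0x8) == 0)                        // 0xx?: 32-bit, LSL #0/8/16/24
    return {true, ((OpCmode >> 1) & 0x3) * 8u, false};
  if ((OpCmode & 0xc) == 0x8)                      // 10x?: 16-bit, LSL #0/8
    return {true, ((OpCmode >> 1) & 0x1) * 8u, false};
  if ((OpCmode & 0xe) == 0xc)                      // 110?: 32-bit, MSL #8/16
    return {true, ((OpCmode & 0x1) + 1) * 8u, true};
  return {false, 0, false};                        // 1110/1111: no shift operator
}

int main() {
  assert(!decodeModShift(0xe).HasShift);                 // cmode 1110: plain MOVI
  assert(decodeModShift(0x4).ShiftImm == 16);            // cmode 0100: LSL #16
  assert(decodeModShift(0xd).ShiftOnesIn &&
         decodeModShift(0xd).ShiftImm == 16);            // cmode 1101: MSL #16
  return 0;
}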
-def neon_mod_shift_imm_XFORM : SDNodeXFormgetZExtValue(); - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); - if (!HasShift) return SDValue(); - return CurDAG->getTargetConstant(ShiftImm, MVT::i32); -}]>; - -// Vector immediates shift operands which accept LSL and MSL -// shift operators with shift value in the range of 0, 8, 16, 24 (LSL), -// or 0, 8 (LSLH) or 8, 16 (MSL). -defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">; -defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">; -// LSLH restricts shift amount to 0, 8 out of 0, 8, 16, 24 -defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">; - -multiclass neon_mov_imm_shift_operands -{ - def _operand : Operand, ImmLeaf - { - let PrintMethod = - "printNeonMovImmShiftOperand"; - let DecoderMethod = - "DecodeNeonMovImmShiftOperand"; - let ParserMatchClass = - !cast("neon_mov_imm_" # PREFIX # HALF # "_asmoperand"); - } -} - -defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{ - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); - return (HasShift && !ShiftOnesIn); -}]>; - -defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{ - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); - return (HasShift && ShiftOnesIn); -}]>; - -defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{ - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); - return (HasShift && !ShiftOnesIn); -}]>; - -def neon_uimm1_asmoperand : AsmOperandClass -{ - let Name = "UImm1"; - let PredicateMethod = "isUImm<1>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm2_asmoperand : AsmOperandClass -{ - let Name = "UImm2"; - let PredicateMethod = "isUImm<2>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm8_asmoperand : AsmOperandClass -{ - let Name = "UImm8"; - let PredicateMethod = "isUImm<8>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm8 : Operand, ImmLeaf { - let ParserMatchClass = neon_uimm8_asmoperand; - let PrintMethod = "printUImmHexOperand"; -} - -def neon_uimm64_mask_asmoperand : AsmOperandClass -{ - let Name = "NeonUImm64Mask"; - let PredicateMethod = "isNeonUImm64Mask"; - let RenderMethod = "addNeonUImm64MaskOperands"; -} - -// MCOperand for 64-bit bytemask with each byte having only the -// value 0x00 and 0xff is encoded as an unsigned 8-bit value -def neon_uimm64_mask : Operand, ImmLeaf { - let ParserMatchClass = neon_uimm64_mask_asmoperand; - let PrintMethod = "printNeonUImm64MaskOperand"; -} - -multiclass NeonI_mov_imm_lsl_sizes -{ - // shift zeros, per word - def _2S : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (timm:$Imm), - (neon_mov_imm_LSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bits<2> Simm; - let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; - } - - def _4S : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (timm:$Imm), - (neon_mov_imm_LSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bits<2> Simm; - let cmode = 
{0b0, Simm{1}, Simm{0}, 0b0}; - } - - // shift zeros, per halfword - def _4H : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), - [(set (v4i16 VPR64:$Rd), - (v4i16 (opnode (timm:$Imm), - (neon_mov_imm_LSLH_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b0}; - } - - def _8H : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), - [(set (v8i16 VPR128:$Rd), - (v8i16 (opnode (timm:$Imm), - (neon_mov_imm_LSLH_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b0}; - } -} - -multiclass NeonI_mov_imm_with_constraint_lsl_sizes -{ - let Constraints = "$src = $Rd" in { - // shift zeros, per word - def _2S : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (v2i32 VPR64:$src), - (v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bits<2> Simm; - let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; - } - - def _4S : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (v4i32 VPR128:$src), - (v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bits<2> Simm; - let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; - } - - // shift zeros, per halfword - def _4H : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), - [(set (v4i16 VPR64:$Rd), - (v4i16 (opnode (v4i16 VPR64:$src), - (v4i16 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b1}; - } - - def _8H : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), - [(set (v8i16 VPR128:$Rd), - (v8i16 (opnode (v8i16 VPR128:$src), - (v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b1}; - } - } -} - -multiclass NeonI_mov_imm_msl_sizes -{ - // shift ones, per word - def _2S : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_MSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (timm:$Imm), - (neon_mov_imm_MSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b1, 0b0, Simm}; - } - - def _4S : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_MSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (timm:$Imm), - (neon_mov_imm_MSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b1, 0b0, Simm}; - } -} - -// Vector Move Immediate Shifted -let isReMaterializable = 1 in { -defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>; -} - -// Vector Move 
Inverted Immediate Shifted -let isReMaterializable = 1 in { -defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>; -} - -// Vector Bitwise Bit Clear (AND NOT) - immediate -let isReMaterializable = 1 in { -defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1, - and, Neon_mvni>; -} - -// Vector Bitwise OR - immedidate - -let isReMaterializable = 1 in { -defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0, - or, Neon_movi>; -} - -// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate -// LowerBUILD_VECTOR favors lowering MOVI over MVNI. -// BIC immediate instructions selection requires additional patterns to -// transform Neon_movi operands into BIC immediate operands - -def neon_mov_imm_LSLH_transform_XFORM : SDNodeXFormgetZExtValue(); - unsigned ShiftImm; - unsigned ShiftOnesIn; - (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); - // LSLH restricts shift amount to 0, 8 which are encoded as 0 and 1 - // Transform encoded shift amount 0 to 1 and 1 to 0. - return CurDAG->getTargetConstant(!ShiftImm, MVT::i32); -}]>; - -def neon_mov_imm_LSLH_transform_operand - : ImmLeaf; - -// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0xff, LSL 8) -// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0xff) -def : Pat<(v4i16 (and VPR64:$src, - (v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm)))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0xff, LSL 8) -// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0xff) -def : Pat<(v8i16 (and VPR128:$src, - (v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm)))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -def : Pat<(v8i8 (and VPR64:$src, - (bitconvert(v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v2i32 (and VPR64:$src, - (bitconvert(v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v1i64 (and VPR64:$src, - (bitconvert(v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -def : Pat<(v16i8 (and VPR128:$src, - (bitconvert(v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v4i32 (and VPR128:$src, - (bitconvert(v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v2i64 (and VPR128:$src, - (bitconvert(v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -multiclass Neon_bitwiseVi_patterns { - def : Pat<(v8i8 (opnode VPR64:$src, - (bitconvert(v4i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4H VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v2i32 (opnode VPR64:$src, - (bitconvert(v4i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4H VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v1i64 (opnode VPR64:$src, - (bitconvert(v4i16 
(neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4H VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - - def : Pat<(v16i8 (opnode VPR128:$src, - (bitconvert(v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST8H VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v4i32 (opnode VPR128:$src, - (bitconvert(v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST8H VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v2i64 (opnode VPR128:$src, - (bitconvert(v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST8H VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - - def : Pat<(v8i8 (opnode VPR64:$src, - (bitconvert(v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST2S VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v4i16 (opnode VPR64:$src, - (bitconvert(v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST2S VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v1i64 (opnode VPR64:$src, - (bitconvert(v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST2S VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - - def : Pat<(v16i8 (opnode VPR128:$src, - (bitconvert(v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4S VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v8i16 (opnode VPR128:$src, - (bitconvert(v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4S VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v2i64 (opnode VPR128:$src, - (bitconvert(v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4S VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; -} - -// Additional patterns for Vector Vector Bitwise Bit Clear (AND NOT) - immediate -defm : Neon_bitwiseVi_patterns; - -// Additional patterns for Vector Bitwise OR - immedidate -defm : Neon_bitwiseVi_patterns; - - -// Vector Move Immediate Masked -let isReMaterializable = 1 in { -defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>; -} - -// Vector Move Inverted Immediate Masked -let isReMaterializable = 1 in { -defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>; -} - -class NeonI_mov_imm_lsl_aliases - : NeonInstAlias; - -// Aliases for Vector Move Immediate Shifted -def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>; - -// Aliases for Vector Move Inverted Immediate Shifted -def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>; - -// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate -def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>; - -// Aliases for Vector Bitwise OR - 
immedidate -def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>; - -// Vector Move Immediate - per byte -let isReMaterializable = 1 in { -def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0, - (outs VPR64:$Rd), (ins neon_uimm8:$Imm), - "movi\t$Rd.8b, $Imm", - [(set (v8i8 VPR64:$Rd), - (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} - -def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0, - (outs VPR128:$Rd), (ins neon_uimm8:$Imm), - "movi\t$Rd.16b, $Imm", - [(set (v16i8 VPR128:$Rd), - (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} -} - -// Vector Move Immediate - bytemask, per double word -let isReMaterializable = 1 in { -def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1, - (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm), - "movi\t $Rd.2d, $Imm", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} -} - -// Vector Move Immediate - bytemask, one doubleword - -let isReMaterializable = 1 in { -def MOVIdi : NeonI_1VModImm<0b0, 0b1, - (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm), - "movi\t $Rd, $Imm", - [(set (v1i64 FPR64:$Rd), - (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} -} - -// Vector Floating Point Move Immediate - -class NeonI_FMOV_impl - : NeonI_1VModImm, - Sched<[WriteFPALU]> { - let cmode = 0b1111; - } - -let isReMaterializable = 1 in { -def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>; -def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>; -def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>; -} - -// Vector Shift (Immediate) - -// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded -// as follows: -// -// Offset Encoding -// 8 immh:immb<6:3> = '0001xxx', is encoded in immh:immb<2:0> -// 16 immh:immb<6:4> = '001xxxx', is encoded in immh:immb<3:0> -// 32 immh:immb<6:5> = '01xxxxx', is encoded in immh:immb<4:0> -// 64 immh:immb<6> = '1xxxxxx', is encoded in immh:immb<5:0> -// -// The shift right immediate amount, in the range 1 to element bits, is computed -// as Offset - UInt(immh:immb). The shift left immediate amount, in the range 0 -// to element bits - 1, is computed as UInt(immh:immb) - Offset. 
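Concretely: for an element size ES in {8, 16, 32, 64}, a right shift of r ends up carried as immh:immb = 2*ES - r, and a left shift of l as immh:immb = ES + l, which is exactly what makes the leading immh bits fall into the '0001xxx'/'001xxxx'/'01xxxxx'/'1xxxxxx' buckets above. A small standalone C++ sketch of that arithmetic (illustrative only; these helpers are invented for the example and are not the getShiftRightImm*/getShiftLeftImm* encoder methods referenced below):

#include <cassert>
#include <cstdio>

// Encode a right-shift amount (1..EltBits) as the full 7-bit immh:immb value.
static unsigned encodeShrImm(unsigned EltBits, unsigned Shift) {
  assert(Shift >= 1 && Shift <= EltBits && "shift right out of range");
  return 2 * EltBits - Shift;          // e.g. EltBits=8, #3 -> 0b0001101
}

// Encode a left-shift amount (0..EltBits-1) as the full 7-bit immh:immb value.
static unsigned encodeShlImm(unsigned EltBits, unsigned Shift) {
  assert(Shift < EltBits && "shift left out of range");
  return EltBits + Shift;              // e.g. EltBits=8, #3 -> 0b0001011
}

// The matching decoders, i.e. the offset arithmetic described above.
static unsigned decodeShrImm(unsigned EltBits, unsigned ImmHB) {
  return 2 * EltBits - ImmHB;
}
static unsigned decodeShlImm(unsigned EltBits, unsigned ImmHB) {
  return ImmHB - EltBits;
}

int main() {
  printf("sshr #3, 8-bit elements:  immh:immb = 0x%x\n", encodeShrImm(8, 3));   // 0xd
  printf("shl  #5, 16-bit elements: immh:immb = 0x%x\n", encodeShlImm(16, 5));  // 0x15
  assert(decodeShrImm(8, encodeShrImm(8, 3)) == 3);
  assert(decodeShlImm(16, encodeShlImm(16, 5)) == 5);
  return 0;
}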
- -class shr_imm_asmoperands : AsmOperandClass { - let Name = "ShrImm" # OFFSET; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "ShrImm" # OFFSET; -} - -class shr_imm : Operand { - let EncoderMethod = "getShiftRightImm" # OFFSET; - let DecoderMethod = "DecodeShiftRightImm" # OFFSET; - let ParserMatchClass = - !cast("shr_imm" # OFFSET # "_asmoperand"); -} - -def shr_imm8_asmoperand : shr_imm_asmoperands<"8">; -def shr_imm16_asmoperand : shr_imm_asmoperands<"16">; -def shr_imm32_asmoperand : shr_imm_asmoperands<"32">; -def shr_imm64_asmoperand : shr_imm_asmoperands<"64">; - -def shr_imm8 : shr_imm<"8">, ImmLeaf 0 && Imm <= 8;}]>; -def shr_imm16 : shr_imm<"16">, ImmLeaf 0 && Imm <= 16;}]>; -def shr_imm32 : shr_imm<"32">, ImmLeaf 0 && Imm <= 32;}]>; -def shr_imm64 : shr_imm<"64">, ImmLeaf 0 && Imm <= 64;}]>; - -class shl_imm_asmoperands : AsmOperandClass { - let Name = "ShlImm" # OFFSET; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "ShlImm" # OFFSET; -} - -class shl_imm : Operand { - let EncoderMethod = "getShiftLeftImm" # OFFSET; - let DecoderMethod = "DecodeShiftLeftImm" # OFFSET; - let ParserMatchClass = - !cast("shl_imm" # OFFSET # "_asmoperand"); -} - -def shl_imm8_asmoperand : shl_imm_asmoperands<"8">; -def shl_imm16_asmoperand : shl_imm_asmoperands<"16">; -def shl_imm32_asmoperand : shl_imm_asmoperands<"32">; -def shl_imm64_asmoperand : shl_imm_asmoperands<"64">; - -def shl_imm8 : shl_imm<"8">, ImmLeaf= 0 && Imm < 8;}]>; -def shl_imm16 : shl_imm<"16">, ImmLeaf= 0 && Imm < 16;}]>; -def shl_imm32 : shl_imm<"32">, ImmLeaf= 0 && Imm < 32;}]>; -def shl_imm64 : shl_imm<"64">, ImmLeaf= 0 && Imm < 64;}]>; - -class N2VShift opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_N2VShL opcode, string asmop> { - // 64-bit vector types. - def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - // 128-bit vector types. 
- def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> { - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - } -} - -multiclass NeonI_N2VShR opcode, string asmop, SDNode OpNode> { - def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// Shift left - -defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">; - -// Additional patterns to match vector shift left by immediate. -// (v1i8/v1i16/v1i32 types) -def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), - (v1i8 (Neon_vdup (i32 (shl_imm8:$Imm)))))), - (EXTRACT_SUBREG - (SHLvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - shl_imm8:$Imm), - sub_8)>; -def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), - (v1i16 (Neon_vdup (i32 (shl_imm16:$Imm)))))), - (EXTRACT_SUBREG - (SHLvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - shl_imm16:$Imm), - sub_16)>; -def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), - (v1i32 (Neon_vdup (i32 (shl_imm32:$Imm)))))), - (EXTRACT_SUBREG - (SHLvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - shl_imm32:$Imm), - sub_32)>; - -// Shift right -defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>; -defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>; - -// Additional patterns to match vector shift right by immediate. 
-// (v1i8/v1i16/v1i32 types) -def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), - (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))), - (EXTRACT_SUBREG - (SSHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - shr_imm8:$Imm), - sub_8)>; -def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), - (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))), - (EXTRACT_SUBREG - (SSHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - shr_imm16:$Imm), - sub_16)>; -def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), - (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))), - (EXTRACT_SUBREG - (SSHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - shr_imm32:$Imm), - sub_32)>; -def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), - (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))), - (EXTRACT_SUBREG - (USHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - shr_imm8:$Imm), - sub_8)>; -def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), - (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))), - (EXTRACT_SUBREG - (USHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - shr_imm16:$Imm), - sub_16)>; -def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), - (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))), - (EXTRACT_SUBREG - (USHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - shr_imm32:$Imm), - sub_32)>; - -def Neon_High16B : PatFrag<(ops node:$in), - (extract_subvector (v16i8 node:$in), (iPTR 8))>; -def Neon_High8H : PatFrag<(ops node:$in), - (extract_subvector (v8i16 node:$in), (iPTR 4))>; -def Neon_High4S : PatFrag<(ops node:$in), - (extract_subvector (v4i32 node:$in), (iPTR 2))>; -def Neon_High2D : PatFrag<(ops node:$in), - (extract_subvector (v2i64 node:$in), (iPTR 1))>; -def Neon_High4float : PatFrag<(ops node:$in), - (extract_subvector (v4f32 node:$in), (iPTR 2))>; -def Neon_High2double : PatFrag<(ops node:$in), - (extract_subvector (v2f64 node:$in), (iPTR 1))>; - -def Neon_Low16B : PatFrag<(ops node:$in), - (v8i8 (extract_subvector (v16i8 node:$in), - (iPTR 0)))>; -def Neon_Low8H : PatFrag<(ops node:$in), - (v4i16 (extract_subvector (v8i16 node:$in), - (iPTR 0)))>; -def Neon_Low4S : PatFrag<(ops node:$in), - (v2i32 (extract_subvector (v4i32 node:$in), - (iPTR 0)))>; -def Neon_Low2D : PatFrag<(ops node:$in), - (v1i64 (extract_subvector (v2i64 node:$in), - (iPTR 0)))>; -def Neon_Low4float : PatFrag<(ops node:$in), - (v2f32 (extract_subvector (v4f32 node:$in), - (iPTR 0)))>; -def Neon_Low2double : PatFrag<(ops node:$in), - (v1f64 (extract_subvector (v2f64 node:$in), - (iPTR 0)))>; - -class N2VShiftLong opcode, string asmop, string DestT, - string SrcT, ValueType DestTy, ValueType SrcTy, - Operand ImmTy, SDPatternOperator ExtOp> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -class N2VShiftLongHigh opcode, string asmop, string DestT, - string SrcT, ValueType DestTy, ValueType SrcTy, - int StartIndex, Operand ImmTy, - SDPatternOperator ExtOp, PatFrag getTop> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_N2VShLL opcode, string asmop, - SDNode ExtOp> { - // 64-bit vector types. 
- def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8, - shl_imm8, ExtOp> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16, - shl_imm16, ExtOp> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32, - shl_imm32, ExtOp> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - // 128-bit vector types - def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8, - 8, shl_imm8, ExtOp, Neon_High16B> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16, - 4, shl_imm16, ExtOp, Neon_High8H> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32, - 2, shl_imm32, ExtOp, Neon_High4S> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - // Use other patterns to match when the immediate is 0. - def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))), - (!cast(prefix # "_8B") VPR64:$Rn, 0)>; - - def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))), - (!cast(prefix # "_4H") VPR64:$Rn, 0)>; - - def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))), - (!cast(prefix # "_2S") VPR64:$Rn, 0)>; - - def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))), - (!cast(prefix # "_16B") VPR128:$Rn, 0)>; - - def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))), - (!cast(prefix # "_8H") VPR128:$Rn, 0)>; - - def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))), - (!cast(prefix # "_4S") VPR128:$Rn, 0)>; -} - -// Shift left long -defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>; -defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>; - -class NeonI_ext_len_alias - : NeonInstAlias; - -// Signed integer lengthen (vector) is alias for SSHLL Vd, Vn, #0 -// Signed integer lengthen (vector, second part) is alias for SSHLL2 Vd, Vn, #0 -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. -def SXTLvv_8B : NeonI_ext_len_alias<"sxtl", ".8h", ".8b", SSHLLvvi_8B, VPR128, VPR64>; -def SXTLvv_4H : NeonI_ext_len_alias<"sxtl", ".4s", ".4h", SSHLLvvi_4H, VPR128, VPR64>; -def SXTLvv_2S : NeonI_ext_len_alias<"sxtl", ".2d", ".2s", SSHLLvvi_2S, VPR128, VPR64>; -def SXTL2vv_16B : NeonI_ext_len_alias<"sxtl2", ".8h", ".16b", SSHLLvvi_16B, VPR128, VPR128>; -def SXTL2vv_8H : NeonI_ext_len_alias<"sxtl2", ".4s", ".8h", SSHLLvvi_8H, VPR128, VPR128>; -def SXTL2vv_4S : NeonI_ext_len_alias<"sxtl2", ".2d", ".4s", SSHLLvvi_4S, VPR128, VPR128>; - -// Unsigned integer lengthen (vector) is alias for USHLL Vd, Vn, #0 -// Unsigned integer lengthen (vector, second part) is alias for USHLL2 Vd, Vn, #0 -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. 
-def UXTLvv_8B : NeonI_ext_len_alias<"uxtl", ".8h", ".8b", USHLLvvi_8B, VPR128, VPR64>; -def UXTLvv_4H : NeonI_ext_len_alias<"uxtl", ".4s", ".4h", USHLLvvi_4H, VPR128, VPR64>; -def UXTLvv_2S : NeonI_ext_len_alias<"uxtl", ".2d", ".2s", USHLLvvi_2S, VPR128, VPR64>; -def UXTL2vv_16B : NeonI_ext_len_alias<"uxtl2", ".8h", ".16b", USHLLvvi_16B, VPR128, VPR128>; -def UXTL2vv_8H : NeonI_ext_len_alias<"uxtl2", ".4s", ".8h", USHLLvvi_8H, VPR128, VPR128>; -def UXTL2vv_4S : NeonI_ext_len_alias<"uxtl2", ".2d", ".4s", USHLLvvi_4S, VPR128, VPR128>; - -def : Pat<(v8i16 (anyext (v8i8 VPR64:$Rn))), (USHLLvvi_8B VPR64:$Rn, 0)>; -def : Pat<(v4i32 (anyext (v4i16 VPR64:$Rn))), (USHLLvvi_4H VPR64:$Rn, 0)>; -def : Pat<(v2i64 (anyext (v2i32 VPR64:$Rn))), (USHLLvvi_2S VPR64:$Rn, 0)>; - -// Rounding/Saturating shift -class N2VShift_RQ opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDPatternOperator OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -// shift right (vector by immediate) -multiclass NeonI_N2VShR_RQ opcode, string asmop, - SDPatternOperator OpNode> { - def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -multiclass NeonI_N2VShL_Q opcode, string asmop, - SDPatternOperator OpNode> { - // 64-bit vector types. - def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - // 128-bit vector types. 
- def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// Rounding shift right -defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr", - int_aarch64_neon_vsrshr>; -defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr", - int_aarch64_neon_vurshr>; - -// Saturating shift left unsigned -defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>; - -// Saturating shift left -defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>; -defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>; - -class N2VShiftAdd opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDNode OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -// Shift Right accumulate -multiclass NeonI_N2VShRAdd opcode, string asmop, SDNode OpNode> { - def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// Shift right and accumulate -defm SSRAvvi : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>; -defm USRAvvi : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>; - -// Rounding shift accumulate -class N2VShiftAdd_R opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDPatternOperator OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_N2VShRAdd_R opcode, string asmop, - SDPatternOperator OpNode> { - def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// 
Rounding shift right and accumulate -defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>; -defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>; - -// Shift insert by immediate -class N2VShiftIns opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDPatternOperator OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -// shift left insert (vector by immediate) -multiclass NeonI_N2VShLIns opcode, string asmop> { - def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, - int_aarch64_neon_vsli> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, - int_aarch64_neon_vsli> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, - int_aarch64_neon_vsli> { - let Inst{22-21} = 0b01; - } - - // 128-bit vector types - def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, - int_aarch64_neon_vsli> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, - int_aarch64_neon_vsli> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, - int_aarch64_neon_vsli> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, - int_aarch64_neon_vsli> { - let Inst{22} = 0b1; - } -} - -// shift right insert (vector by immediate) -multiclass NeonI_N2VShRIns opcode, string asmop> { - // 64-bit vector types. - def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - int_aarch64_neon_vsri> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - int_aarch64_neon_vsri> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - int_aarch64_neon_vsri> { - let Inst{22-21} = 0b01; - } - - // 128-bit vector types - def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - int_aarch64_neon_vsri> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - int_aarch64_neon_vsri> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - int_aarch64_neon_vsri> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - int_aarch64_neon_vsri> { - let Inst{22} = 0b1; - } -} - -// Shift left and insert -defm SLIvvi : NeonI_N2VShLIns<0b1, 0b01010, "sli">; - -// Shift right and insert -defm SRIvvi : NeonI_N2VShRIns<0b1, 0b01000, "sri">; - -class N2VShR_Narrow opcode, string asmop, string DestT, - string SrcT, Operand ImmTy> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -class N2VShR_Narrow_Hi opcode, string asmop, string DestT, - string SrcT, Operand ImmTy> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -// left long shift by immediate -multiclass NeonI_N2VShR_Narrow opcode, string asmop> { - def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", 
shr_imm32> { - let Inst{22-21} = 0b01; - } - - // Shift Narrow High - def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h", - shr_imm8> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s", - shr_imm16> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d", - shr_imm32> { - let Inst{22-21} = 0b01; - } -} - -// Shift right narrow -defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">; - -// Shift right narrow (prefix Q is saturating, prefix R is rounding) -defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">; -defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">; -defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">; -defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">; -defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">; -defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">; -defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">; - -def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn), - (v2i64 (concat_vectors (v1i64 node:$Rm), - (v1i64 node:$Rn)))>; -def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn), - (v8i16 (concat_vectors (v4i16 node:$Rm), - (v4i16 node:$Rn)))>; -def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn), - (v4i32 (concat_vectors (v2i32 node:$Rm), - (v2i32 node:$Rn)))>; -def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn), - (v4f32 (concat_vectors (v2f32 node:$Rm), - (v2f32 node:$Rn)))>; -def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn), - (v2f64 (concat_vectors (v1f64 node:$Rm), - (v1f64 node:$Rn)))>; - -def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), - (v8i16 (srl (v8i16 node:$lhs), - (v8i16 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs), - (v4i32 (srl (v4i32 node:$lhs), - (v4i32 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs), - (v2i64 (srl (v2i64 node:$lhs), - (v2i64 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs), - (v8i16 (sra (v8i16 node:$lhs), - (v8i16 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs), - (v4i32 (sra (v4i32 node:$lhs), - (v4i32 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs), - (v2i64 (sra (v2i64 node:$lhs), - (v2i64 (Neon_vdup (i32 node:$rhs)))))>; - -// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors) -multiclass Neon_shiftNarrow_patterns { - def : Pat<(v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") VPR128:$Rn, - (i32 shr_imm8:$Imm)))), - (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>; - def : Pat<(v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") VPR128:$Rn, - (i32 shr_imm16:$Imm)))), - (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>; - def : Pat<(v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") VPR128:$Rn, - (i32 shr_imm32:$Imm)))), - (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; - - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert - (v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") - VPR128:$Rn, (i32 shr_imm8:$Imm))))))), - (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert - (v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") - VPR128:$Rn, (i32 shr_imm16:$Imm))))))), - (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert - (v2i32 (trunc 
(!cast("Neon_" # shr # "Imm2D") - VPR128:$Rn, (i32 shr_imm32:$Imm))))))), - (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, imm:$Imm)>; -} - -multiclass Neon_shiftNarrow_QR_patterns { - def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)), - (!cast(prefix # "_8B") VPR128:$Rn, imm:$Imm)>; - def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)), - (!cast(prefix # "_4H") VPR128:$Rn, imm:$Imm)>; - def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)), - (!cast(prefix # "_2S") VPR128:$Rn, imm:$Imm)>; - - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (v8i8 - (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))), - (!cast(prefix # "_16B") - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (v4i16 - (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))), - (!cast(prefix # "_8H") - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (v2i32 - (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))), - (!cast(prefix # "_4S") - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, imm:$Imm)>; -} - -defm : Neon_shiftNarrow_patterns<"lshr">; -defm : Neon_shiftNarrow_patterns<"ashr">; - -defm : Neon_shiftNarrow_QR_patterns; -defm : Neon_shiftNarrow_QR_patterns; -defm : Neon_shiftNarrow_QR_patterns; -defm : Neon_shiftNarrow_QR_patterns; -defm : Neon_shiftNarrow_QR_patterns; -defm : Neon_shiftNarrow_QR_patterns; -defm : Neon_shiftNarrow_QR_patterns; - -// Convert fix-point and float-pointing -class N2VCvt_Fx opcode, string asmop, string T, - RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy, - Operand ImmTy, SDPatternOperator IntOp> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_N2VCvt_Fx2fp opcode, string asmop, - SDPatternOperator IntOp> { - def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32, - shr_imm32, IntOp> { - let Inst{22-21} = 0b01; - } - - def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32, - shr_imm32, IntOp> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64, - shr_imm64, IntOp> { - let Inst{22} = 0b1; - } -} - -multiclass NeonI_N2VCvt_Fp2fx opcode, string asmop, - SDPatternOperator IntOp> { - def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32, - shr_imm32, IntOp> { - let Inst{22-21} = 0b01; - } - - def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32, - shr_imm32, IntOp> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64, - shr_imm64, IntOp> { - let Inst{22} = 0b1; - } -} - -// Convert fixed-point to floating-point -defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf", - int_arm_neon_vcvtfxs2fp>; -defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf", - int_arm_neon_vcvtfxu2fp>; - -// Convert floating-point to fixed-point -defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs", - int_arm_neon_vcvtfp2fxs>; -defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu", - int_arm_neon_vcvtfp2fxu>; - -multiclass Neon_sshll2_0 -{ - def _v8i8 : PatFrag<(ops node:$Rn), - (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>; - def _v4i16 : PatFrag<(ops node:$Rn), - (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>; - def _v2i32 : PatFrag<(ops node:$Rn), - (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>; -} - -defm NI_sext_high : Neon_sshll2_0; -defm NI_zext_high : Neon_sshll2_0; - - 
-//===----------------------------------------------------------------------===// -// Multiclasses for NeonI_Across -//===----------------------------------------------------------------------===// - -// Variant 1 - -multiclass NeonI_2VAcross_1 opcode, - string asmop, SDPatternOperator opnode> -{ - def _1h8b: NeonI_2VAcross<0b0, u, 0b00, opcode, - (outs FPR16:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.8b", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode, - (outs FPR16:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.16b", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode, - (outs FPR32:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.4h", - [(set (v1i32 FPR32:$Rd), - (v1i32 (opnode (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode, - (outs FPR32:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.8h", - [(set (v1i32 FPR32:$Rd), - (v1i32 (opnode (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - // _1d2s doesn't exist! - - def _1d4s: NeonI_2VAcross<0b1, u, 0b10, opcode, - (outs FPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.4s", - [(set (v1i64 FPR64:$Rd), - (v1i64 (opnode (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>; -defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>; - -// Variant 2 - -multiclass NeonI_2VAcross_2 opcode, - string asmop, SDPatternOperator opnode> -{ - def _1b8b: NeonI_2VAcross<0b0, u, 0b00, opcode, - (outs FPR8:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.8b", - [(set (v1i8 FPR8:$Rd), - (v1i8 (opnode (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode, - (outs FPR8:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.16b", - [(set (v1i8 FPR8:$Rd), - (v1i8 (opnode (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode, - (outs FPR16:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.4h", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode, - (outs FPR16:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.8h", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - // _1s2s doesn't exist! 
- - def _1s4s: NeonI_2VAcross<0b1, u, 0b10, opcode, - (outs FPR32:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.4s", - [(set (v1i32 FPR32:$Rd), - (v1i32 (opnode (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>; -defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>; - -defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>; -defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>; - -defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>; - -// Variant 3 - -multiclass NeonI_2VAcross_3 opcode, bits<2> size, - string asmop, SDPatternOperator opnode> { - def _1s4s: NeonI_2VAcross<0b1, u, size, opcode, - (outs FPR32:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.4s", - [(set (f32 FPR32:$Rd), - (f32 (opnode (v4f32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv", - int_aarch64_neon_vmaxnmv>; -defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv", - int_aarch64_neon_vminnmv>; - -defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv", - int_aarch64_neon_vmaxv>; -defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv", - int_aarch64_neon_vminv>; - -// The followings are for instruction class (Perm) - -class NeonI_Permute size, bits<3> opcode, - string asmop, RegisterOperand OpVPR, string OpS, - SDPatternOperator opnode, ValueType Ty> - : NeonI_Perm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_Perm_pat opcode, string asmop, - SDPatternOperator opnode> { - def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, - VPR64, "8b", opnode, v8i8>; - def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, - VPR128, "16b",opnode, v16i8>; - def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, - VPR64, "4h", opnode, v4i16>; - def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, - VPR128, "8h", opnode, v8i16>; - def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, - VPR64, "2s", opnode, v2i32>; - def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, - VPR128, "4s", opnode, v4i32>; - def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, - VPR128, "2d", opnode, v2i64>; -} - -defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>; -defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>; -defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>; -defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>; -defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>; -defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>; - -multiclass NeonI_Perm_float_pat { - def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), - (!cast(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>; - - def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), - (!cast(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>; - - def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), - (!cast(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>; -} - -defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>; -defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>; -defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>; -defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>; -defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>; -defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>; - -// The followings are for instruction class (3V Diff) - -// normal long/long2 pattern -class NeonI_3VDL size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator ext, - 
RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDL_s opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, sext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, sext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, sext, VPR64, v2i64, v2i32>; - } -} - -multiclass NeonI_3VDL2_s opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; - } -} - -multiclass NeonI_3VDL_u opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, zext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, zext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, zext, VPR64, v2i64, v2i32>; - } -} - -multiclass NeonI_3VDL2_u opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; - } -} - -defm SADDLvvv : NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>; -defm UADDLvvv : NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>; - -defm SADDL2vvv : NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>; -defm UADDL2vvv : NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>; - -defm SSUBLvvv : NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>; -defm USUBLvvv : NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>; - -defm SSUBL2vvv : NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>; -defm USUBL2vvv : NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>; - -// normal wide/wide2 pattern -class NeonI_3VDW size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator ext, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDW_s opcode, string asmop, - SDPatternOperator opnode> { - def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, sext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, sext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, sext, VPR64, v2i64, v2i32>; -} - -defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>; -defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>; - -multiclass NeonI_3VDW2_s opcode, string asmop, - SDPatternOperator opnode> { - def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : 
NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; -} - -defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>; -defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>; - -multiclass NeonI_3VDW_u opcode, string asmop, - SDPatternOperator opnode> { - def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, zext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, zext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, zext, VPR64, v2i64, v2i32>; -} - -defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>; -defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>; - -multiclass NeonI_3VDW2_u opcode, string asmop, - SDPatternOperator opnode> { - def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; -} - -defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>; -defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>; - -// Get the high half part of the vector element. -multiclass NeonI_get_high { - def _8h : PatFrag<(ops node:$Rn), - (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn), - (v8i16 (Neon_vdup (i32 8)))))))>; - def _4s : PatFrag<(ops node:$Rn), - (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn), - (v4i32 (Neon_vdup (i32 16)))))))>; - def _2d : PatFrag<(ops node:$Rn), - (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn), - (v2i64 (Neon_vdup (i32 32)))))))>; -} - -defm NI_get_hi : NeonI_get_high; - -// pattern for addhn/subhn with 2 operands -class NeonI_3VDN_addhn_2Op size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator get_hi, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDN_addhn_2Op opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", - opnode, NI_get_hi_8h, v8i8, v8i16>; - def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", - opnode, NI_get_hi_4s, v4i16, v4i32>; - def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d", - opnode, NI_get_hi_2d, v2i32, v2i64>; - } -} - -defm ADDHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>; -defm SUBHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>; - -// pattern for operation with 2 operands -class NeonI_3VD_2Op size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - RegisterOperand ResVPR, RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -// normal narrow pattern -multiclass NeonI_3VDN_2Op opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", - opnode, VPR64, VPR128, v8i8, v8i16>; - def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", - opnode, VPR64, VPR128, v4i16, v4i32>; - def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, 
opcode, asmop, "2s", "2d", - opnode, VPR64, VPR128, v2i32, v2i64>; - } -} - -defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>; -defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>; - -// pattern for acle intrinsic with 3 operands -class NeonI_3VDN_3Op size, bits<4> opcode, - string asmop, string ResS, string OpS> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let neverHasSideEffects = 1; -} - -multiclass NeonI_3VDN_3Op_v1 opcode, string asmop> { - def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">; - def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">; - def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">; -} - -defm ADDHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">; -defm SUBHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">; - -defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">; -defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">; - -// Patterns have to be separate because there's a SUBREG_TO_REG in the output -// part. -class NarrowHighHalfPat - : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn), - (SrcTy VPR128:$Rm)))))), - (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, VPR128:$Rm)>; - -// addhn2 patterns -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; - -// subhn2 patterns -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; - -// raddhn2 patterns -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; - -// rsubhn2 patterns -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; - -// pattern that need to extend result -class NeonI_3VDL_Ext size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy, ValueType OpSTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDL_zext opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, VPR64, v8i16, v8i8, v8i8>; - def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, VPR64, v4i32, v4i16, v4i16>; - def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, VPR64, v2i64, v2i32, v2i32>; - } -} - -defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>; -defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>; - -multiclass NeonI_Op_High { - def _16B : PatFrag<(ops node:$Rn, node:$Rm), - (op (v8i8 (Neon_High16B node:$Rn)), - (v8i8 (Neon_High16B node:$Rm)))>; - def _8H : PatFrag<(ops node:$Rn, node:$Rm), - (op (v4i16 (Neon_High8H node:$Rn)), - (v4i16 (Neon_High8H node:$Rm)))>; - def _4S : PatFrag<(ops node:$Rn, node:$Rm), - (op (v2i32 (Neon_High4S node:$Rn)), - (v2i32 (Neon_High4S node:$Rm)))>; -} - -defm NI_sabdl_hi : NeonI_Op_High; -defm NI_uabdl_hi : NeonI_Op_High; -defm NI_smull_hi : NeonI_Op_High; -defm NI_umull_hi : NeonI_Op_High; -defm NI_qdmull_hi : NeonI_Op_High; -defm NI_pmull_hi : NeonI_Op_High; - -multiclass NeonI_3VDL_Abd_u opcode, string asmop, string opnode, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b", - !cast(opnode # "_16B"), - VPR128, v8i16, v16i8, 
v8i8>; - def _4s4h : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h", - !cast(opnode # "_8H"), - VPR128, v4i32, v8i16, v4i16>; - def _2d2s : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s", - !cast(opnode # "_4S"), - VPR128, v2i64, v4i32, v2i32>; - } -} - -defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>; -defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>; - -// For pattern that need two operators being chained. -class NeonI_3VDL_Aba size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator subop, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy, ValueType OpSTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_3VDL_Aba_v1 opcode, string asmop, - SDPatternOperator opnode, SDPatternOperator subop>{ - def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, subop, VPR64, v8i16, v8i8, v8i8>; - def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, subop, VPR64, v4i32, v4i16, v4i16>; - def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, subop, VPR64, v2i64, v2i32, v2i32>; -} - -defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal", - add, int_arm_neon_vabds>; -defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal", - add, int_arm_neon_vabdu>; - -multiclass NeonI_3VDL2_Aba_v1 opcode, string asmop, - SDPatternOperator opnode, string subop> { - def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, !cast(subop # "_16B"), - VPR128, v8i16, v16i8, v8i8>; - def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, !cast(subop # "_8H"), - VPR128, v4i32, v8i16, v4i16>; - def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, !cast(subop # "_4S"), - VPR128, v2i64, v4i32, v2i32>; -} - -defm SABAL2vvv : NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add, - "NI_sabdl_hi">; -defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add, - "NI_uabdl_hi">; - -// Long pattern with 2 operands -multiclass NeonI_3VDL_2Op opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable, - SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { - def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, VPR128, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, VPR128, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, VPR128, VPR64, v2i64, v2i32>; - } -} - -defm SMULLvvv : NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>; -defm UMULLvvv : NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>; - -class NeonI_3VDL2_2Op_mull size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>; - -multiclass NeonI_3VDL2_2Op_mull_v1 opcode, string asmop, - string opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", - !cast(opnode # "_16B"), - v8i16, v16i8>; - def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", - !cast(opnode # "_8H"), - v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", - !cast(opnode # "_4S"), - v2i64, v4i32>; - } -} - -defm 
SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2", - "NI_smull_hi", 1>; -defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2", - "NI_umull_hi", 1>; - -// Long pattern with 3 operands -class NeonI_3VDL_3Op size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_3VDL_3Op_v1 opcode, string asmop, - SDPatternOperator opnode> { - def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, v8i16, v8i8>; - def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, v4i32, v4i16>; - def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, v2i64, v2i32>; -} - -def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (add node:$Rd, - (int_arm_neon_vmulls node:$Rn, node:$Rm))>; - -def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (add node:$Rd, - (int_arm_neon_vmullu node:$Rn, node:$Rm))>; - -def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (sub node:$Rd, - (int_arm_neon_vmulls node:$Rn, node:$Rm))>; - -def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (sub node:$Rd, - (int_arm_neon_vmullu node:$Rn, node:$Rm))>; - -defm SMLALvvv : NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>; -defm UMLALvvv : NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>; - -defm SMLSLvvv : NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>; -defm UMLSLvvv : NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>; - -class NeonI_3VDL2_3Op_mlas size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator subop, SDPatternOperator opnode, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_3VDL2_3Op_mlas_v1 opcode, string asmop, - SDPatternOperator subop, string opnode> { - def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b", - subop, !cast(opnode # "_16B"), - VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", - subop, !cast(opnode # "_8H"), - VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", - subop, !cast(opnode # "_4S"), - VPR128, v2i64, v4i32>; -} - -defm SMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2", - add, "NI_smull_hi">; -defm UMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2", - add, "NI_umull_hi">; - -defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2", - sub, "NI_smull_hi">; -defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2", - sub, "NI_umull_hi">; - -multiclass NeonI_3VDL_qdmlal_3Op_v2 opcode, string asmop, - SDPatternOperator opnode> { - def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, int_arm_neon_vqdmull, - VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, int_arm_neon_vqdmull, - VPR64, v2i64, v2i32>; -} - -defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal", - int_arm_neon_vqadds>; -defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl", - int_arm_neon_vqsubs>; - -multiclass NeonI_3VDL_v2 opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", 
"4h", - opnode, VPR128, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, VPR128, VPR64, v2i64, v2i32>; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull", - int_arm_neon_vqdmull, 1>; -} - -multiclass NeonI_3VDL2_2Op_mull_v2 opcode, string asmop, - string opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", - !cast(opnode # "_8H"), - v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", - !cast(opnode # "_4S"), - v2i64, v4i32>; - } -} - -defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2", - "NI_qdmull_hi", 1>; - -multiclass NeonI_3VDL2_3Op_qdmlal_v2 opcode, string asmop, - SDPatternOperator opnode> { - def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_qdmull_hi_8H, - VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_qdmull_hi_4S, - VPR128, v2i64, v4i32>; -} - -defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2", - int_arm_neon_vqadds>; -defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2", - int_arm_neon_vqsubs>; - -multiclass NeonI_3VDL_v3 opcode, string asmop, - SDPatternOperator opnode_8h8b, - SDPatternOperator opnode_1q1d, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode_8h8b, VPR128, VPR64, v8i16, v8i8>; - - def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d", - opnode_1q1d, VPR128, VPR64, v16i8, v1i64>; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in -defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, - int_aarch64_neon_vmull_p64, 1>; - -multiclass NeonI_3VDL2_2Op_mull_v3 opcode, string asmop, - string opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", - !cast(opnode # "_16B"), - v8i16, v16i8>; - - def _1q2d : - NeonI_3VDiff<0b1, u, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d", - [(set (v16i8 VPR128:$Rd), - (v16i8 (int_aarch64_neon_vmull_p64 - (v1i64 (scalar_to_vector - (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))), - (v1i64 (scalar_to_vector - (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))], - NoItinerary>, - Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>; - } - - def : Pat<(v16i8 (int_aarch64_neon_vmull_p64 - (v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 1))), - (v1i64 (extract_subvector (v2i64 VPR128:$Rm), (i64 1))))), - (!cast(NAME # "_1q2d") VPR128:$Rn, VPR128:$Rm)>; -} - -defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi", - 1>; - -// End of implementation for instruction class (3V Diff) - -// The followings are vector load/store multiple N-element structure -// (class SIMD lselem). - -// ld1: load multiple 1-element structure to 1/2/3/4 registers. -// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4). -// The structure consists of a sequence of sets of N values. -// The first element of the structure is placed in the first lane -// of the first first vector, the second element in the first lane -// of the second vector, and so on. -// E.g. 
LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into -// the three 64-bit vectors list {BA, DC, FE}. -// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three -// 64-bit vectors list {DA, EB, FC}. -// Store instructions store multiple structure to N registers like load. - - -class NeonI_LDVList opcode, bits<2> size, - RegisterOperand VecList, string asmop> - : NeonI_LdStMult, - Sched<[WriteVecLd, ReadVecLd]> { - let mayLoad = 1; - let neverHasSideEffects = 1; -} - -multiclass LDVList_BHSD opcode, string List, string asmop> { - def _8B : NeonI_LDVList<0, opcode, 0b00, - !cast(List # "8B_operand"), asmop>; - - def _4H : NeonI_LDVList<0, opcode, 0b01, - !cast(List # "4H_operand"), asmop>; - - def _2S : NeonI_LDVList<0, opcode, 0b10, - !cast(List # "2S_operand"), asmop>; - - def _16B : NeonI_LDVList<1, opcode, 0b00, - !cast(List # "16B_operand"), asmop>; - - def _8H : NeonI_LDVList<1, opcode, 0b01, - !cast(List # "8H_operand"), asmop>; - - def _4S : NeonI_LDVList<1, opcode, 0b10, - !cast(List # "4S_operand"), asmop>; - - def _2D : NeonI_LDVList<1, opcode, 0b11, - !cast(List # "2D_operand"), asmop>; -} - -// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4) -defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; -def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; - -defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; - -defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; - -defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; - -// Load multiple 1-element structure to N consecutive registers (N = 2,3,4) -defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; -def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; - -defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">; -def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; - -defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">; -def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; - -class NeonI_STVList opcode, bits<2> size, - RegisterOperand VecList, string asmop> - : NeonI_LdStMult, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let mayStore = 1; - let neverHasSideEffects = 1; -} - -multiclass STVList_BHSD opcode, string List, string asmop> { - def _8B : NeonI_STVList<0, opcode, 0b00, - !cast(List # "8B_operand"), asmop>; - - def _4H : NeonI_STVList<0, opcode, 0b01, - !cast(List # "4H_operand"), asmop>; - - def _2S : NeonI_STVList<0, opcode, 0b10, - !cast(List # "2S_operand"), asmop>; - - def _16B : NeonI_STVList<1, opcode, 0b00, - !cast(List # "16B_operand"), asmop>; - - def _8H : NeonI_STVList<1, opcode, 0b01, - !cast(List # "8H_operand"), asmop>; - - def _4S : NeonI_STVList<1, opcode, 0b10, - !cast(List # "4S_operand"), asmop>; - - def _2D : NeonI_STVList<1, opcode, 0b11, - !cast(List # "2D_operand"), asmop>; -} - -// Store multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; -def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; - -defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; - -defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; - -defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; - -// Store multiple 1-element structures from N consecutive registers (N = 2,3,4) -defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; -def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; - -defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; -def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; - -defm ST1x4 : STVList_BHSD<0b0010, "VQuad", 
"st1">; -def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; - -def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; - -def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; - -def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; -def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; - -def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; - -def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; - -def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; -def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; - -def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; - -def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; - -def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), - (ST1_8H GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), - (ST1_16B GPR64xsp:$addr, VPR128:$value)>; - -def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; - -def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; - -def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), - (ST1_4H GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), - (ST1_8B GPR64xsp:$addr, VPR64:$value)>; - -// Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store. -// FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal, -// these patterns are not needed any more. 
-def : Pat<(v1i8 (load GPR64xsp:$addr)), (LSFP8_LDR $addr, 0)>; -def : Pat<(v1i16 (load GPR64xsp:$addr)), (LSFP16_LDR $addr, 0)>; -def : Pat<(v1i32 (load GPR64xsp:$addr)), (LSFP32_LDR $addr, 0)>; - -def : Pat<(store (v1i8 FPR8:$value), GPR64xsp:$addr), - (LSFP8_STR $value, $addr, 0)>; -def : Pat<(store (v1i16 FPR16:$value), GPR64xsp:$addr), - (LSFP16_STR $value, $addr, 0)>; -def : Pat<(store (v1i32 FPR32:$value), GPR64xsp:$addr), - (LSFP32_STR $value, $addr, 0)>; - - -// End of vector load/store multiple N-element structure(class SIMD lselem) - -// The followings are post-index vector load/store multiple N-element -// structure(class SIMD lselem-post) -def exact1_asmoperand : AsmOperandClass { - let Name = "Exact1"; - let PredicateMethod = "isExactImm<1>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact1 : Operand, ImmLeaf { - let ParserMatchClass = exact1_asmoperand; -} - -def exact2_asmoperand : AsmOperandClass { - let Name = "Exact2"; - let PredicateMethod = "isExactImm<2>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact2 : Operand, ImmLeaf { - let ParserMatchClass = exact2_asmoperand; -} - -def exact3_asmoperand : AsmOperandClass { - let Name = "Exact3"; - let PredicateMethod = "isExactImm<3>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact3 : Operand, ImmLeaf { - let ParserMatchClass = exact3_asmoperand; -} - -def exact4_asmoperand : AsmOperandClass { - let Name = "Exact4"; - let PredicateMethod = "isExactImm<4>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact4 : Operand, ImmLeaf { - let ParserMatchClass = exact4_asmoperand; -} - -def exact6_asmoperand : AsmOperandClass { - let Name = "Exact6"; - let PredicateMethod = "isExactImm<6>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact6 : Operand, ImmLeaf { - let ParserMatchClass = exact6_asmoperand; -} - -def exact8_asmoperand : AsmOperandClass { - let Name = "Exact8"; - let PredicateMethod = "isExactImm<8>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact8 : Operand, ImmLeaf { - let ParserMatchClass = exact8_asmoperand; -} - -def exact12_asmoperand : AsmOperandClass { - let Name = "Exact12"; - let PredicateMethod = "isExactImm<12>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact12 : Operand, ImmLeaf { - let ParserMatchClass = exact12_asmoperand; -} - -def exact16_asmoperand : AsmOperandClass { - let Name = "Exact16"; - let PredicateMethod = "isExactImm<16>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact16 : Operand, ImmLeaf { - let ParserMatchClass = exact16_asmoperand; -} - -def exact24_asmoperand : AsmOperandClass { - let Name = "Exact24"; - let PredicateMethod = "isExactImm<24>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact24 : Operand, ImmLeaf { - let ParserMatchClass = exact24_asmoperand; -} - -def exact32_asmoperand : AsmOperandClass { - let Name = "Exact32"; - let PredicateMethod = "isExactImm<32>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact32 : Operand, ImmLeaf { - let ParserMatchClass = exact32_asmoperand; -} - -def exact48_asmoperand : AsmOperandClass { - let Name = "Exact48"; - let PredicateMethod = "isExactImm<48>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact48 : Operand, ImmLeaf { - let ParserMatchClass = exact48_asmoperand; -} - -def exact64_asmoperand : AsmOperandClass { - let Name = "Exact64"; - let PredicateMethod = "isExactImm<64>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact64 : Operand, ImmLeaf { - let ParserMatchClass = exact64_asmoperand; -} - -multiclass NeonI_LDWB_VList 
opcode, bits<2> size, - RegisterOperand VecList, Operand ImmTy, - string asmop> { - let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1, - DecoderMethod = "DecodeVLDSTPostInstruction" in { - def _fixed : NeonI_LdStMult_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> { - let Rm = 0b11111; - } - - def _register : NeonI_LdStMult_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>; - } -} - -multiclass LDWB_VList_BHSD opcode, string List, Operand ImmTy, - Operand ImmTy2, string asmop> { - defm _8B : NeonI_LDWB_VList<0, opcode, 0b00, - !cast(List # "8B_operand"), - ImmTy, asmop>; - - defm _4H : NeonI_LDWB_VList<0, opcode, 0b01, - !cast(List # "4H_operand"), - ImmTy, asmop>; - - defm _2S : NeonI_LDWB_VList<0, opcode, 0b10, - !cast(List # "2S_operand"), - ImmTy, asmop>; - - defm _16B : NeonI_LDWB_VList<1, opcode, 0b00, - !cast(List # "16B_operand"), - ImmTy2, asmop>; - - defm _8H : NeonI_LDWB_VList<1, opcode, 0b01, - !cast(List # "8H_operand"), - ImmTy2, asmop>; - - defm _4S : NeonI_LDWB_VList<1, opcode, 0b10, - !cast(List # "4S_operand"), - ImmTy2, asmop>; - - defm _2D : NeonI_LDWB_VList<1, opcode, 0b11, - !cast(List # "2D_operand"), - ImmTy2, asmop>; -} - -// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; -defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "ld1">; - -defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; - -defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "ld3">; - -defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; - -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "ld1">; -defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "ld1">; - -defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "ld1">; -defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "ld1">; - -defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "ld1">; -defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "ld1">; - -multiclass NeonI_STWB_VList opcode, bits<2> size, - RegisterOperand VecList, Operand ImmTy, - string asmop> { - let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1, - DecoderMethod = "DecodeVLDSTPostInstruction" in { - def _fixed : NeonI_LdStMult_Post, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let Rm = 0b11111; - } - - def _register : NeonI_LdStMult_Post, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>; - } -} - -multiclass STWB_VList_BHSD opcode, string List, Operand ImmTy, - Operand ImmTy2, string asmop> { - defm _8B : NeonI_STWB_VList<0, opcode, 0b00, - !cast(List # "8B_operand"), ImmTy, asmop>; - - defm _4H : NeonI_STWB_VList<0, opcode, 0b01, - !cast(List # "4H_operand"), - ImmTy, asmop>; - - defm _2S : NeonI_STWB_VList<0, opcode, 0b10, - !cast(List # "2S_operand"), - ImmTy, asmop>; - - defm _16B : NeonI_STWB_VList<1, opcode, 0b00, - !cast(List # "16B_operand"), - ImmTy2, asmop>; - - defm _8H : NeonI_STWB_VList<1, opcode, 0b01, - !cast(List # "8H_operand"), - ImmTy2, asmop>; - - defm _4S : NeonI_STWB_VList<1, opcode, 0b10, - !cast(List # "4S_operand"), - ImmTy2, asmop>; - - defm _2D : NeonI_STWB_VList<1, opcode, 0b11, - !cast(List # 
"2D_operand"), - ImmTy2, asmop>; -} - -// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; -defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "st1">; - -defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; - -defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "st3">; - -defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; - -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "st1">; -defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "st1">; - -defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "st1">; -defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "st1">; - -defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "st1">; -defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "st1">; - -// End of post-index vector load/store multiple N-element structure -// (class SIMD lselem-post) - -// The followings are vector load/store single N-element structure -// (class SIMD lsone). -def neon_uimm0_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm0_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm1_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm1_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm2_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm2_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm3_bare : Operand, - ImmLeaf { - let ParserMatchClass = uimm3_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm4_bare : Operand, - ImmLeaf { - let ParserMatchClass = uimm4_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -class NeonI_LDN_Dup opcode, bits<2> size, - RegisterOperand VecList, string asmop> - : NeonI_LdOne_Dup, - Sched<[WriteVecLd, ReadVecLd]> { - let mayLoad = 1; - let neverHasSideEffects = 1; -} - -multiclass LDN_Dup_BHSD opcode, string List, string asmop> { - def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00, - !cast(List # "8B_operand"), asmop>; - - def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01, - !cast(List # "4H_operand"), asmop>; - - def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10, - !cast(List # "2S_operand"), asmop>; - - def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11, - !cast(List # "1D_operand"), asmop>; - - def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00, - !cast(List # "16B_operand"), asmop>; - - def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01, - !cast(List # "8H_operand"), asmop>; - - def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10, - !cast(List # "4S_operand"), asmop>; - - def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11, - !cast(List # "2D_operand"), asmop>; -} - -// Load single 1-element structure to all lanes of 1 register -defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; - -// Load single N-element structure to all lanes of N consecutive -// registers (N = 2,3,4) -defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; -defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; -defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; - - -class LD1R_pattern - : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))), - (VTy (INST GPR64xsp:$Rn))>; - -// Match all LD1R 
instructions -def : LD1R_pattern; - -def : LD1R_pattern; - -def : LD1R_pattern; - -def : LD1R_pattern; - -def : LD1R_pattern; -def : LD1R_pattern; - -def : LD1R_pattern; -def : LD1R_pattern; - -def : LD1R_pattern; -def : LD1R_pattern; - -class LD1R_pattern_v1 - : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))), - (VTy (INST GPR64xsp:$Rn))>; - -def : LD1R_pattern_v1; -def : LD1R_pattern_v1; - -multiclass VectorList_Bare_BHSD { - defm B : VectorList_operands; - defm H : VectorList_operands; - defm S : VectorList_operands; - defm D : VectorList_operands; -} - -// Special vector list operand of 128-bit vectors with bare layout. -// i.e. only show ".b", ".h", ".s", ".d" -defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>; -defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>; -defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>; -defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>; - -class NeonI_LDN_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane<1, r, op2_1, op0, - (outs VList:$Rt), - (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn]", - [], - NoItinerary>, - Sched<[WriteVecLd, ReadVecLd, ReadVecLd]> { - let mayLoad = 1; - let neverHasSideEffects = 1; - let hasExtraDefRegAllocReq = 1; - let Constraints = "$src = $Rt"; -} - -multiclass LDN_Lane_BHSD { - def _B : NeonI_LDN_Lane(List # "B_operand"), - neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H : NeonI_LDN_Lane(List # "H_operand"), - neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S : NeonI_LDN_Lane(List # "S_operand"), - neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D : NeonI_LDN_Lane(List # "D_operand"), - neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Load single 1-element structure to one lane of 1 register. 
-defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; - -// Load single N-element structure to one lane of N consecutive registers -// (N = 2,3,4) -defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; -defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; -defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; - -multiclass LD1LN_patterns { - def : Pat<(VTy (vector_insert (VTy VPR64:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), - (VTy (EXTRACT_SUBREG - (INST GPR64xsp:$Rn, - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - ImmOp:$lane), - sub_64))>; - - def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), - (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; -} - -// Match all LD1LN instructions -defm : LD1LN_patterns; - -defm : LD1LN_patterns; - -defm : LD1LN_patterns; -defm : LD1LN_patterns; - -defm : LD1LN_patterns; -defm : LD1LN_patterns; - -class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane<0, r, op2_1, op0, - (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn]", - [], - NoItinerary>, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let mayStore = 1; - let neverHasSideEffects = 1; - let hasExtraDefRegAllocReq = 1; -} - -multiclass STN_Lane_BHSD { - def _B : NeonI_STN_Lane(List # "B_operand"), - neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H : NeonI_STN_Lane(List # "H_operand"), - neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S : NeonI_STN_Lane(List # "S_operand"), - neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D : NeonI_STN_Lane(List # "D_operand"), - neon_uimm1_bare, asmop>{ - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Store single 1-element structure from one lane of 1 register. -defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; - -// Store single N-element structure from one lane of N consecutive registers -// (N = 2,3,4) -defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; -defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; -defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; - -multiclass ST1LN_patterns { - def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)), - GPR64xsp:$Rn), - (INST GPR64xsp:$Rn, - (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64), - ImmOp:$lane)>; - - def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)), - GPR64xsp:$Rn), - (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>; -} - -// Match all ST1LN instructions -defm : ST1LN_patterns; - -defm : ST1LN_patterns; - -defm : ST1LN_patterns; -defm : ST1LN_patterns; - -defm : ST1LN_patterns; -defm : ST1LN_patterns; - -// End of vector load/store single N-element structure (class SIMD lsone). 
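The single-lane load/store multiclasses above (LDN_Lane_BHSD, STN_Lane_BHSD, and their write-back variants below) scatter the lane index across Inst{30} and Inst{12-10}, with the split depending on the element size. A minimal standalone sketch of that packing, assuming only the `let Inst{...}` assignments shown in those multiclasses; the struct and function names here are illustrative, not LLVM APIs:

#include <cassert>
#include <cstdio>

// Packs a lane index the way the B/H/S/D variants above do:
//   B: Inst{30} = lane{3},  Inst{12-10} = lane{2-0}
//   H: Inst{30} = lane{2},  Inst{12-10} = {lane{1}, lane{0}, 0}
//   S: Inst{30} = lane{1},  Inst{12-10} = {lane{0}, 0, 0}
//   D: Inst{30} = lane{0},  Inst{12-10} = 0b001
struct LaneFields {
  unsigned Q;         // goes into Inst{30}
  unsigned Bits12_10; // goes into Inst{12-10}
};

static LaneFields packLane(unsigned ElemBytes, unsigned Lane) {
  switch (ElemBytes) {
  case 1: assert(Lane < 16); return {Lane >> 3, Lane & 0x7u};
  case 2: assert(Lane < 8);  return {Lane >> 2, (Lane & 0x3u) << 1};
  case 4: assert(Lane < 4);  return {Lane >> 1, (Lane & 0x1u) << 2};
  case 8: assert(Lane < 2);  return {Lane & 0x1u, 0x1u};
  }
  assert(false && "unexpected element size");
  return {0, 0};
}

int main() {
  LaneFields F = packLane(2, 5); // e.g. the ".h" form with lane index 5
  std::printf("Q=%u Inst{12-10}=0x%x\n", F.Q, F.Bits12_10);
  return 0;
}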
- - -// The following are post-index load/store single N-element instructions -// (class SIMD lsone-post) - -multiclass NeonI_LDN_WB_Dup opcode, bits<2> size, - RegisterOperand VecList, Operand ImmTy, - string asmop> { - let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn", - DecoderMethod = "DecodeVLDSTLanePostInstruction" in { - def _fixed : NeonI_LdOne_Dup_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> { - let Rm = 0b11111; - } - - def _register : NeonI_LdOne_Dup_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>; - } -} - -multiclass LDWB_Dup_BHSD opcode, string List, string asmop, - Operand uimm_b, Operand uimm_h, - Operand uimm_s, Operand uimm_d> { - defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00, - !cast(List # "8B_operand"), - uimm_b, asmop>; - - defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01, - !cast(List # "4H_operand"), - uimm_h, asmop>; - - defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10, - !cast(List # "2S_operand"), - uimm_s, asmop>; - - defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11, - !cast(List # "1D_operand"), - uimm_d, asmop>; - - defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00, - !cast(List # "16B_operand"), - uimm_b, asmop>; - - defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01, - !cast(List # "8H_operand"), - uimm_h, asmop>; - - defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10, - !cast(List # "4S_operand"), - uimm_s, asmop>; - - defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11, - !cast(List # "2D_operand"), - uimm_d, asmop>; -} - -// Post-index load single 1-element structure to all lanes of 1 register -defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, - uimm_exact2, uimm_exact4, uimm_exact8>; - -// Post-index load single N-element structure to all lanes of N consecutive -// registers (N = 2,3,4) -defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; - -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, - Constraints = "$Rn = $wb, $Rt = $src", - DecoderMethod = "DecodeVLDSTLanePostInstruction" in { - class LDN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, - (outs VList:$Rt, GPR64xsp:$wb), - (ins GPR64xsp:$Rn, ImmTy:$amt, - VList:$src, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $amt", - [], - NoItinerary>, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]> { - let Rm = 0b11111; - } - - class LDN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, - (outs VList:$Rt, GPR64xsp:$wb), - (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, - VList:$src, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $Rm", - [], - NoItinerary>, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd, ReadVecLd]>; -} - -multiclass LD_Lane_WB_BHSD { - def _B_fixed : LDN_WBFx_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _B_register : LDN_WBReg_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H_fixed : LDN_WBFx_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 
0b0}; - let Inst{30} = lane{2}; - } - - def _H_register : LDN_WBReg_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S_fixed : LDN_WBFx_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _S_register : LDN_WBReg_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D_fixed : LDN_WBFx_Lane(List # "D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } - - def _D_register : LDN_WBReg_Lane(List # "D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Post-index load single 1-element structure to one lane of 1 register. -defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, - uimm_exact2, uimm_exact4, uimm_exact8>; - -// Post-index load single N-element structure to one lane of N consecutive -// registers -// (N = 2,3,4) -defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; - -let mayStore = 1, neverHasSideEffects = 1, - hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", - DecoderMethod = "DecodeVLDSTLanePostInstruction" in { - class STN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, - (outs GPR64xsp:$wb), - (ins GPR64xsp:$Rn, ImmTy:$amt, - VList:$Rt, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $amt", - [], - NoItinerary>, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let Rm = 0b11111; - } - - class STN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, - (outs GPR64xsp:$wb), - (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt, - ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $Rm", - [], - NoItinerary>, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>; -} - -multiclass ST_Lane_WB_BHSD { - def _B_fixed : STN_WBFx_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _B_register : STN_WBReg_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H_fixed : STN_WBFx_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _H_register : STN_WBReg_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S_fixed : STN_WBFx_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _S_register : STN_WBReg_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D_fixed : STN_WBFx_Lane(List # "D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } - - def _D_register : STN_WBReg_Lane(List # 
"D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Post-index store single 1-element structure from one lane of 1 register. -defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, - uimm_exact2, uimm_exact4, uimm_exact8>; - -// Post-index store single N-element structure from one lane of N consecutive -// registers (N = 2,3,4) -defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; - -// End of post-index load/store single N-element instructions -// (class SIMD lsone-post) - -// Neon Scalar instructions implementation -// Scalar Three Same - -class NeonI_Scalar3Same_size size, bits<5> opcode, string asmop, - RegisterClass FPRC> - : NeonI_Scalar3Same, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -class NeonI_Scalar3Same_D_size opcode, string asmop> - : NeonI_Scalar3Same_size; - -multiclass NeonI_Scalar3Same_HS_sizes opcode, string asmop, - bit Commutable = 0> { - let isCommutable = Commutable in { - def hhh : NeonI_Scalar3Same_size; - def sss : NeonI_Scalar3Same_size; - } -} - -multiclass NeonI_Scalar3Same_SD_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def sss : NeonI_Scalar3Same_size; - def ddd : NeonI_Scalar3Same_size; - } -} - -multiclass NeonI_Scalar3Same_BHSD_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def bbb : NeonI_Scalar3Same_size; - def hhh : NeonI_Scalar3Same_size; - def sss : NeonI_Scalar3Same_size; - def ddd : NeonI_Scalar3Same_size; - } -} - -multiclass Neon_Scalar3Same_D_size_patterns { - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; -} - -multiclass Neon_Scalar3Same_BHSD_size_patterns - : Neon_Scalar3Same_D_size_patterns { - def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (INSTB FPR8:$Rn, FPR8:$Rm)>; - def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass Neon_Scalar3Same_HS_size_patterns { - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass Neon_Scalar3Same_SD_size_patterns { - def : Pat<(SResTy (opnode (STy FPR32:$Rn), (STy FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(DResTy (opnode (DTy FPR64:$Rn), (DTy FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; -} - -class Neon_Scalar3Same_cmp_V1_D_size_patterns - : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)), - (INSTD FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Three Different - -class NeonI_Scalar3Diff_size size, bits<4> opcode, string asmop, - RegisterClass FPRCD, RegisterClass FPRCS> - : NeonI_Scalar3Diff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar3Diff_HS_size opcode, string asmop> { - def shh : NeonI_Scalar3Diff_size; - def dss : NeonI_Scalar3Diff_size; -} - -multiclass NeonI_Scalar3Diff_ml_HS_size opcode, string asmop> { - let Constraints = "$Src = $Rd" in { - def shh : NeonI_Scalar3Diff, - Sched<[WriteFPALU, ReadFPALU, 
ReadFPALU, ReadFPALU]>; - def dss : NeonI_Scalar3Diff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>; - } -} - -multiclass Neon_Scalar3Diff_HS_size_patterns { - def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass Neon_Scalar3Diff_ml_HS_size_patterns { - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>; -} - -// Scalar Two Registers Miscellaneous - -class NeonI_Scalar2SameMisc_size size, bits<5> opcode, string asmop, - RegisterClass FPRCD, RegisterClass FPRCS> - : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar2SameMisc_SD_size opcode, - string asmop> { - def ss : NeonI_Scalar2SameMisc_size; - def dd : NeonI_Scalar2SameMisc_size; -} - -multiclass NeonI_Scalar2SameMisc_D_size opcode, string asmop> { - def dd : NeonI_Scalar2SameMisc_size; -} - -multiclass NeonI_Scalar2SameMisc_BHSD_size opcode, string asmop> - : NeonI_Scalar2SameMisc_D_size { - def bb : NeonI_Scalar2SameMisc_size; - def hh : NeonI_Scalar2SameMisc_size; - def ss : NeonI_Scalar2SameMisc_size; -} - -class NeonI_Scalar2SameMisc_fcvtxn_D_size opcode, string asmop> - : NeonI_Scalar2SameMisc_size; - -multiclass NeonI_Scalar2SameMisc_narrow_HSD_size opcode, - string asmop> { - def bh : NeonI_Scalar2SameMisc_size; - def hs : NeonI_Scalar2SameMisc_size; - def sd : NeonI_Scalar2SameMisc_size; -} - -class NeonI_Scalar2SameMisc_accum_size size, bits<5> opcode, - string asmop, RegisterClass FPRC> - : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar2SameMisc_accum_BHSD_size opcode, - string asmop> { - - let Constraints = "$Src = $Rd" in { - def bb : NeonI_Scalar2SameMisc_accum_size; - def hh : NeonI_Scalar2SameMisc_accum_size; - def ss : NeonI_Scalar2SameMisc_accum_size; - def dd : NeonI_Scalar2SameMisc_accum_size; - } -} - -class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns - : Pat<(f32 (opnode (f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns { - def : Pat<(v1i32 (opnode (f32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(v1i64 (opnode (f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -class Neon_Scalar2SameMisc_vcvt_D_size_patterns - : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns { - def : Pat<(f32 (opnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(f64 (opnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -multiclass Neon_Scalar2SameMisc_SD_size_patterns { - def : Pat<(f32 (opnode (f32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(f64 (opnode (f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -class Neon_Scalar2SameMisc_V1_D_size_patterns - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -class NeonI_Scalar2SameMisc_cmpz_D_size opcode, string asmop> - : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar2SameMisc_cmpz_SD_size opcode, - string asmop> { - def ssi : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; - def ddi : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; -} - -class Neon_Scalar2SameMisc_cmpz_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), - (v1i64 (bitconvert 
(v8i8 Neon_AllZero))))), - (INSTD FPR64:$Rn, 0)>; - -class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns - : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn), - (i32 neon_uimm0:$Imm), CC)), - (INSTD FPR64:$Rn, neon_uimm0:$Imm)>; - -multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns { - def : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (f32 fpzz32:$FPImm))), - (INSTS FPR32:$Rn, fpzz32:$FPImm)>; - def : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (f32 fpzz32:$FPImm))), - (INSTD FPR64:$Rn, fpzz32:$FPImm)>; - def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpzz32:$FPImm), CC)), - (INSTD FPR64:$Rn, fpzz32:$FPImm)>; -} - -multiclass Neon_Scalar2SameMisc_D_size_patterns { - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -multiclass Neon_Scalar2SameMisc_BHSD_size_patterns - : Neon_Scalar2SameMisc_D_size_patterns { - def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))), - (INSTB FPR8:$Rn)>; - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))), - (INSTH FPR16:$Rn)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; -} - -multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns< - SDPatternOperator opnode, - Instruction INSTH, - Instruction INSTS, - Instruction INSTD> { - def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))), - (INSTH FPR16:$Rn)>; - def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -} - -multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns< - SDPatternOperator opnode, - Instruction INSTB, - Instruction INSTH, - Instruction INSTS, - Instruction INSTD> { - def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))), - (INSTB FPR8:$Src, FPR8:$Rn)>; - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))), - (INSTH FPR16:$Src, FPR16:$Rn)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Src, FPR32:$Rn)>; - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Src, FPR64:$Rn)>; -} - -// Scalar Shift By Immediate - -class NeonI_ScalarShiftImm_size opcode, string asmop, - RegisterClass FPRC, Operand ImmTy> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_ScalarShiftRightImm_D_size opcode, - string asmop> { - def ddi : NeonI_ScalarShiftImm_size { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftRightImm_BHSD_size opcode, - string asmop> - : NeonI_ScalarShiftRightImm_D_size { - def bbi : NeonI_ScalarShiftImm_size { - bits<3> Imm; - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - let Inst{18-16} = Imm; - } - def hhi : NeonI_ScalarShiftImm_size { - bits<4> Imm; - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - let Inst{19-16} = Imm; - } - def ssi : NeonI_ScalarShiftImm_size { - bits<5> Imm; - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftLeftImm_D_size opcode, - string asmop> { - def ddi : NeonI_ScalarShiftImm_size { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftLeftImm_BHSD_size opcode, - string asmop> - : NeonI_ScalarShiftLeftImm_D_size { - def bbi : NeonI_ScalarShiftImm_size { - bits<3> Imm; - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - let Inst{18-16} = Imm; - } - def hhi : NeonI_ScalarShiftImm_size { - bits<4> Imm; - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - let Inst{19-16} = Imm; - } - def ssi : NeonI_ScalarShiftImm_size { - bits<5> Imm; - let Inst{22-21} = 0b01; 
// immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } -} - -class NeonI_ScalarShiftRightImm_accum_D_size opcode, string asmop> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - let Constraints = "$Src = $Rd"; -} - -class NeonI_ScalarShiftLeftImm_accum_D_size opcode, string asmop> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - let Constraints = "$Src = $Rd"; -} - -class NeonI_ScalarShiftImm_narrow_size opcode, string asmop, - RegisterClass FPRCD, RegisterClass FPRCS, - Operand ImmTy> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_ScalarShiftImm_narrow_HSD_size opcode, - string asmop> { - def bhi : NeonI_ScalarShiftImm_narrow_size { - bits<3> Imm; - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - let Inst{18-16} = Imm; - } - def hsi : NeonI_ScalarShiftImm_narrow_size { - bits<4> Imm; - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - let Inst{19-16} = Imm; - } - def sdi : NeonI_ScalarShiftImm_narrow_size { - bits<5> Imm; - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftImm_cvt_SD_size opcode, string asmop> { - def ssi : NeonI_ScalarShiftImm_size { - bits<5> Imm; - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } - def ddi : NeonI_ScalarShiftImm_size { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - } -} - -multiclass Neon_ScalarShiftRImm_D_size_patterns { - def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -multiclass Neon_ScalarShiftLImm_D_size_patterns { - def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -class Neon_ScalarShiftLImm_V1_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), - (v1i64 (Neon_vdup (i32 shl_imm64:$Imm))))), - (INSTD FPR64:$Rn, imm:$Imm)>; - -class Neon_ScalarShiftRImm_V1_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), - (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))), - (INSTD FPR64:$Rn, imm:$Imm)>; - -multiclass Neon_ScalarShiftLImm_BHSD_size_patterns - : Neon_ScalarShiftLImm_D_size_patterns { - def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))), - (INSTB FPR8:$Rn, imm:$Imm)>; - def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))), - (INSTH FPR16:$Rn, imm:$Imm)>; - def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))), - (INSTS FPR32:$Rn, imm:$Imm)>; -} - -class Neon_ScalarShiftLImm_accum_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), - (i32 shl_imm64:$Imm))), - (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; - -class Neon_ScalarShiftRImm_accum_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), - (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; - -multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns< - SDPatternOperator opnode, - Instruction INSTH, - Instruction INSTS, - Instruction INSTD> { - def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))), - (INSTH FPR16:$Rn, imm:$Imm)>; - def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), - (INSTS FPR32:$Rn, imm:$Imm)>; - def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -multiclass 
Neon_ScalarShiftImm_scvtf_SD_size_patterns { - def ssi : Pat<(f32 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), - (INSTS FPR32:$Rn, imm:$Imm)>; - def ddi : Pat<(f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns { - def ssi : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (i32 shr_imm32:$Imm))), - (INSTS FPR32:$Rn, imm:$Imm)>; - def ddi : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -// Scalar Signed Shift Right (Immediate) -defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">; -defm : Neon_ScalarShiftRImm_D_size_patterns; -// Pattern to match llvm.arm.* intrinsic. -def : Neon_ScalarShiftRImm_V1_D_size_patterns; - -// Scalar Unsigned Shift Right (Immediate) -defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">; -defm : Neon_ScalarShiftRImm_D_size_patterns; -// Pattern to match llvm.arm.* intrinsic. -def : Neon_ScalarShiftRImm_V1_D_size_patterns; - -// Scalar Signed Rounding Shift Right (Immediate) -defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">; -defm : Neon_ScalarShiftRImm_D_size_patterns; - -// Scalar Unigned Rounding Shift Right (Immediate) -defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">; -defm : Neon_ScalarShiftRImm_D_size_patterns; - -// Scalar Signed Shift Right and Accumulate (Immediate) -def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">; -def : Neon_ScalarShiftRImm_accum_D_size_patterns - ; - -// Scalar Unsigned Shift Right and Accumulate (Immediate) -def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">; -def : Neon_ScalarShiftRImm_accum_D_size_patterns - ; - -// Scalar Signed Rounding Shift Right and Accumulate (Immediate) -def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">; -def : Neon_ScalarShiftRImm_accum_D_size_patterns - ; - -// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate) -def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">; -def : Neon_ScalarShiftRImm_accum_D_size_patterns - ; - -// Scalar Shift Left (Immediate) -defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">; -defm : Neon_ScalarShiftLImm_D_size_patterns; -// Pattern to match llvm.arm.* intrinsic. -def : Neon_ScalarShiftLImm_V1_D_size_patterns; - -// Signed Saturating Shift Left (Immediate) -defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">; -defm : Neon_ScalarShiftLImm_BHSD_size_patterns; -// Pattern to match llvm.arm.* intrinsic. -defm : Neon_ScalarShiftLImm_D_size_patterns; - -// Unsigned Saturating Shift Left (Immediate) -defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">; -defm : Neon_ScalarShiftLImm_BHSD_size_patterns; -// Pattern to match llvm.arm.* intrinsic. 
-defm : Neon_ScalarShiftLImm_D_size_patterns; - -// Signed Saturating Shift Left Unsigned (Immediate) -defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">; -defm : Neon_ScalarShiftLImm_BHSD_size_patterns; - -// Shift Right And Insert (Immediate) -def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">; -def : Neon_ScalarShiftRImm_accum_D_size_patterns - ; - -// Shift Left And Insert (Immediate) -def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">; -def : Neon_ScalarShiftLImm_accum_D_size_patterns - ; - -// Signed Saturating Shift Right Narrow (Immediate) -defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Unsigned Saturating Shift Right Narrow (Immediate) -defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Signed Saturating Rounded Shift Right Narrow (Immediate) -defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Unsigned Saturating Rounded Shift Right Narrow (Immediate) -defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Signed Saturating Shift Right Unsigned Narrow (Immediate) -defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) -defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Scalar Signed Fixed-point Convert To Floating-Point (Immediate) -defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">; -defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; - -// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) -defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">; -defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; - -// Scalar Floating-point Convert To Signed Fixed-point (Immediate) -defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">; -defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; - -// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) -defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">; -defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; - -// Patterns For Convert Instructions Between v1f64 and v1i64 -class Neon_ScalarShiftImm_cvtf_v1f64_pattern - : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INST FPR64:$Rn, imm:$Imm)>; - -class Neon_ScalarShiftImm_fcvt_v1f64_pattern - : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INST FPR64:$Rn, imm:$Imm)>; - -def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; - -def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; - -def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; - -def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; - -// Scalar Integer Add -let isCommutable = 1 in { -def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; -} - -// Scalar Integer Sub -def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; - -// Pattern for Scalar Integer Add and Sub with D register only -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub -defm : Neon_Scalar3Same_D_size_patterns; -defm : 
Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Integer Saturating Add (Signed, Unsigned) -defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; -defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; - -// Scalar Integer Saturating Sub (Signed, Unsigned) -defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; -defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; - - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Saturating Add, Sub (Signed, Unsigned) -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; - -// Scalar Integer Saturating Doubling Multiply Half High -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in -defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>; - -// Scalar Integer Saturating Rounding Doubling Multiply Half High -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>; -} - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Saturating Doubling Multiply Half High and -// Scalar Integer Saturating Rounding Doubling Multiply Half High -defm : Neon_Scalar3Same_HS_size_patterns; -defm : Neon_Scalar3Same_HS_size_patterns; - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in { -// Scalar Floating-point Multiply Extended -defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>; -} - -// Scalar Floating-point Reciprocal Step -defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Pat<(v1f64 (int_arm_neon_vrecps (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FRECPSddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Floating-point Reciprocal Square Root Step -defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Pat<(v1f64 (int_arm_neon_vrsqrts (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FRSQRTSddd FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>; - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Floating-point Multiply Extended, -multiclass Neon_Scalar3Same_MULX_SD_size_patterns { - def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; -} - -defm : Neon_Scalar3Same_MULX_SD_size_patterns; -def : Pat<(v1f64 (int_aarch64_neon_vmulx (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMULXddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Integer Shift Left (Signed, Unsigned) -def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">; -def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">; - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Integer Saturating Shift Left (Signed, Unsigned) -defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>; -defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>; - -// 
Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Saturating Shift Letf (Signed, Unsigned) -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Saturating Shift Letf (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Integer Rounding Shift Left (Signed, Unsigned) -def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">; -def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">; - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Rounding Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Rounding Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) -defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>; -defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>; - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in { -// Signed Saturating Doubling Multiply-Add Long -defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">; -} -defm : Neon_Scalar3Diff_ml_HS_size_patterns; - -// Signed Saturating Doubling Multiply-Subtract Long -let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in { -defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">; -} -defm : Neon_Scalar3Diff_ml_HS_size_patterns; - -// Signed Saturating Doubling Multiply Long -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in { -defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">; -} -defm : Neon_Scalar3Diff_HS_size_patterns; - -// Scalar Signed Integer Convert To Floating-point -defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">; -defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; - -// Scalar Unsigned Integer Convert To Floating-point -defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">; -defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; - -// Scalar Floating-point Converts -def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">; -def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns; - -defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTAS : 
NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">; -defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; -def : Neon_Scalar2SameMisc_vcvt_D_size_patterns; - -// Patterns For Convert Instructions Between v1f64 and v1i64 -class Neon_Scalar2SameMisc_cvtf_v1f64_pattern - : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>; - -class Neon_Scalar2SameMisc_fcvt_v1f64_pattern - : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; - -def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern; -def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern; - -def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern; -def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern; - -// Scalar Floating-point Reciprocal Estimate -defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">; -defm : Neon_Scalar2SameMisc_SD_size_patterns; -def : Neon_Scalar2SameMisc_V1_D_size_patterns; - -// Scalar Floating-point Reciprocal Exponent -defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">; -defm : Neon_Scalar2SameMisc_SD_size_patterns; - -// Scalar Floating-point Reciprocal Square Root Estimate -defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">; -defm : Neon_Scalar2SameMisc_SD_size_patterns; -def : Neon_Scalar2SameMisc_V1_D_size_patterns; - -// Scalar Floating-point Round -class Neon_ScalarFloatRound_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; - -def : Neon_ScalarFloatRound_pattern; -def : Neon_ScalarFloatRound_pattern; -def : Neon_ScalarFloatRound_pattern; -def : Neon_ScalarFloatRound_pattern; -def : Neon_ScalarFloatRound_pattern; -def : Neon_ScalarFloatRound_pattern; -def : Neon_ScalarFloatRound_pattern; - -// Scalar Integer Compare - -// Scalar Compare Bitwise Equal -def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">; -defm : Neon_Scalar3Same_D_size_patterns; - -class Neon_Scalar3Same_cmp_D_size_v1_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)), - (INSTD FPR64:$Rn, FPR64:$Rm)>; - -def : Neon_Scalar3Same_cmp_D_size_v1_patterns; - -// Scalar Compare Signed Greather Than Or Equal -def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">; -defm : Neon_Scalar3Same_D_size_patterns; -def : Neon_Scalar3Same_cmp_D_size_v1_patterns; - -// Scalar Compare Unsigned Higher Or Same -def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">; -defm : Neon_Scalar3Same_D_size_patterns; -def : Neon_Scalar3Same_cmp_D_size_v1_patterns; - -// Scalar Compare Unsigned Higher -def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">; -defm : Neon_Scalar3Same_D_size_patterns; -def : Neon_Scalar3Same_cmp_D_size_v1_patterns; - -// Scalar Compare Signed Greater 
Than -def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">; -defm : Neon_Scalar3Same_D_size_patterns; -def : Neon_Scalar3Same_cmp_D_size_v1_patterns; - -// Scalar Compare Bitwise Test Bits -def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Compare Bitwise Equal To Zero -def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">; -def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; -def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; - -// Scalar Compare Signed Greather Than Or Equal To Zero -def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">; -def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; -def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; - -// Scalar Compare Signed Greater Than Zero -def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">; -def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; -def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; - -// Scalar Compare Signed Less Than Or Equal To Zero -def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">; -def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; -def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; - -// Scalar Compare Less Than Zero -def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">; -def : Neon_Scalar2SameMisc_cmpz_D_size_patterns; -def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns; - -// Scalar Floating-point Compare - -// Scalar Floating-point Compare Mask Equal -defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Neon_Scalar3Same_cmp_V1_D_size_patterns; - -// Scalar Floating-point Compare Mask Equal To Zero -defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">; -defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; - -// Scalar Floating-point Compare Mask Greater Than Or Equal -defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Neon_Scalar3Same_cmp_V1_D_size_patterns; - -// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero -defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">; -defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; - -// Scalar Floating-point Compare Mask Greather Than -defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Neon_Scalar3Same_cmp_V1_D_size_patterns; - -// Scalar Floating-point Compare Mask Greather Than Zero -defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">; -defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; - -// Scalar Floating-point Compare Mask Less Than Or Equal To Zero -defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">; -defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; - -// Scalar Floating-point Compare Mask Less Than Zero -defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">; -defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns; - -// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal -defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Pat<(v1i64 (int_arm_neon_vacge (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FACGEddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Floating-point Absolute Compare Mask Greater Than -defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">; -defm : Neon_Scalar3Same_SD_size_patterns; -def : 
Pat<(v1i64 (int_arm_neon_vacgt (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FACGTddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Floating-point Absolute Difference -defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">; -defm : Neon_Scalar3Same_SD_size_patterns; - -// Scalar Absolute Value -defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">; -defm : Neon_Scalar2SameMisc_D_size_patterns; - -// Scalar Signed Saturating Absolute Value -defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">; -defm : Neon_Scalar2SameMisc_BHSD_size_patterns; - -// Scalar Negate -defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">; -defm : Neon_Scalar2SameMisc_D_size_patterns; - -// Scalar Signed Saturating Negate -defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">; -defm : Neon_Scalar2SameMisc_BHSD_size_patterns; - -// Scalar Signed Saturating Accumulated of Unsigned Value -defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">; -defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns; - -// Scalar Unsigned Saturating Accumulated of Signed Value -defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">; -defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns; - -def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src), - (v1i64 FPR64:$Rn))), - (SUQADDdd FPR64:$Src, FPR64:$Rn)>; - -def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src), - (v1i64 FPR64:$Rn))), - (USQADDdd FPR64:$Src, FPR64:$Rn)>; - -def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))), - (ABSdd FPR64:$Rn)>; - -def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))), - (SQABSdd FPR64:$Rn)>; - -def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))), - (SQNEGdd FPR64:$Rn)>; - -def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))), - (v1i64 FPR64:$Rn))), - (NEGdd FPR64:$Rn)>; - -// Scalar Signed Saturating Extract Unsigned Narrow -defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">; -defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; - -// Scalar Signed Saturating Extract Narrow -defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">; -defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; - -// Scalar Unsigned Saturating Extract Narrow -defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">; -defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; - -// Scalar Reduce Pairwise - -multiclass NeonI_ScalarPair_D_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def _D_2D : NeonI_ScalarPair, - Sched<[WriteFPALU, ReadFPALU]>; - } -} - -multiclass NeonI_ScalarPair_SD_sizes opcode, - string asmop, bit Commutable = 0> - : NeonI_ScalarPair_D_sizes { - let isCommutable = Commutable in { - def _S_2S : NeonI_ScalarPair, - Sched<[WriteFPALU, ReadFPALU]>; - } -} - -// Scalar Reduce Addition Pairwise (Integer) with -// Pattern to match llvm.arm.* intrinsic -defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>; - -// Pattern to match llvm.aarch64.* intrinsic for -// Scalar Reduce Addition Pairwise (Integer) -def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))), - (ADDPvv_D_2D VPR128:$Rn)>; -def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))), - (ADDPvv_D_2D VPR128:$Rn)>; - -// Scalar Reduce Addition Pairwise (Floating Point) -defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>; - -// Scalar Reduce Maximum Pairwise (Floating Point) -defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>; 
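The scalar reduce-pairwise definitions in this block (ADDPvv, FADDPvv, FMAXPvv, and the remaining F*Pvv forms below) take both operands from a single two-element vector and produce one scalar; the patterns here map int_aarch64_neon_vpadd (and, below, the vpfadd family) onto them. A minimal sketch of that semantics for the add forms, assuming the usual addp/faddp pairwise behaviour; the helper names are illustrative, not LLVM or ACLE APIs:

#include <array>
#include <cstdint>
#include <cstdio>

// What the scalar "reduce pairwise" add forms compute:
//   addp  Dd, Vn.2D  -> Vn[0] + Vn[1]   (the ADDPvv _D_2D variant)
//   faddp Dd, Vn.2D  -> Vn[0] + Vn[1]   (the FADDPvv _D_2D variant)
static uint64_t addp_scalar(const std::array<uint64_t, 2> &Vn) {
  return Vn[0] + Vn[1];
}

static double faddp_scalar(const std::array<double, 2> &Vn) {
  return Vn[0] + Vn[1];
}

int main() {
  std::printf("%llu %g\n",
              (unsigned long long)addp_scalar({{1, 2}}),
              faddp_scalar({{1.5, 2.25}}));
  return 0;
}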
- -// Scalar Reduce Minimum Pairwise (Floating Point) -defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>; - -// Scalar Reduce maxNum Pairwise (Floating Point) -defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>; - -// Scalar Reduce minNum Pairwise (Floating Point) -defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>; - -multiclass Neon_ScalarPair_SD_size_patterns { - def : Pat<(f32 (opnode (v2f32 VPR64:$Rn))), - (INSTS VPR64:$Rn)>; - def : Pat<(f64 (opnode (v2f64 VPR128:$Rn))), - (INSTD VPR128:$Rn)>; -} - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point) -defm : Neon_ScalarPair_SD_size_patterns; - -defm : Neon_ScalarPair_SD_size_patterns; - -defm : Neon_ScalarPair_SD_size_patterns; - -defm : Neon_ScalarPair_SD_size_patterns; - -defm : Neon_ScalarPair_SD_size_patterns; - -def : Pat<(f32 (int_aarch64_neon_vpfadd (v4f32 VPR128:$Rn))), - (FADDPvv_S_2S (v2f32 - (EXTRACT_SUBREG - (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))), - sub_64)))>; - -// Scalar by element Arithmetic - -class NeonI_ScalarXIndexedElemArith opcode, - string rmlane, bit u, bit szhi, bit szlo, - RegisterClass ResFPR, RegisterClass OpFPR, - RegisterOperand OpVPR, Operand OpImm> - : NeonI_ScalarXIndexedElem, - Sched<[WriteFPMul, ReadFPMul, ReadFPMul]> { - bits<3> Imm; - bits<5> MRm; -} - -class NeonI_ScalarXIndexedElemArith_Constraint_Impl opcode, - string rmlane, - bit u, bit szhi, bit szlo, - RegisterClass ResFPR, - RegisterClass OpFPR, - RegisterOperand OpVPR, - Operand OpImm> - : NeonI_ScalarXIndexedElem, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - let Constraints = "$src = $Rd"; - bits<3> Imm; - bits<5> MRm; -} - -// Scalar Floating Point multiply (scalar, by element) -def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul", - 0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul", - 0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { - let Inst{11} = Imm{0}; // h - let Inst{21} = 0b0; // l - let Inst{20-16} = MRm; -} - -// Scalar Floating Point multiply extended (scalar, by element) -def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx", - 0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx", - 0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { - let Inst{11} = Imm{0}; // h - let Inst{21} = 0b0; // l - let Inst{20-16} = MRm; -} - -multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns< - SDPatternOperator opnode, - Instruction INST, - ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, - ValueType OpNTy, ValueType ExTy, Operand OpNImm> { - - def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), - (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))), - (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))), - (ResTy (INST (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // swapped operands - def : Pat<(ResTy (opnode - (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), - (ResTy FPRC:$Rn))), - (ResTy (INST (ResTy FPRC:$Rn), (OpTy 
VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), - (ResTy FPRC:$Rn))), - (ResTy (INST (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; -} - -// Patterns for Scalar Floating Point multiply (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; - -// Patterns for Scalar Floating Point multiply extended (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; - -// Scalar Floating Point fused multiply-add (scalar, by element) -def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", - 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", - 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { - let Inst{11} = Imm{0}; // h - let Inst{21} = 0b0; // l - let Inst{20-16} = MRm; -} - -// Scalar Floating Point fused multiply-subtract (scalar, by element) -def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", - 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", - 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { - let Inst{11} = Imm{0}; // h - let Inst{21} = 0b0; // l - let Inst{20-16} = MRm; -} -// We are allowed to match the fma instruction regardless of compile options. -multiclass Neon_ScalarXIndexedElem_FMA_Patterns< - Instruction FMLAI, Instruction FMLSI, - ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, - ValueType OpNTy, ValueType ExTy, Operand OpNImm> { - // fmla - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // swapped fmla operands - def : Pat<(ResTy (fma - (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), - (ResTy FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), - (ResTy FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // fmls - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // swapped fmls operands - def : Pat<(ResTy (fma - (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), - (ResTy 
FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma - (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), - (ResTy FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; -} - -// Scalar Floating Point fused multiply-add and -// multiply-subtract (scalar, by element) -defm : Neon_ScalarXIndexedElem_FMA_Patterns; -defm : Neon_ScalarXIndexedElem_FMA_Patterns; -defm : Neon_ScalarXIndexedElem_FMA_Patterns; - -// Scalar Signed saturating doubling multiply long (scalar, by element) -def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -multiclass Neon_ScalarXIndexedElem_MUL_Patterns< - SDPatternOperator opnode, - Instruction INST, - ValueType ResTy, RegisterClass FPRC, - ValueType OpVTy, ValueType OpTy, - ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { - - def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), - (OpVTy (scalar_to_vector - (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), - (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; - - //swapped operands - def : Pat<(ResTy (opnode - (OpVTy (scalar_to_vector - (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))), - (OpVTy FPRC:$Rn))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)), - (OpVTy FPRC:$Rn))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; -} - - -// Patterns for Scalar Signed saturating doubling -// multiply long (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; - -// Scalar Signed saturating doubling multiply-add long (scalar, by element) -def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} 
= Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -// Scalar Signed saturating doubling -// multiply-subtract long (scalar, by element) -def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -multiclass Neon_ScalarXIndexedElem_MLAL_Patterns< - SDPatternOperator opnode, - SDPatternOperator coreopnode, - Instruction INST, - ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC, - ValueType OpTy, - ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { - - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode (OpTy FPRC:$Rn), - (OpTy (scalar_to_vector - (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode (OpTy FPRC:$Rn), - (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; - - // swapped operands - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode - (OpTy (scalar_to_vector - (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))), - (OpTy FPRC:$Rn))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode - (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)), - (OpTy FPRC:$Rn))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; -} - -// Patterns for Scalar Signed saturating -// doubling multiply-add long (scalar, by element) -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; - -// Patterns for Scalar Signed saturating -// doubling multiply-sub long (scalar, by element) -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; - -// Scalar Signed 
saturating doubling multiply returning -// high half (scalar, by element) -def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -// Patterns for Scalar Signed saturating doubling multiply returning -// high half (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; - -// Scalar Signed saturating rounding doubling multiply -// returning high half (scalar, by element) -def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; - -// Scalar general arithmetic operation -class Neon_Scalar_GeneralMath2D_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; - -class Neon_Scalar_GeneralMath3D_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (INST FPR64:$Rn, FPR64:$Rm)>; - -class Neon_Scalar_GeneralMath4D_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), - (v1f64 FPR64:$Ra))), - (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; - -def : Neon_Scalar_GeneralMath2D_pattern; -def : Neon_Scalar_GeneralMath2D_pattern; - -def : Neon_Scalar_GeneralMath4D_pattern; -def : 
Neon_Scalar_GeneralMath4D_pattern; - -// Scalar Copy - DUP element to scalar -class NeonI_Scalar_DUP - : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm), - asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]", - [], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 0)), - (f32 (EXTRACT_SUBREG (v4f32 VPR128:$Rn), sub_32))>; -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 1)), - (f32 (DUPsv_S (v4f32 VPR128:$Rn), 1))>; -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 2)), - (f32 (DUPsv_S (v4f32 VPR128:$Rn), 2))>; -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 3)), - (f32 (DUPsv_S (v4f32 VPR128:$Rn), 3))>; - -def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 0)), - (f64 (EXTRACT_SUBREG (v2f64 VPR128:$Rn), sub_64))>; -def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 1)), - (f64 (DUPdv_D (v2f64 VPR128:$Rn), 1))>; - -def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 0)), - (f32 (EXTRACT_SUBREG (v2f32 VPR64:$Rn), sub_32))>; -def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 1)), - (f32 (DUPsv_S (v4f32 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - 1))>; - -def : Pat<(f64 (vector_extract (v1f64 VPR64:$Rn), 0)), - (f64 (EXTRACT_SUBREG (v1f64 VPR64:$Rn), sub_64))>; - -multiclass NeonI_Scalar_DUP_Ext_Vec_pattern { - - def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)), - (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>; - - def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)), - (ResTy (DUPI - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - OpNImm:$Imm))>; -} - -// Patterns for extract subvectors of v1ix data using scalar DUP instructions. -defm : NeonI_Scalar_DUP_Ext_Vec_pattern; -defm : NeonI_Scalar_DUP_Ext_Vec_pattern; -defm : NeonI_Scalar_DUP_Ext_Vec_pattern; - -multiclass NeonI_Scalar_DUP_Copy_pattern1 { - - def : Pat<(ResTy (vector_insert (ResTy undef), - (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), - (neon_uimm0_bare:$Imm))), - (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; - - def : Pat<(ResTy (vector_insert (ResTy undef), - (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), - (OpNImm:$Imm))), - (ResTy (DUPI - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - OpNImm:$Imm))>; -} - -multiclass NeonI_Scalar_DUP_Copy_pattern2 { - - def : Pat<(ResTy (scalar_to_vector - (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))), - (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; - - def : Pat<(ResTy (scalar_to_vector - (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))), - (ResTy (DUPI - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - OpNImm:$Imm))>; -} - -// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP -// instructions. 
-defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern2; -defm : NeonI_Scalar_DUP_Copy_pattern2; -defm : NeonI_Scalar_DUP_Copy_pattern2; -defm : NeonI_Scalar_DUP_Copy_pattern2; - -multiclass NeonI_Scalar_DUP_alias { - def : NeonInstAlias; -} - -// Aliases for Scalar copy - DUP element (scalar) -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. -defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>; -defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>; -defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>; -defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>; - -multiclass NeonI_SDUP { - def : Pat<(ResTy (GetLow VPR128:$Rn)), - (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>; - def : Pat<(ResTy (GetHigh VPR128:$Rn)), - (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>; -} - -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; - -// The following is for sext/zext from v1xx to v1xx -multiclass NeonI_ext { - // v1i32 -> v1i64 - def : Pat<(v1i64 (ExtOp (v1i32 FPR32:$Rn))), - (EXTRACT_SUBREG - (v2i64 (!cast(prefix # "_2S") - (v2i32 (SUBREG_TO_REG (i64 0), $Rn, sub_32)), 0)), - sub_64)>; - - // v1i16 -> v1i32 - def : Pat<(v1i32 (ExtOp (v1i16 FPR16:$Rn))), - (EXTRACT_SUBREG - (v4i32 (!cast(prefix # "_4H") - (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)), - sub_32)>; - - // v1i8 -> v1i16 - def : Pat<(v1i16 (ExtOp (v1i8 FPR8:$Rn))), - (EXTRACT_SUBREG - (v8i16 (!cast(prefix # "_8B") - (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)), - sub_16)>; -} - -defm NeonI_zext : NeonI_ext<"USHLLvvi", zext>; -defm NeonI_sext : NeonI_ext<"SSHLLvvi", sext>; - -// zext v1i8 -> v1i32 -def : Pat<(v1i32 (zext (v1i8 FPR8:$Rn))), - (v1i32 (EXTRACT_SUBREG - (v1i64 (SUBREG_TO_REG (i64 0), - (v1i8 (DUPbv_B - (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), - 0)), - sub_8)), - sub_32))>; - -// zext v1i8 -> v1i64 -def : Pat<(v1i64 (zext (v1i8 FPR8:$Rn))), - (v1i64 (SUBREG_TO_REG (i64 0), - (v1i8 (DUPbv_B - (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), - 0)), - sub_8))>; - -// zext v1i16 -> v1i64 -def : Pat<(v1i64 (zext (v1i16 FPR16:$Rn))), - (v1i64 (SUBREG_TO_REG (i64 0), - (v1i16 (DUPhv_H - (v8i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), - 0)), - sub_16))>; - -// sext v1i8 -> v1i32 -def : Pat<(v1i32 (sext (v1i8 FPR8:$Rn))), - (EXTRACT_SUBREG - (v4i32 (SSHLLvvi_4H - (v4i16 (SUBREG_TO_REG (i64 0), - (v1i16 (EXTRACT_SUBREG - (v8i16 (SSHLLvvi_8B - (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)), - sub_16)), - sub_16)), 0)), - sub_32)>; - -// sext v1i8 -> v1i64 -def : Pat<(v1i64 (sext (v1i8 FPR8:$Rn))), - (EXTRACT_SUBREG - (v2i64 (SSHLLvvi_2S - (v2i32 (SUBREG_TO_REG (i64 0), - (v1i32 (EXTRACT_SUBREG - (v4i32 (SSHLLvvi_4H - (v4i16 (SUBREG_TO_REG (i64 0), - (v1i16 (EXTRACT_SUBREG - (v8i16 (SSHLLvvi_8B - (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)), - sub_16)), - sub_16)), 0)), - sub_32)), - sub_32)), 0)), - sub_64)>; - - -// sext v1i16 -> v1i64 -def : Pat<(v1i64 (sext (v1i16 FPR16:$Rn))), - (EXTRACT_SUBREG - (v2i64 (SSHLLvvi_2S - (v2i32 (SUBREG_TO_REG (i64 0), - (v1i32 (EXTRACT_SUBREG - (v4i32 (SSHLLvvi_4H - (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)), - sub_32)), - sub_32)), 0)), - sub_64)>; - -//===----------------------------------------------------------------------===// 
-// Non-Instruction Patterns
-//===----------------------------------------------------------------------===//
-
-// 64-bit vector bitcasts...
-
-def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v1f64 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1f64 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1f64 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1f64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1f64 VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64 VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(v1f64 (bitconvert (v1i64 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2f32 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2i32 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v4i16 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v8i8 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64 VPR64:$src))), (v1f64 VPR64:$src)>;
-
-// ..and 128-bit vector bitcasts...
-
-def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-// ...and scalar bitcasts...
-def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>;
-def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
-
-def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>;
-
-def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>;
-
-def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
-
-def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>;
-
-def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>;
-def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-
-def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>;
-
-def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-// Scalar Three Same
-
-def neon_uimm3 : Operand,
-                 ImmLeaf {
-  let ParserMatchClass = uimm3_asmoperand;
-  let PrintMethod = "printUImmHexOperand";
-}
-
-def neon_uimm4 : Operand,
-                 ImmLeaf {
-  let ParserMatchClass = uimm4_asmoperand;
-  let PrintMethod
= "printUImmHexOperand"; -} - -// Bitwise Extract -class NeonI_Extract op2, string asmop, - string OpS, RegisterOperand OpVPR, Operand OpImm> - : NeonI_BitExtract, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>{ - bits<4> Index; -} - -def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b", - VPR64, neon_uimm3> { - let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}}; -} - -def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b", - VPR128, neon_uimm4> { - let Inst{14-11} = Index; -} - -class NI_Extract - : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm), - (i64 OpImm:$Imm))), - (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>; - -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; - -// Table lookup -class NI_TBL op2, bits<2> len, bit op, - string asmop, string OpS, RegisterOperand OpVPR, - RegisterOperand VecList> - : NeonI_TBL, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -// The vectors in look up table are always 16b -multiclass NI_TBL_pat len, bit op, string asmop, string List> { - def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64, - !cast(List # "16B_operand")>; - - def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128, - !cast(List # "16B_operand")>; -} - -defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">; -defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">; -defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">; -defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">; - -// Table lookup extension -class NI_TBX op2, bits<2> len, bit op, - string asmop, string OpS, RegisterOperand OpVPR, - RegisterOperand VecList> - : NeonI_TBL, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -// The vectors in look up table are always 16b -multiclass NI_TBX_pat len, bit op, string asmop, string List> { - def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64, - !cast(List # "16B_operand")>; - - def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128, - !cast(List # "16B_operand")>; -} - -defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">; -defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">; -defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">; -defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">; - -class NeonI_INS_main - : NeonI_copy<0b1, 0b0, 0b0011, - (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), - asmop # "\t$Rd." 
# Res # "[$Imm], $Rn", - [(set (ResTy VPR128:$Rd), - (ResTy (vector_insert - (ResTy VPR128:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<4> Imm; - let Constraints = "$src = $Rd"; -} - -//Insert element (vector, from main) -def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", - (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", - (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", - (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", - (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; - -class Neon_INS_main_pattern - : Pat<(ResTy (vector_insert - (ResTy VPR64:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))), - (ResTy (EXTRACT_SUBREG - (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - OpGPR:$Rn, OpImm:$Imm)), sub_64))>; - -def INSbw_pattern : Neon_INS_main_pattern; -def INShw_pattern : Neon_INS_main_pattern; -def INSsw_pattern : Neon_INS_main_pattern; -def INSdx_pattern : Neon_INS_main_pattern; - -class NeonI_INS_element - : NeonI_insert<0b1, 0b1, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, - ResImm:$Immd, ResImm:$Immn), - asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", - [], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - bits<4> Immd; - bits<4> Immn; -} - -//Insert element (vector, from element) -def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { - let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; - let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; -} -def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { - let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; - let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; - // bit 11 is unspecified, but should be set to zero. -} -def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { - let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; - let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; - // bits 11-12 are unspecified, but should be set to zero. -} -def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { - let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; - let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; - // bits 11-13 are unspecified, but should be set to zero. 
-} - -def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", - (INSELb VPR128:$Rd, VPR128:$Rn, - neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", - (INSELh VPR128:$Rd, VPR128:$Rn, - neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", - (INSELs VPR128:$Rd, VPR128:$Rn, - neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", - (INSELd VPR128:$Rd, VPR128:$Rn, - neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; - -multiclass Neon_INS_elt_pattern { -def : Pat<(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (StImm:$Immn))), - (StImm:$Immd))), - (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), - StImm:$Immd, StImm:$Immn)>; - -def : Pat <(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (StImm:$Immd))), - (INS (ResTy VPR128:$src), - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), - StImm:$Immd, NaImm:$Immn)>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (StImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy VPR128:$Rn), - NaImm:$Immd, StImm:$Immn)), - sub_64))>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), - NaImm:$Immd, NaImm:$Immn)), - sub_64))>; -} - -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; - -multiclass Neon_INS_elt_float_pattern { -def : Pat <(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy OpFPR:$Rn), - (ResImm:$Imm))), - (INS (ResTy VPR128:$src), - (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), - ResImm:$Imm, - (i64 0))>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy OpFPR:$Rn), - (ResImm:$Imm))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), - ResImm:$Imm, - (i64 0))), - sub_64))>; -} - -defm : Neon_INS_elt_float_pattern; -defm : Neon_INS_elt_float_pattern; - -class NeonI_SMOV - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -//Signed integer move (main, from element) -def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -multiclass Neon_SMOVx_pattern { - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (StTy VPR128:$Rn), 
(StImm:$Imm))))), - eleTy)), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (StTy VPR128:$Rn), (StImm:$Imm))))), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; -} - -defm : Neon_SMOVx_pattern; -defm : Neon_SMOVx_pattern; -defm : Neon_SMOVx_pattern; - -class Neon_SMOVw_pattern - : Pat<(i32 (sext_inreg - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - -def : Neon_SMOVw_pattern; -def : Neon_SMOVw_pattern; - -class NeonI_UMOV - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -//Unsigned integer move (main, from element) -def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, - GPR64, i64> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", - (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", - (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; - -class Neon_UMOV_pattern - : Pat<(ResTy (vector_extract - (NaTy VPR64:$Rn), NaImm:$Imm)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - -def : Neon_UMOV_pattern; -def : Neon_UMOV_pattern; -def : Neon_UMOV_pattern; - -def : Pat<(i32 (and - (i32 (vector_extract - (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), - 255)), - (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), - 65535)), - (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; - -def : Pat<(i64 (zext - (i32 (vector_extract - (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), - (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), - 255)), - (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm3_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), - 65535)), - (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm2_bare:$Imm)>; - -def : Pat<(i64 (zext - (i32 (vector_extract - (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), - (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm0_bare:$Imm)>; - -// Additional copy patterns for scalar types -def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), - (UMOVwb (v16i8 - (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; - -def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), - (UMOVwh (v8i16 - (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; - -def : Pat<(i32 
(vector_extract (v1i32 FPR32:$Rn), (i64 0))), - (FMOVws FPR32:$Rn)>; - -def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), - (FMOVxd FPR64:$Rn)>; - -def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), - (f64 FPR64:$Rn)>; - -def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), - (v1i8 (EXTRACT_SUBREG (v16i8 - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_8))>; - -def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), - (v1i16 (EXTRACT_SUBREG (v8i16 - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_16))>; - -def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), - (FMOVsw $src)>; - -def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), - (FMOVdx $src)>; - -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), - (v8i8 (EXTRACT_SUBREG (v16i8 - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_64))>; - -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), - (v4i16 (EXTRACT_SUBREG (v8i16 - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_64))>; - -def : Pat<(v2i32 (scalar_to_vector GPR32:$Rn)), - (v2i32 (EXTRACT_SUBREG (v16i8 - (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_64))>; - -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v4i32 (scalar_to_vector GPR32:$Rn)), - (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v2i64 (scalar_to_vector GPR64:$Rn)), - (INSdx (v2i64 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>; -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>; - -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), - (v1f64 FPR64:$Rn)>; - -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), - (f64 FPR64:$src), sub_64)>; - -class NeonI_DUP_Elt - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} - -def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} - -def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} - -def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} - -def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -multiclass NeonI_DUP_Elt_pattern { -def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), - (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; - -def : Pat<(ResTy (Neon_vduplane - (NaTy VPR64:$Rn), OpNImm:$Imm)), - (ResTy (DUPELT - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; -} -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : 
NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; - -def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))), - (v2f32 (DUPELT2s - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (i64 0)))>; -def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), - (v4f32 (DUPELT4s - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (i64 0)))>; -def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), - (v2f64 (DUPELT2d - (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), - (i64 0)))>; - -multiclass NeonI_DUP_pattern { -def : Pat<(ResTy (Neon_vduplane (OpTy OpRC:$Rn), OpNImm:$Imm)), - (ResTy (DUPELT - (SUBREG_TO_REG (i64 0), OpRC:$Rn, SubIndex), OpNImm:$Imm))>; -} - -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; - -class NeonI_DUP - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]>; - -def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { - let Inst{20-16} = 0b00001; - // bits 17-20 are unspecified, but should be set to zero. -} - -def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { - let Inst{20-16} = 0b00010; - // bits 18-20 are unspecified, but should be set to zero. -} - -def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { - let Inst{20-16} = 0b00100; - // bits 19-20 are unspecified, but should be set to zero. -} - -def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { - let Inst{20-16} = 0b01000; - // bit 20 is unspecified, but should be set to zero. -} - -def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { - let Inst{20-16} = 0b00001; - // bits 17-20 are unspecified, but should be set to zero. -} - -def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { - let Inst{20-16} = 0b00010; - // bits 18-20 are unspecified, but should be set to zero. -} - -def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { - let Inst{20-16} = 0b00100; - // bits 19-20 are unspecified, but should be set to zero. 
-} - -// patterns for CONCAT_VECTORS -multiclass Concat_Vector_Pattern { -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), - (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), - (INSELd - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), - (i64 1), - (i64 0))>; -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), - (DUPELT2d - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (i64 0))> ; -} - -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; - -def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), undef)), - (v2i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32))>; -def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (v4i32 (INSELs - (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)), - (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - (i64 1), - (i64 0))), - sub_64)>; -def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rn))), - (DUPELT2s (v4i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32)), 0)>; - -//patterns for EXTRACT_SUBVECTOR -def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), - (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), - (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), - (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), - (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), - (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), - (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; - -// The followings are for instruction class (3V Elem) - -// Variant 1 - -class NI_2VE size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - bits<3> Index; - bits<5> Re; - - let Constraints = "$src = $Rd"; -} - -multiclass NI_2VE_v1 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
- def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; -defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; - -// Pattern for lane in 128-bit vector -class NI_2VE_laneq - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VE_lane - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VE_v1_pat -{ - def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; - - def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; - - def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; - - def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; -} - -defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; -defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; - -class NI_2VE_2op size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<3> Index; - bits<5> Re; -} - -multiclass NI_2VE_v1_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
- def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; -defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; -defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; -} - -// Pattern for lane in 128-bit vector -class NI_2VE_mul_laneq - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VE_mul_lane - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VE_mul_v1_pat { - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i32, v2i32, v4i32>; - - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4i32, v4i32, v4i32>; - - def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; - - def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i32, v2i32, v2i32>; - - def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; -} - -defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; -defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; -defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; - -// Variant 2 - -multiclass NI_2VE_v2_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! 
- - def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; -defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; -} - -class NI_2VE_mul_lane_2d - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; - -multiclass NI_2VE_mul_v2_pat { - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2f32, v2f32, v4f32>; - - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4f32, v4f32, v4f32>; - - def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR128, v2f64, v2f64, v2f64>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2f32, v2f32, v2f32>; - - def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR64, v2f64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} - -defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; -defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; - -def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), - (v2f32 VPR64:$Rn))), - (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), - (v4f32 VPR128:$Rn))), - (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), - (v2f64 VPR128:$Rn))), - (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; - -// The followings are patterns using fma -// -ffp-contract=fast generates fma - -multiclass NI_2VE_v2 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! 
- - def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; - } -} - -defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; -defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; - -// Pattern for lane in 128-bit vector -class NI_2VEswap_laneq - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane 0 -class NI_2VEfma_lane0 - : Pat<(ResTy (op (ResTy ResVPR:$Rn), - (ResTy (Neon_vdup (f32 FPR32:$Re))), - (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane_2d2d - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; - - -multiclass NI_2VE_fma_v2_pat { - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEfma_lane0(subop # "_2s4s"), - op, VPR64, v2f32>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEfma_lane0(subop # "_4s4s"), - op, VPR128, v4f32>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} - -defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; - -// Pattern for lane 0 -class NI_2VEfms_lane0 - : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), - (ResTy (Neon_vdup (f32 FPR32:$Re))), - (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -multiclass NI_2VE_fms_v2_pat -{ - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEfms_lane0(subop # "_2s4s"), - op, VPR64, v2f32>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEfms_lane0(subop # "_4s4s"), - op, VPR128, v4f32>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, 
node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(fneg (Neon_combine_2d - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d - (fneg node:$LHS), (fneg node:$RHS))>>; -} - -defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; - -// Variant 3: Long type -// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S -// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S - -multiclass NI_2VE_v3 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", - neon_uimm2_bare, VPR128, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. - def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", - neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; -defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; -defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; -defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; -defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; -defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; - -multiclass NI_2VE_v3_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", - neon_uimm2_bare, VPR128, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
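The v0-v15 restriction in the comment above is an encoding limit: for by-element forms with 16-bit (H) lanes, the element register occupies only four bits, as the field assignments in the _4s8h and _4s4h definitions that follow show. A minimal sketch of that packing, using a hypothetical helper name that is not part of the patch:

#include <cassert>
#include <cstdint>

// Hypothetical encode helper (illustration only): pack the lane index and
// element-register fields of a ".h" by-element instruction the same way the
// _4s8h/_4s4h definitions below assign them.
uint32_t encodeHElemFields(unsigned Re, unsigned Index) {
  assert(Re < 16 && "H-element forms can only name V0-V15 (4-bit field)");
  assert(Index < 8 && "H lane index is 3 bits");
  uint32_t Bits = 0;
  Bits |= ((Index >> 2) & 1) << 11; // Inst{11}    = Index{2}
  Bits |= ((Index >> 1) & 1) << 21; // Inst{21}    = Index{1}
  Bits |= (Index & 1) << 20;        // Inst{20}    = Index{0}
  Bits |= (Re & 0xF) << 16;         // Inst{19-16} = Re{3-0}
  return Bits;
}

int main() { return encodeHElemFields(15, 7) ? 0 : 1; }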
- def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", - neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; -defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; -defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; -} - -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), - (FMOVdd $src)>; - -// Pattern for lane in 128-bit vector -class NI_2VEL2_laneq - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEL2_lane - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -class NI_2VEL2_lane0 - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), - (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; - -multiclass NI_2VEL_v3_pat { - def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; - - def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - def : NI_2VEL2_lane0(subop # "_4s8h"), - op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; - - def : NI_2VEL2_lane0(subop # "_2d4s"), - op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; - - def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; - - def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; -defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; -defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; -defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; - -// Pattern for lane in 128-bit vector -class NI_2VEL2_mul_laneq - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEL2_mul_lane - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 
OpImm:$Index)>; - -// Pattern for fixed lane 0 -class NI_2VEL2_mul_lane0 - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), - (INST VPR128:$Rn, (DupInst $Re), 0)>; - -multiclass NI_2VEL_mul_v3_pat { - def : NI_2VE_mul_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; - - def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i64, v2i32, v4i32>; - - def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - def : NI_2VEL2_mul_lane0(subop # "_4s8h"), - op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; - - def : NI_2VEL2_mul_lane0(subop # "_2d4s"), - op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; - - def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i64, v2i32, v2i32>; - - def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; -defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; -defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; - -multiclass NI_qdma { - def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (op node:$Ra, - (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; - - def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (op node:$Ra, - (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; -} - -defm Neon_qdmlal : NI_qdma; -defm Neon_qdmlsl : NI_qdma; - -multiclass NI_2VEL_v3_qdma_pat { - def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, - v4i32, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - !cast(op # "_2d"), VPR128, VPR64, VPR128, - v2i64, v2i32, v4i32>; - - def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - !cast(op # "_4s"), VPR128Lo, - v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - !cast(op # "_2d"), VPR128, - v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - def : NI_2VEL2_lane0(subop # "_4s8h"), - !cast(op # "_4s"), - v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; - - def : NI_2VEL2_lane0(subop # "_2d4s"), - !cast(op # "_2d"), - v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, - v4i32, v4i16, v4i16>; - - def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - !cast(op # "_2d"), VPR128, VPR64, VPR64, - v2i64, v2i32, v2i32>; - - def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - !cast(op # "_4s"), VPR64Lo, - v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - !cast(op # "_2d"), VPR64, - v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; -defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; - -// End of implementation for instruction class (3V 
Elem) - -class NeonI_REV size, bit Q, bit U, - bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, - SDPatternOperator Neon_Rev> - : NeonI_2VMisc, - Sched<[WriteFPALU, ReadFPALU]>; - -def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128, - v16i8, Neon_rev64>; -def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128, - v8i16, Neon_rev64>; -def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128, - v4i32, Neon_rev64>; -def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64, - v8i8, Neon_rev64>; -def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64, - v4i16, Neon_rev64>; -def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64, - v2i32, Neon_rev64>; - -def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>; -def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>; - -def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128, - v16i8, Neon_rev32>; -def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128, - v8i16, Neon_rev32>; -def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64, - v8i8, Neon_rev32>; -def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64, - v4i16, Neon_rev32>; - -def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128, - v16i8, Neon_rev16>; -def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64, - v8i8, Neon_rev16>; - -multiclass NeonI_PairwiseAdd opcode, - SDPatternOperator Neon_Padd> { - def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.16b", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.8b", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.8h", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.4h", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.4s", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.1d, $Rn.2s", - [(set (v1i64 VPR64:$Rd), - (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010, - int_arm_neon_vpaddls>; -defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010, - int_arm_neon_vpaddlu>; - -def : Pat<(v1i64 (int_aarch64_neon_saddlv (v2i32 VPR64:$Rn))), - (SADDLP2s1d $Rn)>; -def : Pat<(v1i64 (int_aarch64_neon_uaddlv (v2i32 VPR64:$Rn))), - (UADDLP2s1d $Rn)>; - -multiclass NeonI_PairwiseAddAcc opcode, - SDPatternOperator Neon_Padd> { - let Constraints = "$src = $Rd" in { - def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), 
- asmop # "\t$Rd.8h, $Rn.16b", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_Padd - (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.8b", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_Padd - (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.8h", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Padd - (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.4h", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Padd - (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.4s", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_Padd - (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.1d, $Rn.2s", - [(set (v1i64 VPR64:$Rd), - (v1i64 (Neon_Padd - (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110, - int_arm_neon_vpadals>; -defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110, - int_arm_neon_vpadalu>; - -multiclass NeonI_2VMisc_BHSDsize_1Arg opcode> { - def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.4h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>; -defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>; -defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>; -defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>; - -multiclass NeonI_2VMisc_BHSD_1Arg_Pattern { - def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))), - (v16i8 (!cast(Prefix # 16b) (v16i8 VPR128:$Rn)))>; - - def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))), - (v8i16 (!cast(Prefix # 8h) (v8i16 VPR128:$Rn)))>; - - def : 
Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))), - (v4i32 (!cast(Prefix # 4s) (v4i32 VPR128:$Rn)))>; - - def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))), - (v2i64 (!cast(Prefix # 2d) (v2i64 VPR128:$Rn)))>; - - def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))), - (v8i8 (!cast(Prefix # 8b) (v8i8 VPR64:$Rn)))>; - - def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))), - (v4i16 (!cast(Prefix # 4h) (v4i16 VPR64:$Rn)))>; - - def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))), - (v2i32 (!cast(Prefix # 2s) (v2i32 VPR64:$Rn)))>; -} - -defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>; -defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>; -defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>; - -def : Pat<(v16i8 (sub - (v16i8 Neon_AllZero), - (v16i8 VPR128:$Rn))), - (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (sub - (v8i8 Neon_AllZero), - (v8i8 VPR64:$Rn))), - (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>; -def : Pat<(v8i16 (sub - (v8i16 (bitconvert (v16i8 Neon_AllZero))), - (v8i16 VPR128:$Rn))), - (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>; -def : Pat<(v4i16 (sub - (v4i16 (bitconvert (v8i8 Neon_AllZero))), - (v4i16 VPR64:$Rn))), - (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>; -def : Pat<(v4i32 (sub - (v4i32 (bitconvert (v16i8 Neon_AllZero))), - (v4i32 VPR128:$Rn))), - (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>; -def : Pat<(v2i32 (sub - (v2i32 (bitconvert (v8i8 Neon_AllZero))), - (v2i32 VPR64:$Rn))), - (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>; -def : Pat<(v2i64 (sub - (v2i64 (bitconvert (v16i8 Neon_AllZero))), - (v2i64 VPR128:$Rn))), - (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>; - -multiclass NeonI_2VMisc_BHSDsize_2Args opcode> { - let Constraints = "$src = $Rd" in { - def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.4h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>; -defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>; - -multiclass NeonI_2VMisc_BHSD_2Args_Pattern { - def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))), - (v16i8 (!cast(Prefix # 16b) - (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>; - - def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))), - (v8i16 (!cast(Prefix # 8h) - (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>; - - def : Pat<(v4i32 (Neon_Op (v4i32 
VPR128:$src), (v4i32 VPR128:$Rn))), - (v4i32 (!cast(Prefix # 4s) - (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>; - - def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))), - (v2i64 (!cast(Prefix # 2d) - (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>; - - def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))), - (v8i8 (!cast(Prefix # 8b) - (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>; - - def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))), - (v4i16 (!cast(Prefix # 4h) - (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>; - - def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))), - (v2i32 (!cast(Prefix # 2s) - (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>; -} - -defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>; -defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>; - -multiclass NeonI_2VMisc_BHSsizes { - def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [(set (v16i8 VPR128:$Rd), - (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.8h", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [(set (v8i8 VPR64:$Rd), - (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.4h", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>; -defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>; - -multiclass NeonI_2VMisc_Bsize size, - bits<5> Opcode> { - def 16b : NeonI_2VMisc<0b1, U, size, Opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, size, Opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>; -defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>; -defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>; - -def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b", - (NOT16b VPR128:$Rd, VPR128:$Rn), 0>; -def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b", - (NOT8b VPR64:$Rd, VPR64:$Rn), 0>; - -def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))), - (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))), - (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>; - -def : Pat<(v16i8 (xor - (v16i8 VPR128:$Rn), - (v16i8 Neon_AllOne))), - (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (xor - (v8i8 VPR64:$Rn), - (v8i8 Neon_AllOne))), - (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>; -def 
: Pat<(v8i16 (xor - (v8i16 VPR128:$Rn), - (v8i16 (bitconvert (v16i8 Neon_AllOne))))), - (NOT16b VPR128:$Rn)>; -def : Pat<(v4i16 (xor - (v4i16 VPR64:$Rn), - (v4i16 (bitconvert (v8i8 Neon_AllOne))))), - (NOT8b VPR64:$Rn)>; -def : Pat<(v4i32 (xor - (v4i32 VPR128:$Rn), - (v4i32 (bitconvert (v16i8 Neon_AllOne))))), - (NOT16b VPR128:$Rn)>; -def : Pat<(v2i32 (xor - (v2i32 VPR64:$Rn), - (v2i32 (bitconvert (v8i8 Neon_AllOne))))), - (NOT8b VPR64:$Rn)>; -def : Pat<(v2i64 (xor - (v2i64 VPR128:$Rn), - (v2i64 (bitconvert (v16i8 Neon_AllOne))))), - (NOT16b VPR128:$Rn)>; - -def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))), - (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))), - (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>; - -multiclass NeonI_2VMisc_SDsizes opcode, - SDPatternOperator Neon_Op> { - def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (v4f32 VPR128:$Rd), - (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [(set (v2f64 VPR128:$Rd), - (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (v2f32 VPR64:$Rd), - (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>; -defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>; - -multiclass NeonI_2VMisc_HSD_Narrow opcode> { - def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8b, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - let Constraints = "$Rd = $src" in { - def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.16b, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.8h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>; -defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>; -defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>; -defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>; - -multiclass NeonI_2VMisc_Narrow_Patterns { - def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))), - (v8i8 (!cast(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>; - - def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))), - (v4i16 (!cast(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>; - - def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))), - (v2i32 (!cast(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>; - - def : Pat<(v16i8 (concat_vectors - (v8i8 VPR64:$src), - (v8i8 
(Neon_Op (v8i16 VPR128:$Rn))))), - (!cast(Prefix # 8h16b) - (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), - VPR128:$Rn)>; - - def : Pat<(v8i16 (concat_vectors - (v4i16 VPR64:$src), - (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))), - (!cast(Prefix # 4s8h) - (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), - VPR128:$Rn)>; - - def : Pat<(v4i32 (concat_vectors - (v2i32 VPR64:$src), - (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))), - (!cast(Prefix # 2d4s) - (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), - VPR128:$Rn)>; -} - -defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>; -defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>; -defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>; -defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>; - -multiclass NeonI_2VMisc_SHIFT opcode> { - let DecoderMethod = "DecodeSHLLInstruction" in { - def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR128:$Rd), - (ins VPR64:$Rn, uimm_exact8:$Imm), - asmop # "\t$Rd.8h, $Rn.8b, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR128:$Rd), - (ins VPR64:$Rn, uimm_exact16:$Imm), - asmop # "\t$Rd.4s, $Rn.4h, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR128:$Rd), - (ins VPR64:$Rn, uimm_exact32:$Imm), - asmop # "\t$Rd.2d, $Rn.2s, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), - (ins VPR128:$Rn, uimm_exact8:$Imm), - asmop # "2\t$Rd.8h, $Rn.16b, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), - (ins VPR128:$Rn, uimm_exact16:$Imm), - asmop # "2\t$Rd.4s, $Rn.8h, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), - (ins VPR128:$Rn, uimm_exact32:$Imm), - asmop # "2\t$Rd.2d, $Rn.4s, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - } -} - -defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>; - -class NeonI_SHLL_Patterns - : Pat<(DesTy (shl - (DesTy (ExtOp (OpTy VPR64:$Rn))), - (DesTy (Neon_vdup - (i32 Neon_Imm:$Imm))))), - (!cast("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>; - -class NeonI_SHLL_High_Patterns - : Pat<(DesTy (shl - (DesTy (ExtOp - (OpTy (GetHigh VPR128:$Rn)))), - (DesTy (Neon_vdup - (i32 Neon_Imm:$Imm))))), - (!cast("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>; - -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; - -multiclass NeonI_2VMisc_SD_Narrow opcode> { - def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - let Constraints = "$src = $Rd" in { - def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.8h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - 
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>; - -multiclass NeonI_2VMisc_Narrow_Pattern { - - def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))), - (!cast(prefix # "4s4h") (v4f32 VPR128:$Rn))>; - - def : Pat<(v8i16 (concat_vectors - (v4i16 VPR64:$src), - (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))), - (!cast(prefix # "4s8h") - (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), - (v4f32 VPR128:$Rn))>; - - def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))), - (!cast(prefix # "2d2s") (v2f64 VPR128:$Rn))>; - - def : Pat<(v4f32 (concat_vectors - (v2f32 VPR64:$src), - (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))), - (!cast(prefix # "2d4s") - (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), - (v2f64 VPR128:$Rn))>; -} - -defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>; - -multiclass NeonI_2VMisc_D_Narrow opcode> { - def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - } - - def : Pat<(v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))), - (!cast(prefix # "2d2s") VPR128:$Rn)>; - - def : Pat<(v4f32 (concat_vectors - (v2f32 VPR64:$src), - (v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))))), - (!cast(prefix # "2d4s") - (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), - VPR128:$Rn)>; -} - -defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>; - -def Neon_High4Float : PatFrag<(ops node:$in), - (extract_subvector (v4f32 node:$in), (iPTR 2))>; - -multiclass NeonI_2VMisc_HS_Extend opcode> { - def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4s, $Rn.4h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2d, $Rn.2s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "2\t$Rd.2d, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>; - -multiclass NeonI_2VMisc_Extend_Pattern { - def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))), - (!cast(prefix # "4h4s") VPR64:$Rn)>; - - def : Pat<(v4f32 (int_arm_neon_vcvthf2fp - (v4i16 (Neon_High8H - (v8i16 VPR128:$Rn))))), - (!cast(prefix # "8h4s") VPR128:$Rn)>; - - def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))), - (!cast(prefix # "2s2d") VPR64:$Rn)>; - - def : Pat<(v2f64 (fextend - (v2f32 (Neon_High4Float - (v4f32 VPR128:$Rn))))), - (!cast(prefix # "4s2d") VPR128:$Rn)>; -} - -defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">; - -multiclass NeonI_2VMisc_SD_Conv opcode, - ValueType ResTy4s, ValueType OpTy4s, - ValueType ResTy2d, ValueType OpTy2d, - ValueType ResTy2s, ValueType OpTy2s, - SDPatternOperator Neon_Op> { - - def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, - (outs VPR128:$Rd), (ins 
VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (ResTy4s VPR128:$Rd), - (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [(set (ResTy2d VPR128:$Rd), - (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (ResTy2s VPR64:$Rd), - (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -multiclass NeonI_2VMisc_fp_to_int opcode, SDPatternOperator Neon_Op> { - defm _ : NeonI_2VMisc_SD_Conv; -} - -defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010, - int_arm_neon_vcvtns>; -defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010, - int_arm_neon_vcvtnu>; -defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010, - int_arm_neon_vcvtps>; -defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010, - int_arm_neon_vcvtpu>; -defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011, - int_arm_neon_vcvtms>; -defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011, - int_arm_neon_vcvtmu>; -defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>; -defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>; -defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100, - int_arm_neon_vcvtas>; -defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100, - int_arm_neon_vcvtau>; - -multiclass NeonI_2VMisc_int_to_fp opcode, SDPatternOperator Neon_Op> { - defm _ : NeonI_2VMisc_SD_Conv; -} - -defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>; -defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>; - -multiclass NeonI_2VMisc_fp_to_fp opcode, SDPatternOperator Neon_Op> { - defm _ : NeonI_2VMisc_SD_Conv; -} - -defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000, - int_aarch64_neon_frintn>; -defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>; -defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>; -defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>; -defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>; -defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>; -defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>; -defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101, - int_arm_neon_vrecpe>; -defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101, - int_arm_neon_vrsqrte>; -let SchedRW = [WriteFPSqrt, ReadFPSqrt] in { -defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>; -} - -multiclass NeonI_2VMisc_S_Conv opcode, SDPatternOperator Neon_Op> { - def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100, - int_arm_neon_vrecpe>; -defm URSQRTE : 
NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100, - int_arm_neon_vrsqrte>; - -// Crypto Class -class NeonI_Cryptoaes_2v size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_AES, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>; -def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>; - -class NeonI_Cryptoaes size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_AES, - Sched<[WriteFPALU, ReadFPALU]>; - -def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>; -def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>; - -class NeonI_Cryptosha_vv size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_SHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1", - int_arm_neon_sha1su1>; -def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0", - int_arm_neon_sha256su0>; - -class NeonI_Cryptosha_ss size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_SHA, - Sched<[WriteFPALU, ReadFPALU]> { - let Predicates = [HasNEON, HasCrypto]; - let hasSideEffects = 0; -} - -def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>; -def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)), - (COPY_TO_REGCLASS (SHA1H (COPY_TO_REGCLASS i32:$Rn, FPR32)), GPR32)>; - - -class NeonI_Cryptosha3_vvv size, bits<3> opcode, string asmop, - SDPatternOperator opnode> - : NeonI_Crypto_3VSHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0", - int_arm_neon_sha1su0>; -def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1", - int_arm_neon_sha256su1>; - -class NeonI_Cryptosha3_qqv size, bits<3> opcode, string asmop, - SDPatternOperator opnode> - : NeonI_Crypto_3VSHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h", - int_arm_neon_sha256h>; -def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2", - int_arm_neon_sha256h2>; - -class NeonI_Cryptosha3_qsv size, bits<3> opcode, string asmop> - : NeonI_Crypto_3VSHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let hasSideEffects = 0; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c">; -def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p">; -def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m">; - -def : Pat<(int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk), - (SHA1C v4i32:$hash_abcd, - (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>; -def : Pat<(int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk), - (SHA1M v4i32:$hash_abcd, - (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>; -def : Pat<(int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk), - (SHA1P v4i32:$hash_abcd, - (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>; - -// Additional patterns to match shl to USHL. 
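The patterns that follow map IR shl, srl, and sra with a vector shift amount onto USHL/SSHL. Those instructions only shift left, so a right shift is expressed by negating the per-lane shift amounts first, as the comment among the patterns below notes. A minimal scalar model of that identity, for illustration only and not part of the patch:

#include <cassert>
#include <cstdint>

// Scalar model of one USHL lane: a positive amount shifts left, a negative
// amount shifts right, which is why srl is matched to USHL plus a NEG.
uint8_t ushlLane(uint8_t V, int8_t Shift) {
  return Shift >= 0 ? uint8_t(V << Shift) : uint8_t(V >> -Shift);
}

int main() {
  for (int S = 0; S < 8; ++S)
    assert(ushlLane(0xA5, int8_t(-S)) == uint8_t(0xA5u >> S)); // srl == ushl(neg)
  return 0;
}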
-def : Pat<(v8i8 (shl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (USHLvvv_8B $Rn, $Rm)>; -def : Pat<(v4i16 (shl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (USHLvvv_4H $Rn, $Rm)>; -def : Pat<(v2i32 (shl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (USHLvvv_2S $Rn, $Rm)>; -def : Pat<(v1i64 (shl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (USHLddd $Rn, $Rm)>; -def : Pat<(v16i8 (shl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (USHLvvv_16B $Rn, $Rm)>; -def : Pat<(v8i16 (shl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (USHLvvv_8H $Rn, $Rm)>; -def : Pat<(v4i32 (shl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (USHLvvv_4S $Rn, $Rm)>; -def : Pat<(v2i64 (shl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (USHLvvv_2D $Rn, $Rm)>; - -def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Additional patterns to match sra, srl. -// For a vector right shift by vector, the shift amounts of SSHL/USHL are -// negative. Negate the vector of shift amount first. -def : Pat<(v8i8 (srl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (USHLvvv_8B $Rn, (NEG8b $Rm))>; -def : Pat<(v4i16 (srl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (USHLvvv_4H $Rn, (NEG4h $Rm))>; -def : Pat<(v2i32 (srl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (USHLvvv_2S $Rn, (NEG2s $Rm))>; -def : Pat<(v1i64 (srl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (USHLddd $Rn, (NEGdd $Rm))>; -def : Pat<(v16i8 (srl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (USHLvvv_16B $Rn, (NEG16b $Rm))>; -def : Pat<(v8i16 (srl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (USHLvvv_8H $Rn, (NEG8h $Rm))>; -def : Pat<(v4i32 (srl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (USHLvvv_4S $Rn, (NEG4s $Rm))>; -def : Pat<(v2i64 (srl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (USHLvvv_2D $Rn, (NEG2d $Rm))>; - -def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))), - sub_8)>; -def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))), - sub_16)>; -def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))), - sub_32)>; - -def : Pat<(v8i8 (sra (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (SSHLvvv_8B $Rn, (NEG8b $Rm))>; -def : Pat<(v4i16 (sra (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (SSHLvvv_4H $Rn, (NEG4h $Rm))>; -def : Pat<(v2i32 (sra (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (SSHLvvv_2S $Rn, (NEG2s $Rm))>; -def : Pat<(v1i64 (sra (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (SSHLddd $Rn, (NEGdd $Rm))>; -def : Pat<(v16i8 (sra (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (SSHLvvv_16B $Rn, (NEG16b $Rm))>; -def : Pat<(v8i16 (sra (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (SSHLvvv_8H $Rn, (NEG8h $Rm))>; -def : Pat<(v4i32 (sra (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (SSHLvvv_4S $Rn, (NEG4s $Rm))>; -def 
: Pat<(v2i64 (sra (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (SSHLvvv_2D $Rn, (NEG2d $Rm))>; - -def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (EXTRACT_SUBREG - (SSHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))), - sub_8)>; -def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (EXTRACT_SUBREG - (SSHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))), - sub_16)>; -def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (SSHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))), - sub_32)>; - -// -// Patterns for handling half-precision values -// - -// Convert between f16 value and f32 value -def : Pat<(f32 (f16_to_f32 (i32 GPR32:$Rn))), - (FCVTsh (EXTRACT_SUBREG (FMOVsw $Rn), sub_16))>; -def : Pat<(i32 (f32_to_f16 (f32 FPR32:$Rn))), - (FMOVws (SUBREG_TO_REG (i64 0), (f16 (FCVThs $Rn)), sub_16))>; - -// Convert f16 value coming in as i16 value to f32 -def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))), - (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; -def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))), - (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; - -def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 ( - f32_to_f16 (f32 FPR32:$Rn))))))), - (f32 FPR32:$Rn)>; - -// Patterns for vector extract of half-precision FP value in i16 storage type -def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract - (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))), - (FCVTsh (f16 (DUPhv_H - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - neon_uimm2_bare:$Imm)))>; - -def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract - (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))), - (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>; - -// Patterns for vector insert of half-precision FP value 0 in i16 storage type -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), - (neon_uimm3_bare:$Imm))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), - sub_16)), - neon_uimm3_bare:$Imm, 0))>; - -def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), - (neon_uimm2_bare:$Imm))), - (v4i16 (EXTRACT_SUBREG - (v8i16 (INSELh - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), - sub_16)), - neon_uimm2_bare:$Imm, 0)), - sub_64))>; - -// Patterns for vector insert of half-precision FP value in i16 storage type -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (assertsext (i32 (fp_to_sint - (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), - (neon_uimm3_bare:$Imm))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), - sub_16)), - neon_uimm3_bare:$Imm, 0))>; - -def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), - (i32 (assertsext (i32 (fp_to_sint - (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), - (neon_uimm2_bare:$Imm))), - (v4i16 (EXTRACT_SUBREG - (v8i16 (INSELh - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), - sub_16)), - neon_uimm2_bare:$Imm, 
0)), - sub_64))>; - -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), - (neon_uimm3_bare:$Imm1))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), - neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; - -// Patterns for vector copy of half-precision FP value in i16 storage type -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 - (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), - 65535)))))))), - (neon_uimm3_bare:$Imm1))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), - neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; - -def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 - (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)), - 65535)))))))), - (neon_uimm3_bare:$Imm1))), - (v4i16 (EXTRACT_SUBREG - (v8i16 (INSELh - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)), - sub_64))>; - - diff --git a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp similarity index 77% rename from lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp rename to lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 9a8e1c3d91c0..3df9c4f23aa1 100644 --- a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1,4 +1,4 @@ -//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=// +//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "ARM64InstrInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" +#include "AArch64InstrInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -29,9 +29,9 @@ #include "llvm/ADT/Statistic.h" using namespace llvm; -#define DEBUG_TYPE "arm64-ldst-opt" +#define DEBUG_TYPE "aarch64-ldst-opt" -/// ARM64AllocLoadStoreOpt - Post-register allocation pass to combine +/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine /// load / store instructions to form ldp / stp instructions. 
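For context on what the pass described above does: two adjacent loads or stores with consecutive offsets from the same base register are rewritten as a single ldp/stp. The snippet below is an illustration only; the AArch64 assembly in the comments shows the expected shape of the transformation, not output quoted from this patch.

#include <cstdint>

// Two loads from consecutive 8-byte slots off the same base pointer are the
// canonical candidates this pass pairs into a single ldp.
uint64_t sumPair(const uint64_t *P) {
  uint64_t A = P[0]; // ldr x8, [x0]
  uint64_t B = P[1]; // ldr x9, [x0, #8]  --> combined into: ldp x8, x9, [x0]
  return A + B;
}

int main() {
  uint64_t Buf[2] = {1, 2};
  return sumPair(Buf) == 3 ? 0 : 1;
}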
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); @@ -40,39 +40,38 @@ STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); -static cl::opt ScanLimit("arm64-load-store-scan-limit", cl::init(20), - cl::Hidden); +static cl::opt ScanLimit("aarch64-load-store-scan-limit", + cl::init(20), cl::Hidden); // Place holder while testing unscaled load/store combining -static cl::opt -EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow ARM64 unscaled load/store combining"), - cl::init(true)); +static cl::opt EnableAArch64UnscaledMemOp( + "aarch64-unscaled-mem-op", cl::Hidden, + cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); namespace { -struct ARM64LoadStoreOpt : public MachineFunctionPass { +struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; - ARM64LoadStoreOpt() : MachineFunctionPass(ID) {} + AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} - const ARM64InstrInfo *TII; + const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, mergeForward is set to true if the + // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, + bool &MergeForward, unsigned Limit); // Merge the two instructions indicated into a single pair-wise instruction. - // If mergeForward is true, erase the first instruction and fold its + // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). 
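A simplified standalone model of the scan described in the comments above, using invented MemOp/findMatch names rather than the pass's real MachineInstr machinery; it captures only the "same base, consecutive offset, bounded scan" idea, not the MergeForward bookkeeping or unscaled-offset handling:

#include <cstddef>
#include <vector>

// Invented stand-in type for illustration; the real pass walks MachineInstrs.
struct MemOp { unsigned Base; int Offset; int Size; bool Paired = false; };

// Simplified model of findMatchingInsn: scan forward up to Limit entries
// looking for an access to the same base at the immediately following offset.
size_t findMatch(const std::vector<MemOp> &Ops, size_t I, unsigned Limit) {
  for (size_t J = I + 1; J < Ops.size() && J - I <= Limit; ++J)
    if (!Ops[J].Paired && Ops[J].Base == Ops[I].Base &&
        Ops[J].Size == Ops[I].Size &&
        Ops[J].Offset == Ops[I].Offset + Ops[I].Size)
      return J;
  return Ops.size(); // no match, analogous to returning MBB->end()
}

int main() {
  std::vector<MemOp> Ops = {{0, 0, 8}, {1, 16, 8}, {0, 8, 8}};
  return findMatch(Ops, 0, 20) == 2 ? 0 : 1; // ops 0 and 2 can form an ldp/stp
}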
MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool mergeForward); + MachineBasicBlock::iterator Paired, bool MergeForward); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -102,76 +101,76 @@ struct ARM64LoadStoreOpt : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM64 load / store optimization pass"; + return "AArch64 load / store optimization pass"; } private: int getMemSize(MachineInstr *MemMI); }; -char ARM64LoadStoreOpt::ID = 0; +char AArch64LoadStoreOpt::ID = 0; } static bool isUnscaledLdst(unsigned Opc) { switch (Opc) { default: return false; - case ARM64::STURSi: + case AArch64::STURSi: return true; - case ARM64::STURDi: + case AArch64::STURDi: return true; - case ARM64::STURQi: + case AArch64::STURQi: return true; - case ARM64::STURWi: + case AArch64::STURWi: return true; - case ARM64::STURXi: + case AArch64::STURXi: return true; - case ARM64::LDURSi: + case AArch64::LDURSi: return true; - case ARM64::LDURDi: + case AArch64::LDURDi: return true; - case ARM64::LDURQi: + case AArch64::LDURQi: return true; - case ARM64::LDURWi: + case AArch64::LDURWi: return true; - case ARM64::LDURXi: + case AArch64::LDURXi: return true; } } // Size in bytes of the data moved by an unscaled load or store -int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { +int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { switch (MemMI->getOpcode()) { default: - llvm_unreachable("Opcode has has unknown size!"); - case ARM64::STRSui: - case ARM64::STURSi: + llvm_unreachable("Opcode has unknown size!"); + case AArch64::STRSui: + case AArch64::STURSi: return 4; - case ARM64::STRDui: - case ARM64::STURDi: + case AArch64::STRDui: + case AArch64::STURDi: return 8; - case ARM64::STRQui: - case ARM64::STURQi: + case AArch64::STRQui: + case AArch64::STURQi: return 16; - case ARM64::STRWui: - case ARM64::STURWi: + case AArch64::STRWui: + case AArch64::STURWi: return 4; - case ARM64::STRXui: - case ARM64::STURXi: + case AArch64::STRXui: + case AArch64::STURXi: return 8; - case ARM64::LDRSui: - case ARM64::LDURSi: + case AArch64::LDRSui: + case AArch64::LDURSi: return 4; - case ARM64::LDRDui: - case ARM64::LDURDi: + case AArch64::LDRDui: + case AArch64::LDURDi: return 8; - case ARM64::LDRQui: - case ARM64::LDURQi: + case AArch64::LDRQui: + case AArch64::LDURQi: return 16; - case ARM64::LDRWui: - case ARM64::LDURWi: + case AArch64::LDRWui: + case AArch64::LDURWi: return 4; - case ARM64::LDRXui: - case ARM64::LDURXi: + case AArch64::LDRXui: + case AArch64::LDURXi: return 8; } } @@ -180,36 +179,36 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no pairwise equivalent!"); - case ARM64::STRSui: - case ARM64::STURSi: - return ARM64::STPSi; - case ARM64::STRDui: - case ARM64::STURDi: - return ARM64::STPDi; - case ARM64::STRQui: - case ARM64::STURQi: - return ARM64::STPQi; - case ARM64::STRWui: - case ARM64::STURWi: - return ARM64::STPWi; - case ARM64::STRXui: - case ARM64::STURXi: - return ARM64::STPXi; - case ARM64::LDRSui: - case ARM64::LDURSi: - return ARM64::LDPSi; - case ARM64::LDRDui: - case ARM64::LDURDi: - return ARM64::LDPDi; - case ARM64::LDRQui: - case ARM64::LDURQi: - return ARM64::LDPQi; - case ARM64::LDRWui: - case ARM64::LDURWi: - return ARM64::LDPWi; - case ARM64::LDRXui: - case ARM64::LDURXi: - 
return ARM64::LDPXi; + case AArch64::STRSui: + case AArch64::STURSi: + return AArch64::STPSi; + case AArch64::STRDui: + case AArch64::STURDi: + return AArch64::STPDi; + case AArch64::STRQui: + case AArch64::STURQi: + return AArch64::STPQi; + case AArch64::STRWui: + case AArch64::STURWi: + return AArch64::STPWi; + case AArch64::STRXui: + case AArch64::STURXi: + return AArch64::STPXi; + case AArch64::LDRSui: + case AArch64::LDURSi: + return AArch64::LDPSi; + case AArch64::LDRDui: + case AArch64::LDURDi: + return AArch64::LDPDi; + case AArch64::LDRQui: + case AArch64::LDURQi: + return AArch64::LDPQi; + case AArch64::LDRWui: + case AArch64::LDURWi: + return AArch64::LDPWi; + case AArch64::LDRXui: + case AArch64::LDURXi: + return AArch64::LDPXi; } } @@ -217,16 +216,26 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no pre-indexed equivalent!"); - case ARM64::STRSui: return ARM64::STRSpre; - case ARM64::STRDui: return ARM64::STRDpre; - case ARM64::STRQui: return ARM64::STRQpre; - case ARM64::STRWui: return ARM64::STRWpre; - case ARM64::STRXui: return ARM64::STRXpre; - case ARM64::LDRSui: return ARM64::LDRSpre; - case ARM64::LDRDui: return ARM64::LDRDpre; - case ARM64::LDRQui: return ARM64::LDRQpre; - case ARM64::LDRWui: return ARM64::LDRWpre; - case ARM64::LDRXui: return ARM64::LDRXpre; + case AArch64::STRSui: + return AArch64::STRSpre; + case AArch64::STRDui: + return AArch64::STRDpre; + case AArch64::STRQui: + return AArch64::STRQpre; + case AArch64::STRWui: + return AArch64::STRWpre; + case AArch64::STRXui: + return AArch64::STRXpre; + case AArch64::LDRSui: + return AArch64::LDRSpre; + case AArch64::LDRDui: + return AArch64::LDRDpre; + case AArch64::LDRQui: + return AArch64::LDRQpre; + case AArch64::LDRWui: + return AArch64::LDRWpre; + case AArch64::LDRXui: + return AArch64::LDRXpre; } } @@ -234,33 +243,33 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no post-indexed wise equivalent!"); - case ARM64::STRSui: - return ARM64::STRSpost; - case ARM64::STRDui: - return ARM64::STRDpost; - case ARM64::STRQui: - return ARM64::STRQpost; - case ARM64::STRWui: - return ARM64::STRWpost; - case ARM64::STRXui: - return ARM64::STRXpost; - case ARM64::LDRSui: - return ARM64::LDRSpost; - case ARM64::LDRDui: - return ARM64::LDRDpost; - case ARM64::LDRQui: - return ARM64::LDRQpost; - case ARM64::LDRWui: - return ARM64::LDRWpost; - case ARM64::LDRXui: - return ARM64::LDRXpost; + case AArch64::STRSui: + return AArch64::STRSpost; + case AArch64::STRDui: + return AArch64::STRDpost; + case AArch64::STRQui: + return AArch64::STRQpost; + case AArch64::STRWui: + return AArch64::STRWpost; + case AArch64::STRXui: + return AArch64::STRXpost; + case AArch64::LDRSui: + return AArch64::LDRSpost; + case AArch64::LDRDui: + return AArch64::LDRDpost; + case AArch64::LDRQui: + return AArch64::LDRQpost; + case AArch64::LDRWui: + return AArch64::LDRWpost; + case AArch64::LDRXui: + return AArch64::LDRXpost; } } MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - bool mergeForward) { +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + bool MergeForward) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -271,16 +280,17 @@ ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, ++NextI; bool 
IsUnscaled = isUnscaledLdst(I->getOpcode()); - int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1; + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); // Insert our new paired instruction after whichever of the paired - // instructions mergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; - // Also based on mergeForward is from where we copy the base register operand + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. MachineOperand &BaseRegOp = - mergeForward ? Paired->getOperand(1) : I->getOperand(1); + MergeForward ? Paired->getOperand(1) : I->getOperand(1); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; @@ -294,7 +304,7 @@ ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, } // Handle Unscaled int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableARM64UnscaledMemOp) + if (IsUnscaled && EnableAArch64UnscaledMemOp) OffsetImm /= OffsetStride; // Construct the new instruction. @@ -354,8 +364,8 @@ static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { if (IsUnscaled) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. - int elemOffset = Offset / OffsetStride; - if (elemOffset > 63 || elemOffset < -64) + int ElemOffset = Offset / OffsetStride; + if (ElemOffset > 63 || ElemOffset < -64) return false; } return true; @@ -372,15 +382,15 @@ static int alignTo(int Num, int PowOf2) { /// findMatchingInsn - Scan the instructions looking for a load/store that can /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, unsigned Limit) { +AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, + bool &MergeForward, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; ++MBBI; int Opc = FirstMI->getOpcode(); - bool mayLoad = FirstMI->mayLoad(); + bool MayLoad = FirstMI->mayLoad(); bool IsUnscaled = isUnscaledLdst(Opc); unsigned Reg = FirstMI->getOperand(0).getReg(); unsigned BaseReg = FirstMI->getOperand(1).getReg(); @@ -394,7 +404,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, if (FirstMI->modifiesRegister(BaseReg, TRI)) return E; int OffsetStride = - IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(FirstMI) : 1; + IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return E; @@ -444,7 +454,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the alignment requirements of the paired (scaled) instruction // can't express the offset of the unscaled input, bail and keep // looking. 
- if (IsUnscaled && EnableARM64UnscaledMemOp && + if (IsUnscaled && EnableAArch64UnscaledMemOp && (alignTo(MinOffset, OffsetStride) != MinOffset)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); continue; @@ -452,7 +462,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - if (mayLoad && Reg == MI->getOperand(0).getReg()) { + if (MayLoad && Reg == MI->getOperand(0).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); continue; } @@ -461,7 +471,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // the two instructions, we can combine the second into the first. if (!ModifiedRegs[MI->getOperand(0).getReg()] && !UsedRegs[MI->getOperand(0).getReg()]) { - mergeForward = false; + MergeForward = false; return MBBI; } @@ -470,7 +480,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // second. if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && !UsedRegs[FirstMI->getOperand(0).getReg()]) { - mergeForward = true; + MergeForward = true; return MBBI; } // Unable to combine these instructions due to interference in between. @@ -507,10 +517,10 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && +AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); MachineBasicBlock::iterator NextI = I; // Return the instruction following the merged instruction, which is @@ -520,14 +530,15 @@ ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, ++NextI; int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) + if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addImm(Value); @@ -549,11 +560,10 @@ ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && +MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); MachineBasicBlock::iterator NextI = I; // Return the instruction following the merged instruction, which is @@ -563,14 +573,15 @@ 
ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, ++NextI; int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && "Can't merge 1 << 12 offset into post-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) + if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addImm(Value); @@ -597,17 +608,17 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, switch (MI->getOpcode()) { default: break; - case ARM64::SUBXri: + case AArch64::SUBXri: // Negate the offset for a SUB instruction. Offset *= -1; // FALLTHROUGH - case ARM64::ADDXri: + case AArch64::ADDXri: // Make sure it's a vanilla immediate operand, not a relocation or // anything else we can't handle. if (!MI->getOperand(2).isImm()) break; // Watch out for 1 << 12 shifted value. - if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm())) + if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; // If the instruction has the base register as source and dest and the // immediate will fit in a signed 9-bit integer, then we have a match. @@ -625,9 +636,8 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, return false; } -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, - unsigned Limit, int Value) { +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( + MachineBasicBlock::iterator I, unsigned Limit, int Value) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; @@ -680,9 +690,8 @@ ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, return E; } -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, - unsigned Limit) { +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( + MachineBasicBlock::iterator I, unsigned Limit) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; @@ -734,7 +743,7 @@ ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, return E; } -bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { bool Modified = false; // Two tranformations to do here: // 1) Find loads and stores that can be merged into a single load or store @@ -760,27 +769,27 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. 
++MBBI; break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: // do the unscaled versions as well - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { // If this is a volatile load/store, don't mess with it. if (MI->hasOrderedMemoryRef()) { ++MBBI; @@ -792,20 +801,20 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { break; } // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the ARM64StorePairSuppress pass. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. if (TII->isLdStPairSuppressed(MI)) { ++MBBI; break; } // Look ahead up to ScanLimit instructions for a pairable instruction. - bool mergeForward = false; + bool MergeForward = false; MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, mergeForward, ScanLimit); + findMatchingInsn(MBBI, MergeForward, ScanLimit); if (Paired != E) { // Merge the loads into a pair. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, mergeForward); + MBBI = mergePairedInsns(MBBI, Paired, MergeForward); Modified = true; ++NumPairCreated; @@ -831,27 +840,27 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: // do the unscaled versions as well - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { // Make sure this is a reg+imm (as opposed to an address reloc). 
if (!MI->getOperand(2).isImm()) { ++MBBI; @@ -920,9 +929,9 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { return Modified; } -bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { +bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { const TargetMachine &TM = Fn.getTarget(); - TII = static_cast(TM.getInstrInfo()); + TII = static_cast(TM.getInstrInfo()); TRI = TM.getRegisterInfo(); bool Modified = false; @@ -937,6 +946,6 @@ bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { /// createARMLoadStoreOptimizationPass - returns an instance of the load / store /// optimization pass. -FunctionPass *llvm::createARM64LoadStoreOptimizationPass() { - return new ARM64LoadStoreOpt(); +FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { + return new AArch64LoadStoreOpt(); } diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 103aeb00d873..75a17b9dc999 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -1,4 +1,4 @@ -//===-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst -==// +//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==// // // The LLVM Compiler Infrastructure // @@ -12,146 +12,191 @@ // //===----------------------------------------------------------------------===// -#include "AArch64AsmPrinter.h" -#include "AArch64TargetMachine.h" +#include "AArch64MCInstLower.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Mangler.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" - +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; -MCOperand -AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO, - const MCSymbol *Sym) const { - const MCExpr *Expr = nullptr; +AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang, + AsmPrinter &printer) + : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} - Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, OutContext); +MCSymbol * +AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { + return Printer.getSymbol(MO.getGlobal()); +} - switch (MO.getTargetFlags()) { - case AArch64II::MO_GOT: - Expr = AArch64MCExpr::CreateGOT(Expr, OutContext); - break; - case AArch64II::MO_GOT_LO12: - Expr = AArch64MCExpr::CreateGOTLo12(Expr, OutContext); - break; - case AArch64II::MO_LO12: - Expr = AArch64MCExpr::CreateLo12(Expr, OutContext); - break; - case AArch64II::MO_DTPREL_G1: - Expr = AArch64MCExpr::CreateDTPREL_G1(Expr, OutContext); - break; - case AArch64II::MO_DTPREL_G0_NC: - Expr = AArch64MCExpr::CreateDTPREL_G0_NC(Expr, OutContext); - break; - case AArch64II::MO_GOTTPREL: - Expr = AArch64MCExpr::CreateGOTTPREL(Expr, OutContext); - break; - case AArch64II::MO_GOTTPREL_LO12: - Expr = AArch64MCExpr::CreateGOTTPRELLo12(Expr, OutContext); - break; - case AArch64II::MO_TLSDESC: - Expr = AArch64MCExpr::CreateTLSDesc(Expr, OutContext); - break; - case AArch64II::MO_TLSDESC_LO12: - Expr = AArch64MCExpr::CreateTLSDescLo12(Expr, OutContext); - break; - case AArch64II::MO_TPREL_G1: - Expr = AArch64MCExpr::CreateTPREL_G1(Expr, 
OutContext); - break; - case AArch64II::MO_TPREL_G0_NC: - Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext); - break; - case AArch64II::MO_ABS_G3: - Expr = AArch64MCExpr::CreateABS_G3(Expr, OutContext); - break; - case AArch64II::MO_ABS_G2_NC: - Expr = AArch64MCExpr::CreateABS_G2_NC(Expr, OutContext); - break; - case AArch64II::MO_ABS_G1_NC: - Expr = AArch64MCExpr::CreateABS_G1_NC(Expr, OutContext); - break; - case AArch64II::MO_ABS_G0_NC: - Expr = AArch64MCExpr::CreateABS_G0_NC(Expr, OutContext); - break; - case AArch64II::MO_NO_FLAG: - // Expr is already correct - break; - default: - llvm_unreachable("Unexpected MachineOperand flag"); +MCSymbol * +AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { + return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_GOTPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_GOT on GV operand"); + } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_TLVPPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); + } else { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_PAGEOFF; } + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + return MCOperand::CreateExpr(Expr); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const { + uint32_t RefFlags = 0; + if (MO.getTargetFlags() & AArch64II::MO_GOT) + RefFlags |= AArch64MCExpr::VK_GOT; + else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + TLSModel::Model Model; + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + Model = Printer.TM.getTLSModel(GV); + } else { + assert(MO.isSymbol() && + StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && + "unexpected external TLS symbol"); + Model = TLSModel::GeneralDynamic; + } + switch (Model) { + case TLSModel::InitialExec: + RefFlags |= AArch64MCExpr::VK_GOTTPREL; + break; + case TLSModel::LocalExec: + RefFlags |= AArch64MCExpr::VK_TPREL; + break; + case TLSModel::LocalDynamic: + RefFlags |= AArch64MCExpr::VK_DTPREL; + break; + case TLSModel::GeneralDynamic: + RefFlags |= AArch64MCExpr::VK_TLSDESC; + break; + } + } else { + // No modifier means this is a generic reference, classified as absolute for + // the cases where it matters (:abs_g0: etc). 
+ RefFlags |= AArch64MCExpr::VK_ABS; + } + + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefFlags |= AArch64MCExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefFlags |= AArch64MCExpr::VK_PAGEOFF; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) + RefFlags |= AArch64MCExpr::VK_G3; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2) + RefFlags |= AArch64MCExpr::VK_G2; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1) + RefFlags |= AArch64MCExpr::VK_G1; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) + RefFlags |= AArch64MCExpr::VK_G0; + + if (MO.getTargetFlags() & AArch64II::MO_NC) + RefFlags |= AArch64MCExpr::VK_NC; + + const MCExpr *Expr = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd(Expr, - MCConstantExpr::Create(MO.getOffset(), - OutContext), - OutContext); + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + + AArch64MCExpr::VariantKind RefKind; + RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags); + Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx); return MCOperand::CreateExpr(Expr); } -bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, - MCOperand &MCOp) const { +MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + if (TargetTriple.isOSDarwin()) + return lowerSymbolOperandDarwin(MO, Sym); + + assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); + return lowerSymbolOperandELF(MO, Sym); +} + +bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { switch (MO.getType()) { - default: llvm_unreachable("unknown operand type"); + default: + llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: + // Ignore all implicit register operands. if (MO.isImplicit()) return false; - assert(!MO.getSubReg() && "Subregs should be eliminated!"); MCOp = MCOperand::CreateReg(MO.getReg()); break; + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs.
+ return false; case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; - case MachineOperand::MO_FPImmediate: { - assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported"); - MCOp = MCOperand::CreateFPImm(0.0); - break; - } - case MachineOperand::MO_BlockAddress: - MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress())); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName())); + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr( + MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); break; case MachineOperand::MO_GlobalAddress: - MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal())); + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), OutContext)); + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); break; case MachineOperand::MO_JumpTableIndex: - MCOp = lowerSymbolOperand(MO, GetJTISymbol(MO.getIndex())); + MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); break; case MachineOperand::MO_ConstantPoolIndex: - MCOp = lowerSymbolOperand(MO, GetCPISymbol(MO.getIndex())); + MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand( + MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); break; - case MachineOperand::MO_RegisterMask: - // Ignore call clobbers - return false; - } - return true; } -void llvm::LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, - MCInst &OutMI, - AArch64AsmPrinter &AP) { +void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - MCOperand MCOp; - if (AP.lowerOperand(MO, MCOp)) + if (lowerOperand(MI->getOperand(i), MCOp)) OutMI.addOperand(MCOp); } } diff --git a/lib/Target/ARM64/ARM64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h similarity index 78% rename from lib/Target/ARM64/ARM64MCInstLower.h rename to lib/Target/AArch64/AArch64MCInstLower.h index 7e3a2c8e54f2..ba50ba9e2fe5 100644 --- a/lib/Target/ARM64/ARM64MCInstLower.h +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -1,4 +1,4 @@ -//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===// +//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef ARM64_MCINSTLOWER_H -#define ARM64_MCINSTLOWER_H +#ifndef AArch64_MCINSTLOWER_H +#define AArch64_MCINSTLOWER_H #include "llvm/ADT/Triple.h" #include "llvm/Support/Compiler.h" @@ -25,15 +25,15 @@ class MachineModuleInfoMachO; class MachineOperand; class Mangler; -/// ARM64MCInstLower - This class is used to lower an MachineInstr +/// AArch64MCInstLower - This class is used to lower an MachineInstr /// into an MCInst. 
-class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower { +class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { MCContext &Ctx; AsmPrinter &Printer; Triple TargetTriple; public: - ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); + AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; void Lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp deleted file mode 100644 index f45d8f784f42..000000000000 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- AArch64MachineFuctionInfo.cpp - AArch64 machine function info -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file just contains the anchor for the AArch64MachineFunctionInfo to -// force vtable emission. -// -//===----------------------------------------------------------------------===// -#include "AArch64MachineFunctionInfo.h" - -using namespace llvm; - -void AArch64MachineFunctionInfo::anchor() { } diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 33da54f97fda..7c257ba9116f 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//=- AArch64MachineFuctionInfo.h - AArch64 machine function info -*- C++ -*-==// +//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,19 @@ // //===----------------------------------------------------------------------===// -#ifndef AARCH64MACHINEFUNCTIONINFO_H -#define AARCH64MACHINEFUNCTIONINFO_H +#ifndef AArch64MACHINEFUNCTIONINFO_H +#define AArch64MACHINEFUNCTIONINFO_H +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" namespace llvm { -/// This class is derived from MachineFunctionInfo and contains private AArch64 -/// target-specific information for each MachineFunction. -class AArch64MachineFunctionInfo : public MachineFunctionInfo { - virtual void anchor(); +/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private AArch64-specific information for each MachineFunction. +class AArch64FunctionInfo : public MachineFunctionInfo { /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, @@ -39,111 +41,123 @@ class AArch64MachineFunctionInfo : public MachineFunctionInfo { /// callee is expected to pop the args. unsigned ArgumentStackToRestore; - /// If the stack needs to be adjusted on frame entry in two stages, this - /// records the size of the first adjustment just prior to storing - /// callee-saved registers. The callee-saved slots are addressed assuming - /// SP == - InitialStackAdjust. - unsigned InitialStackAdjust; + /// HasStackFrame - True if this function has a stack frame. Set by + /// processFunctionBeforeCalleeSavedScan(). + bool HasStackFrame; - /// Number of local-dynamic TLS accesses. 
- unsigned NumLocalDynamics; + /// \brief Amount of stack frame size, not including callee-saved registers. + unsigned LocalStackSize; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The Frame index of the area where LowerFormalArguments puts the - /// general-purpose registers that might contain variadic parameters. - int VariadicGPRIdx; + /// \brief Number of TLS accesses using the special (combinable) + /// _TLS_MODULE_BASE_ symbol. + unsigned NumLocalDynamicTLSAccesses; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The size of the frame object used to store the general-purpose registers - /// which might contain variadic arguments. This is the offset from - /// VariadicGPRIdx to what's stored in __gr_top. - unsigned VariadicGPRSize; + /// \brief FrameIndex for start of varargs area for arguments passed on the + /// stack. + int VarArgsStackIndex; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The Frame index of the area where LowerFormalArguments puts the - /// floating-point registers that might contain variadic parameters. - int VariadicFPRIdx; + /// \brief FrameIndex for start of varargs area for arguments passed in + /// general purpose registers. + int VarArgsGPRIndex; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The size of the frame object used to store the floating-point registers - /// which might contain variadic arguments. This is the offset from - /// VariadicFPRIdx to what's stored in __vr_top. - unsigned VariadicFPRSize; + /// \brief Size of the varargs area for arguments passed in general purpose + /// registers. + unsigned VarArgsGPRSize; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The Frame index of an object pointing just past the last known stacked - /// argument on entry to a variadic function. This goes into the __stack field - /// of the va_list type. - int VariadicStackIdx; + /// \brief FrameIndex for start of varargs area for arguments passed in + /// floating-point registers. + int VarArgsFPRIndex; - /// The offset of the frame pointer from the stack pointer on function - /// entry. This is expected to be negative. - int FramePointerOffset; + /// \brief Size of the varargs area for arguments passed in floating-point + /// registers. 
+ unsigned VarArgsFPRSize; public: - AArch64MachineFunctionInfo() - : BytesInStackArgArea(0), - ArgumentStackToRestore(0), - InitialStackAdjust(0), - NumLocalDynamics(0), - VariadicGPRIdx(0), - VariadicGPRSize(0), - VariadicFPRIdx(0), - VariadicFPRSize(0), - VariadicStackIdx(0), - FramePointerOffset(0) {} - - explicit AArch64MachineFunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), - ArgumentStackToRestore(0), - InitialStackAdjust(0), - NumLocalDynamics(0), - VariadicGPRIdx(0), - VariadicGPRSize(0), - VariadicFPRIdx(0), - VariadicFPRSize(0), - VariadicStackIdx(0), - FramePointerOffset(0) {} + AArch64FunctionInfo() + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + + explicit AArch64FunctionInfo(MachineFunction &MF) + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + (void)MF; + } unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } - void setBytesInStackArgArea (unsigned bytes) { BytesInStackArgArea = bytes;} + void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } void setArgumentStackToRestore(unsigned bytes) { ArgumentStackToRestore = bytes; } - unsigned getInitialStackAdjust() const { return InitialStackAdjust; } - void setInitialStackAdjust(unsigned bytes) { InitialStackAdjust = bytes; } + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } - unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } - void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } + unsigned getLocalStackSize() const { return LocalStackSize; } - int getVariadicGPRIdx() const { return VariadicGPRIdx; } - void setVariadicGPRIdx(int Idx) { VariadicGPRIdx = Idx; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } + unsigned getNumLocalDynamicTLSAccesses() const { + return NumLocalDynamicTLSAccesses; + } - unsigned getVariadicGPRSize() const { return VariadicGPRSize; } - void setVariadicGPRSize(unsigned Size) { VariadicGPRSize = Size; } + int getVarArgsStackIndex() const { return VarArgsStackIndex; } + void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } - int getVariadicFPRIdx() const { return VariadicFPRIdx; } - void setVariadicFPRIdx(int Idx) { VariadicFPRIdx = Idx; } + int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } + void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } - unsigned getVariadicFPRSize() const { return VariadicFPRSize; } - void setVariadicFPRSize(unsigned Size) { VariadicFPRSize = Size; } + unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } + void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } - int getVariadicStackIdx() const { return VariadicStackIdx; } - void setVariadicStackIdx(int Idx) { VariadicStackIdx = Idx; } + int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } + void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } - int getFramePointerOffset() const { return FramePointerOffset; } - void setFramePointerOffset(int Idx) { FramePointerOffset = Idx; } + unsigned getVarArgsFPRSize() const { return 
VarArgsFPRSize; } + void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } -}; + typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions; + + const SetOfInstructions &getLOHRelated() const { return LOHRelated; } + + // Shortcuts for LOH related types. + class MILOHDirective { + MCLOHType Kind; + /// Arguments of this directive. Order matters. + SmallVector<const MachineInstr *, 3> Args; + + public: + typedef SmallVectorImpl<const MachineInstr *> LOHArgs; + + MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + : Kind(Kind), Args(Args.begin(), Args.end()) { + assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); + } + + MCLOHType getKind() const { return Kind; } + const LOHArgs &getArgs() const { return Args; } + }; + + typedef MILOHDirective::LOHArgs MILOHArgs; + typedef SmallVector<MILOHDirective, 32> MILOHContainer; + + const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } + + /// Add a LOH directive of this @p Kind and this @p Args. + void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + LOHContainerSet.push_back(MILOHDirective(Kind, Args)); + LOHRelated.insert(Args.begin(), Args.end()); + } + +private: + // Hold the lists of LOHs. + MILOHContainer LOHContainerSet; + SetOfInstructions LOHRelated; +}; } // End llvm namespace -#endif +#endif // AArch64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/ARM64/ARM64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h similarity index 99% rename from lib/Target/ARM64/ARM64PerfectShuffle.h rename to lib/Target/AArch64/AArch64PerfectShuffle.h index 6759236fd143..b22fa2424d5c 100644 --- a/lib/Target/ARM64/ARM64PerfectShuffle.h +++ b/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -1,4 +1,4 @@ -//===-- ARM64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -------------===// +//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/ARM64/ARM64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp similarity index 92% rename from lib/Target/ARM64/ARM64PromoteConstant.cpp rename to lib/Target/AArch64/AArch64PromoteConstant.cpp index e61a62262d39..4723cc4978e5 100644 --- a/lib/Target/ARM64/ARM64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -1,5 +1,4 @@ - -//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===// +//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==// // // The LLVM Compiler Infrastructure // @@ -8,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements the ARM64PromoteConstant pass which promotes constants +// This file implements the AArch64PromoteConstant pass which promotes constants // to global variables when this is likely to be more efficient. Currently only // types related to constant vector (i.e., constant vector, array of constant // vectors, constant structure with a constant vector field, etc.) are promoted @@ -21,7 +20,7 @@ // FIXME: This pass may be useful for other targets too. //===----------------------------------------------------------------------===// -#include "ARM64.h" +#include "AArch64.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" @@ -41,17 +40,17 @@ using namespace llvm; -#define DEBUG_TYPE "arm64-promote-const" +#define DEBUG_TYPE "aarch64-promote-const" // Stress testing mode - disable heuristics.
-static cl::opt<bool> Stress("arm64-stress-promote-const", cl::Hidden, +static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden, cl::desc("Promote all vector constants")); STATISTIC(NumPromoted, "Number of promoted constants"); STATISTIC(NumPromotedUses, "Number of promoted constants uses"); //===----------------------------------------------------------------------===// -// ARM64PromoteConstant +// AArch64PromoteConstant //===----------------------------------------------------------------------===// namespace { @@ -81,13 +80,13 @@ namespace { /// /// Therefore the final assembly final has 4 different loads. With this pass /// enabled, only one load is issued for the constants. -class ARM64PromoteConstant : public ModulePass { +class AArch64PromoteConstant : public ModulePass { public: static char ID; - ARM64PromoteConstant() : ModulePass(ID) {} + AArch64PromoteConstant() : ModulePass(ID) {} - const char *getPassName() const override { return "ARM64 Promote Constant"; } + const char *getPassName() const override { return "AArch64 Promote Constant"; } /// Iterate over the functions and promote the interesting constants into /// global variables with module scope. @@ -202,20 +201,20 @@ class ARM64PromoteConstant : public ModulePass { }; } // end anonymous namespace -char ARM64PromoteConstant::ID = 0; +char AArch64PromoteConstant::ID = 0; namespace llvm { -void initializeARM64PromoteConstantPass(PassRegistry &); +void initializeAArch64PromoteConstantPass(PassRegistry &); } -INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const", - "ARM64 Promote Constant Pass", false, false) +INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const", - "AArch64 Promote Constant Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const", - "ARM64 Promote Constant Pass", false, false) +INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const", + "AArch64 Promote Constant Pass", false, false) -ModulePass *llvm::createARM64PromoteConstantPass() { - return new ARM64PromoteConstant(); +ModulePass *llvm::createAArch64PromoteConstantPass() { + return new AArch64PromoteConstant(); } /// Check if the given type uses a vector type. @@ -330,7 +329,7 @@ static bool shouldConvert(const Constant *Cst) { } Instruction * -ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) { // If this user is a phi, the insertion point is in the related // incoming basic block.
PHINode *PhiInst = dyn_cast<PHINode>(*Use); @@ -344,9 +343,9 @@ ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) { return InsertionPoint; } -bool ARM64PromoteConstant::isDominated(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints &InsertPts) { +bool AArch64PromoteConstant::isDominated(Instruction *NewPt, + Value::user_iterator &UseIt, + InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); @@ -371,9 +370,9 @@ bool ARM64PromoteConstant::isDominated(Instruction *NewPt, return false; } -bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints &InsertPts) { +bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, + Value::user_iterator &UseIt, + InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); BasicBlock *NewBB = NewPt->getParent(); @@ -422,7 +421,7 @@ bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt, return false; } -void ARM64PromoteConstant::computeInsertionPoints( +void AArch64PromoteConstant::computeInsertionPoints( Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { DEBUG(dbgs() << "** Compute insertion points **\n"); for (Value::user_iterator UseIt = Val->user_begin(), @@ -464,9 +463,8 @@ void ARM64PromoteConstant::computeInsertionPoints( } } -bool -ARM64PromoteConstant::insertDefinitions(Constant *Cst, - InsertionPointsPerFunc &InsPtsPerFunc) { +bool AArch64PromoteConstant::insertDefinitions( + Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) { // We will create one global variable per Module. DenseMap<Module *, GlobalVariable *> ModuleToMergedGV; bool HasChanged = false; @@ -533,13 +531,13 @@ ARM64PromoteConstant::insertDefinitions(Constant *Cst, return HasChanged; } -bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { +bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { InsertionPointsPerFunc InsertPtsPerFunc; computeInsertionPoints(Val, InsertPtsPerFunc); return insertDefinitions(Val, InsertPtsPerFunc); } -bool ARM64PromoteConstant::promoteConstant(Constant *Cst) { +bool AArch64PromoteConstant::promoteConstant(Constant *Cst) { assert(Cst && "Given variable is not a valid constant."); if (!shouldConvert(Cst)) @@ -553,7 +551,7 @@ bool ARM64PromoteConstant::promoteConstant(Constant *Cst) { return computeAndInsertDefinitions(Cst); } -bool ARM64PromoteConstant::runOnFunction(Function &F) { +bool AArch64PromoteConstant::runOnFunction(Function &F) { // Look for instructions using constant vector. Promote that constant to a // global variable. Create as few loads of this variable as possible and // update the uses accordingly.
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5382effd7bb9..01b9587b3174 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -12,175 +12,393 @@ // //===----------------------------------------------------------------------===// - #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" -#include "AArch64MachineFunctionInfo.h" -#include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" -AArch64RegisterInfo::AArch64RegisterInfo() - : AArch64GenRegisterInfo(AArch64::X30) { -} +AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii, + const AArch64Subtarget *sti) + : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {} const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return CSR_PCS_SaveList; + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_SaveList; + else + return CSR_AArch64_AAPCS_SaveList; } -const uint32_t* -AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID) const { - return CSR_PCS_RegMask; +const uint32_t * +AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (CC == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_RegMask; + else + return CSR_AArch64_AAPCS_RegMask; } -const uint32_t *AArch64RegisterInfo::getTLSDescCallPreservedMask() const { - return TLSDesc_RegMask; -} +const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { + if (STI->isTargetDarwin()) + return CSR_AArch64_TLS_Darwin_RegMask; -const TargetRegisterClass * -AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (RC == &AArch64::FlagClassRegClass) - return &AArch64::GPR64RegClass; - - return RC; + assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); + return CSR_AArch64_TLS_ELF_RegMask; } - +const uint32_t * +AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { + // This should return a register mask that is the same as that returned by + // getCallPreservedMask but that additionally preserves the register used for + // the first i64 argument (which must also be the register used to return a + // single i64 return value) + // + // In case that the calling convention does not use the same register for + // both, the function should return NULL (does not currently apply) + return CSR_AArch64_AAPCS_ThisReturn_RegMask; +} BitVector AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - Reserved.set(AArch64::XSP); - Reserved.set(AArch64::WSP); - + // FIXME: avoid re-calculating this every time. 
+ BitVector Reserved(getNumRegs()); + Reserved.set(AArch64::SP); Reserved.set(AArch64::XZR); + Reserved.set(AArch64::WSP); Reserved.set(AArch64::WZR); - if (TFI->hasFP(MF)) { - Reserved.set(AArch64::X29); + if (TFI->hasFP(MF) || STI->isTargetDarwin()) { + Reserved.set(AArch64::FP); Reserved.set(AArch64::W29); } + if (STI->isTargetDarwin()) { + Reserved.set(AArch64::X18); // Platform register + Reserved.set(AArch64::W18); + } + + if (hasBasePointer(MF)) { + Reserved.set(AArch64::X19); + Reserved.set(AArch64::W19); + } + return Reserved; } -static bool hasFrameOffset(int opcode) { - return opcode != AArch64::LD1x2_8B && opcode != AArch64::LD1x3_8B && - opcode != AArch64::LD1x4_8B && opcode != AArch64::ST1x2_8B && - opcode != AArch64::ST1x3_8B && opcode != AArch64::ST1x4_8B && - opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B && - opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B && - opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B; -} +bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); -void -AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI, - int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const { - assert(SPAdj == 0 && "Cannot deal with nonzero SPAdj yet"); - MachineInstr &MI = *MBBI; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64FrameLowering *TFI = - static_cast(MF.getTarget().getFrameLowering()); - - // In order to work out the base and offset for addressing, the FrameLowering - // code needs to know (sometimes) whether the instruction is storing/loading a - // callee-saved register, or whether it's a more generic - // operation. Fortunately the frame indices are used *only* for that purpose - // and are contiguous, so we can check here. - const std::vector &CSI = MFI->getCalleeSavedInfo(); - int MinCSFI = 0; - int MaxCSFI = -1; - - if (CSI.size()) { - MinCSFI = CSI[0].getFrameIdx(); - MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + switch (Reg) { + default: + break; + case AArch64::SP: + case AArch64::XZR: + case AArch64::WSP: + case AArch64::WZR: + return true; + case AArch64::X18: + case AArch64::W18: + return STI->isTargetDarwin(); + case AArch64::FP: + case AArch64::W29: + return TFI->hasFP(MF) || STI->isTargetDarwin(); + case AArch64::W19: + case AArch64::X19: + return hasBasePointer(MF); } - int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - bool IsCalleeSaveOp = FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI; + return false; +} - unsigned FrameReg; - int64_t Offset; - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj, - IsCalleeSaveOp); - // A vector load/store instruction doesn't have an offset operand. - bool HasOffsetOp = hasFrameOffset(MI.getOpcode()); - if (HasOffsetOp) - Offset += MI.getOperand(FIOperandNum + 1).getImm(); +const TargetRegisterClass * +AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + return &AArch64::GPR64RegClass; +} - // DBG_VALUE instructions have no real restrictions so they can be handled - // easily. 
- if (MI.isDebugValue()) { - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/ false); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); - return; - } +const TargetRegisterClass * +AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &AArch64::CCRRegClass) + return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV. + return RC; +} - const AArch64InstrInfo &TII = - *static_cast(MF.getTarget().getInstrInfo()); - int MinOffset, MaxOffset, OffsetScale; - if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s || !HasOffsetOp) { - MinOffset = 0; - MaxOffset = 0xfff; - OffsetScale = 1; - } else { - // Load/store of a stack object - TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset); - } +unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } - // There are two situations we don't use frame + offset directly in the - // instruction: - // (1) The offset can't really be scaled - // (2) Can't encode offset as it doesn't have an offset operand - if ((Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) || - (!HasOffsetOp && Offset != 0)) { - unsigned BaseReg = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); - emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII, - BaseReg, FrameReg, BaseReg, Offset); - FrameReg = BaseReg; - Offset = 0; - } +bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Negative offsets are expected if we address from FP, but for - // now this checks nothing has gone horribly wrong. - assert(Offset >= 0 && "Unexpected negative offset from SP"); + // In the presence of variable sized objects, if the fixed stack size is + // large enough that referencing from the FP won't result in things being + // in range relatively often, we can use a base pointer to allow access + // from the other direction like the SP normally works. + if (MFI->hasVarSizedObjects()) { + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, we'll materialize the constant and still get to the + // object; it's just suboptimal. Negative offsets use the unscaled + // load/store instructions, which have a 9-bit signed immediate. + if (MFI->getLocalFrameSize() < 256) + return false; + return true; + } - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true); - if (HasOffsetOp) - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale); + return false; } unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - if (TFI->hasFP(MF)) - return AArch64::X29; - else - return AArch64::XSP; + return TFI->hasFP(MF) ? 
AArch64::FP : AArch64::SP; +} + +bool AArch64RegisterInfo::requiresRegisterScavenging( + const MachineFunction &MF) const { + return true; +} + +bool AArch64RegisterInfo::requiresVirtualBaseRegisters( + const MachineFunction &MF) const { + return true; } bool AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // AArch64FrameLowering::resolveFrameIndexReference() can always fall back + // to the stack pointer, so only put the emergency spill slot next to the + // FP when there's no better way to access it (SP or base pointer). + return MFI->hasVarSizedObjects() && !hasBasePointer(MF); +} + +bool AArch64RegisterInfo::requiresFrameIndexScavenging( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Only consider eliminating leaf frames. + if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && + MFI->adjustsStack())) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); +} + +/// needsFrameBaseReg - Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP. Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, + int64_t Offset) const { + for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) + assert(i < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores, so + // return false for everything else. + if (!MI->mayLoad() && !MI->mayStore()) + return false; + + // Without a virtual base register, if the function has variable sized + // objects, all fixed-size local references will be via the frame pointer, + // Approximate the offset and see if it's legal for the instruction. + // Note that the incoming offset is based on the SP value at function entry, + // so it'll be negative. + MachineFunction &MF = *MI->getParent()->getParent(); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - const AArch64FrameLowering *AFI - = static_cast(TFI); - return AFI->useFPForAddressing(MF); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Estimate an offset from the frame pointer. + // Conservatively assume all GPR callee-saved registers get pushed. + // FP, LR, X19-X28, D8-D15. 64-bits each. + int64_t FPOffset = Offset - 16 * 20; + // Estimate an offset from the stack pointer. + // The incoming offset is relating to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset += MFI->getLocalFrameSize(); + // Assume that we'll have at least some spill slots allocated. + // FIXME: This is a total SWAG number. We should run some statistics + // and pick a real one. 
+ Offset += 128; // 128 bytes of spill slots + + // If there is a frame pointer, try using it. + // The FP is only available if there is no dynamic realignment. We + // don't know for sure yet whether we'll need that, so we guess based + // on whether there are any local variables that would trigger it. + if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) + return false; + + // If we can reference via the stack pointer or base pointer, try that. + // FIXME: This (and the code that resolves the references) can be improved + // to only disallow SP relative references in the live range of + // the VLA(s). In practice, it's unclear how much difference that + // would make, but it may be worth doing. + if (isFrameOffsetLegal(MI, Offset)) + return false; + + // The offset likely isn't legal; we want to allocate a virtual base register. + return true; } + +bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const { + assert(Offset <= INT_MAX && "Offset too big to fit in int."); + assert(MI && "Unable to get the legal offset for nil instruction."); + int SaveOffset = Offset; + return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; +} + +/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx +/// at the beginning of the basic block. +void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, + int FrameIdx, + int64_t Offset) const { + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const MachineFunction &MF = *MBB->getParent(); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + + BuildMI(*MBB, Ins, DL, MCID, BaseReg) + .addFrameIndex(FrameIdx) + .addImm(Offset) + .addImm(Shifter); +} + +void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII); + assert(Done && "Unable to resolve frame index!"); + (void)Done; +} + +void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64FrameLowering *TFI = static_cast( + MF.getTarget().getFrameLowering()); + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned FrameReg; + int Offset; + + // Special handling of dbg_value, stackmap and patchpoint instructions. 
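To make the estimate in needsFrameBaseReg concrete: the heuristic boils down to two candidate offsets, an FP-relative guess that assumes the full callee-save area, and an SP-relative guess padded by the local frame size plus the 128-byte fudge factor. Below is a minimal standalone sketch of that arithmetic; offsetLooksLegal() is a simplified stand-in for isAArch64FrameOffsetLegal(), which in the backend is evaluated per instruction, and the function names here are invented for illustration only.

#include <cstdint>
#include <iostream>

// Stand-in legality check: treat an offset as encodable if it fits the signed
// 9-bit unscaled form (-256..255) or an unsigned scaled 12-bit field for
// 8-byte accesses (0..32760). This is an assumption for illustration, not the
// backend's exact predicate.
static bool offsetLooksLegal(int64_t Off) {
  return (Off >= -256 && Off <= 255) ||
         (Off >= 0 && Off <= 32760 && (Off % 8) == 0);
}

// Mirrors the estimate in needsFrameBaseReg(): Offset is relative to the SP at
// function entry, so it is typically negative for locals.
static bool wantsVirtualBaseReg(int64_t Offset, int64_t LocalFrameSize,
                                bool HasFP) {
  // FP-relative guess: assume FP, LR, X19-X28 and D8-D15 all get saved
  // (the patch uses the same 16 * 20 constant).
  int64_t FPOffset = Offset - 16 * 20;
  // SP-relative guess: account for the local allocation plus ~128 bytes of
  // spill slots (the "SWAG" fudge factor above).
  int64_t SPOffset = Offset + LocalFrameSize + 128;
  if (HasFP && offsetLooksLegal(FPOffset))
    return false;              // FP + imm reaches the object
  if (offsetLooksLegal(SPOffset))
    return false;              // SP + imm reaches the object
  return true;                 // ask for a virtual base register
}

int main() {
  // A small local is reachable from SP; a huge local frame is not.
  std::cout << wantsVirtualBaseReg(-32, 64, true) << '\n';       // 0
  std::cout << wantsVirtualBaseReg(-40, 1 << 20, false) << '\n'; // 1
}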
+ if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || + MI.getOpcode() == TargetOpcode::PATCHPOINT) { + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true); + Offset += MI.getOperand(FIOperandNum + 1).getImm(); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); + if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) + return; + + assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && + "Emergency spill slot is out of reach"); + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above. Handle the rest, providing a register that is + // SP+LargeImm. + unsigned ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); + MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); +} + +namespace llvm { + +unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (RC->getID()) { + default: + return 0; + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR32commonRegClassID: + case AArch64::GPR64commonRegClassID: + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP + - STI->isTargetDarwin() // X18 reserved as platform register + - hasBasePointer(MF); // X19 + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR128RegClassID: + return 32; + + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return 32; + + case AArch64::FPR128_loRegClassID: + return 16; + } +} + +} // namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index 5b501f9cc160..76af1edce723 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -1,4 +1,4 @@ -//==- AArch64RegisterInfo.h - AArch64 Register Information Impl -*- C++ -*-===// +//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,12 @@ // //===----------------------------------------------------------------------===// // -// This file contains the AArch64 implementation of the MCRegisterInfo class. +// This file contains the AArch64 implementation of the MRegisterInfo class. 
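The GPR case of getRegPressureLimit() above is straightforward subtraction from the 32 integer registers. Here is a small sketch of that arithmetic; FrameInfo and gprPressureLimit are invented names standing in for the TargetFrameLowering and AArch64Subtarget queries the real code makes.

#include <iostream>

// Hypothetical inputs; the real query reads these from the frame lowering and
// the subtarget.
struct FrameInfo { bool HasFP, IsDarwin, HasBasePointer; };

// Mirrors the GPR branch of getRegPressureLimit(): start from 32 integer
// registers and subtract the ones the allocator can never hand out.
static unsigned gprPressureLimit(const FrameInfo &FI) {
  return 32 - 1                          // XZR/SP share encoding 31
            - (FI.HasFP || FI.IsDarwin)  // X29 when a frame pointer is kept
            - FI.IsDarwin                // X18 reserved as platform register
            - FI.HasBasePointer;         // X19 when a base pointer is in use
}

int main() {
  std::cout << gprPressureLimit({false, false, false}) << '\n'; // 31
  std::cout << gprPressureLimit({true,  true,  true }) << '\n'; // 28
}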
// //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64REGISTERINFO_H -#define LLVM_TARGET_AARCH64REGISTERINFO_H - -#include "llvm/Target/TargetRegisterInfo.h" +#ifndef LLVM_TARGET_AArch64REGISTERINFO_H +#define LLVM_TARGET_AArch64REGISTERINFO_H #define GET_REGINFO_HEADER #include "AArch64GenRegisterInfo.inc" @@ -23,12 +21,23 @@ namespace llvm { class AArch64InstrInfo; class AArch64Subtarget; +class MachineFunction; +class RegScavenger; +class TargetRegisterClass; struct AArch64RegisterInfo : public AArch64GenRegisterInfo { - AArch64RegisterInfo(); +private: + const AArch64InstrInfo *TII; + const AArch64Subtarget *STI; + +public: + AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti); + + bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + /// Code Generation virtual methods... const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override; + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; const uint32_t *getCallPreservedMask(CallingConv::ID) const override; unsigned getCSRFirstUseCost() const override { @@ -38,42 +47,55 @@ struct AArch64RegisterInfo : public AArch64GenRegisterInfo { return 5; } - const uint32_t *getTLSDescCallPreservedMask() const; - - BitVector getReservedRegs(const MachineFunction &MF) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. + const uint32_t *getTLSCallPreservedMask() const; - void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - unsigned FIOperandNum, - RegScavenger *Rs = nullptr) const override; + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the + /// case that 'returned' is on an i64 first argument if the calling convention + /// is one that can (partially) model this attribute with a preserved mask + /// (i.e. it is a calling convention that uses the same register for the first + /// i64 argument and an i64 return value) + /// + /// Should return NULL in the case that the calling convention does not have + /// this property + const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; - /// getCrossCopyRegClass - Returns a legal register class to copy a register - /// in the specified class to or from. Returns original class if it is - /// possible to copy between a two registers of the specified class. + BitVector getReservedRegs(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - /// getLargestLegalSuperClass - Returns the largest super class of RC that is - /// legal to use in the current sub-target and has the same spill size. 
- const TargetRegisterClass* - getLargestLegalSuperClass(const TargetRegisterClass *RC) const override { - if (RC == &AArch64::tcGPR64RegClass) - return &AArch64::GPR64RegClass; + bool requiresRegisterScavenging(const MachineFunction &MF) const override; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, + int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + bool cannotEliminateFrame(const MachineFunction &MF) const; - return RC; - } + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; + bool hasBasePointer(const MachineFunction &MF) const; + unsigned getBaseRegister() const; - bool requiresRegisterScavenging(const MachineFunction &MF) const override { - return true; - } - - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { - return true; - } + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const override; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; }; } // end namespace llvm -#endif // LLVM_TARGET_AARCH64REGISTERINFO_H +#endif // LLVM_TARGET_AArch64REGISTERINFO_H diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index 9de7abdf5ff0..a30e4ad0b5d8 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1,4 +1,4 @@ -//===- AArch64RegisterInfo.td - ARM Register defs ----------*- tablegen -*-===// +//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,284 +7,587 @@ // //===----------------------------------------------------------------------===// // -// This file contains declarations that describe the AArch64 register file // //===----------------------------------------------------------------------===// -let Namespace = "AArch64" in { -def sub_128 : SubRegIndex<128>; -def sub_64 : SubRegIndex<64>; -def sub_32 : SubRegIndex<32>; -def sub_16 : SubRegIndex<16>; -def sub_8 : SubRegIndex<8>; - -// Note: Code depends on these having consecutive numbers. -def qqsub : SubRegIndex<256, 256>; - -def qsub_0 : SubRegIndex<128>; -def qsub_1 : SubRegIndex<128, 128>; -def qsub_2 : ComposedSubRegIndex; -def qsub_3 : ComposedSubRegIndex; - -def dsub_0 : SubRegIndex<64>; -def dsub_1 : SubRegIndex<64, 64>; -def dsub_2 : ComposedSubRegIndex; -def dsub_3 : ComposedSubRegIndex; -} -// Registers are identified with 5-bit ID numbers. 
-class AArch64Reg enc, string n> : Register { +class AArch64Reg enc, string n, list subregs = [], + list altNames = []> + : Register { let HWEncoding = enc; let Namespace = "AArch64"; + let SubRegs = subregs; } -class AArch64RegWithSubs enc, string n, list subregs = [], - list inds = []> - : AArch64Reg { - let SubRegs = subregs; - let SubRegIndices = inds; +let Namespace = "AArch64" in { + def sub_32 : SubRegIndex<32>; + + def bsub : SubRegIndex<8>; + def hsub : SubRegIndex<16>; + def ssub : SubRegIndex<32>; + def dsub : SubRegIndex<32>; + def qhisub : SubRegIndex<64>; + def qsub : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def dsub0 : SubRegIndex<64>; + def dsub1 : SubRegIndex<64>; + def dsub2 : SubRegIndex<64>; + def dsub3 : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def qsub0 : SubRegIndex<128>; + def qsub1 : SubRegIndex<128>; + def qsub2 : SubRegIndex<128>; + def qsub3 : SubRegIndex<128>; +} + +let Namespace = "AArch64" in { + def vreg : RegAltNameIndex; + def vlist1 : RegAltNameIndex; } //===----------------------------------------------------------------------===// -// Integer registers: w0-w30, wzr, wsp, x0-x30, xzr, sp +// Registers //===----------------------------------------------------------------------===// - -foreach Index = 0-30 in { - def W#Index : AArch64Reg< Index, "w"#Index>, DwarfRegNum<[Index]>; +def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>; +def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>; +def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>; +def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>; +def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>; +def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>; +def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>; +def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>; +def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>; +def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>; +def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>; +def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>; +def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>; +def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>; +def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>; +def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>; +def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>; +def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>; +def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>; +def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>; +def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>; +def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>; +def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>; +def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>; +def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>; +def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>; +def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>; +def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>; +def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>; +def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>; +def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>; +def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; +def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias; + +let SubRegIndices = [sub_32] in { +def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias; +def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias; +def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias; +def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias; +def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias; +def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias; +def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias; +def X7 : AArch64Reg<7, 
"x7", [W7]>, DwarfRegAlias; +def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias; +def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias; +def X10 : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias; +def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias; +def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias; +def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias; +def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias; +def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias; +def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias; +def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias; +def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias; +def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias; +def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias; +def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias; +def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias; +def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias; +def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias; +def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias; +def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias; +def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias; +def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias; +def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias; +def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias; +def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias; +def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias; } -def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; -def WZR : AArch64Reg<31, "wzr">; +// Condition code register. +def NZCV : AArch64Reg<0, "nzcv">; -// Could be combined with previous loop, but this way leaves w and x registers -// consecutive as LLVM register numbers, which makes for easier debugging. -foreach Index = 0-30 in { - def X#Index : AArch64RegWithSubs("W"#Index)], [sub_32]>, - DwarfRegNum<[Index]>; +// GPR register classes with the intersections of GPR32/GPR32sp and +// GPR64/GPR64sp for use by the coalescer. +def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { + let AltOrders = [(rotl GPR32common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64common : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 28), FP, LR)> { + let AltOrders = [(rotl GPR64common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +// GPR register classes which exclude SP/WSP. +def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> { + let AltOrders = [(rotl GPR32, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> { + let AltOrders = [(rotl GPR64, 8)]; + let AltOrderSelect = [{ return 1; }]; } -def XSP : AArch64RegWithSubs<31, "sp", [WSP], [sub_32]>, DwarfRegNum<[31]>; -def XZR : AArch64RegWithSubs<31, "xzr", [WZR], [sub_32]>; +// GPR register classes which include SP/WSP. +def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> { + let AltOrders = [(rotl GPR32sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> { + let AltOrders = [(rotl GPR64sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} -// Most instructions treat register 31 as zero for reads and a black-hole for -// writes. 
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>; +def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>; -// Note that the order of registers is important for the Disassembler here: -// tablegen uses it to form MCRegisterClass::getRegister, which we assume can -// take an encoding value. -def GPR32 : RegisterClass<"AArch64", [i32], 32, - (add (sequence "W%u", 0, 30), WZR)> { +def GPR64spPlus0Operand : AsmOperandClass { + let Name = "GPR64sp0"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseGPR64sp0Operand"; } -def GPR64 : RegisterClass<"AArch64", [i64], 64, - (add (sequence "X%u", 0, 30), XZR)> { +def GPR64sp0 : RegisterOperand { + let ParserMatchClass = GPR64spPlus0Operand; } -def GPR32nowzr : RegisterClass<"AArch64", [i32], 32, - (sequence "W%u", 0, 30)> { +// GPR register classes which include WZR/XZR AND SP/WSP. This is not a +// constraint used by any instructions, it is used as a common super-class. +def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; +def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>; + +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// This is for indirect tail calls to store the address of the destination. +def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21, + X22, X23, X24, X25, X26, + X27, X28, FP, LR)>; + +// GPR register classes for post increment amount of vector load/store that +// has alternate printing when Rm=31 and prints a constant immediate value +// equal to the total number of bytes transferred. + +// FIXME: TableGen *should* be able to do these itself now. There appears to be +// a bug in counting how many operands a Post-indexed MCInst should have which +// means the aliases don't trigger. +def GPR64pi1 : RegisterOperand">; +def GPR64pi2 : RegisterOperand">; +def GPR64pi3 : RegisterOperand">; +def GPR64pi4 : RegisterOperand">; +def GPR64pi6 : RegisterOperand">; +def GPR64pi8 : RegisterOperand">; +def GPR64pi12 : RegisterOperand">; +def GPR64pi16 : RegisterOperand">; +def GPR64pi24 : RegisterOperand">; +def GPR64pi32 : RegisterOperand">; +def GPR64pi48 : RegisterOperand">; +def GPR64pi64 : RegisterOperand">; + +// Condition code regclass. +def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { + let CopyCost = -1; // Don't allow copying of status registers. + + // CCR is not allocatable. + let isAllocatable = 0; } -def GPR64noxzr : RegisterClass<"AArch64", [i64], 64, - (sequence "X%u", 0, 30)> { -} +//===----------------------------------------------------------------------===// +// Floating Point Scalar Registers +//===----------------------------------------------------------------------===// -// For tail calls, we can't use callee-saved registers or the structure-return -// register, as they are supposed to be live across function calls and may be -// clobbered by the epilogue. 
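The (sub GPR64common, ...) form used for tcGPR64 above is plain set difference: start from GPR64common and drop every callee-saved register plus FP and LR, leaving X0-X18 as the registers that can safely hold an indirect tail-call target. A throwaway sketch of that set arithmetic, with register names as strings purely for illustration:

#include <iostream>
#include <set>
#include <string>

int main() {
  // GPR64common as defined above: X0-X28 plus FP (x29) and LR (x30).
  std::set<std::string> GPR64common;
  for (int i = 0; i <= 28; ++i)
    GPR64common.insert("X" + std::to_string(i));
  GPR64common.insert("FP");
  GPR64common.insert("LR");

  // Registers excluded for tail calls: the callee-saved set is restored before
  // the branch, so a call address placed there would be clobbered.
  const std::set<std::string> Excluded = {"X19", "X20", "X21", "X22", "X23",
                                          "X24", "X25", "X26", "X27", "X28",
                                          "FP",  "LR"};

  // tcGPR64 = (sub GPR64common, X19..X28, FP, LR)
  std::set<std::string> tcGPR64;
  for (const auto &R : GPR64common)
    if (!Excluded.count(R))
      tcGPR64.insert(R);

  std::cout << tcGPR64.size() << '\n'; // 19, i.e. X0-X18
}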
-def tcGPR64 : RegisterClass<"AArch64", [i64], 64, - (add (sequence "X%u", 0, 7), - (sequence "X%u", 9, 18))> { +def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>; +def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>; +def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>; +def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>; +def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>; +def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>; +def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>; +def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>; +def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>; +def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>; +def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>; +def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>; +def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>; +def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>; +def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>; +def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>; +def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>; +def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>; +def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>; +def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>; +def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>; +def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>; +def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>; +def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>; +def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>; +def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>; +def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>; +def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>; +def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>; +def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>; +def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>; +def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>; + +let SubRegIndices = [bsub] in { +def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias; +def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias; +def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias; +def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias; +def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias; +def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias; +def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias; +def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias; +def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias; +def H9 : AArch64Reg<9, "h9", [B9]>, DwarfRegAlias; +def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias; +def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias; +def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias; +def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias; +def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias; +def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias; +def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias; +def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias; +def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias; +def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias; +def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias; +def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias; +def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias; +def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias; +def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias; +def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias; +def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias; +def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias; +def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias; +def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias; +def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias; +def H31 : AArch64Reg<31, "h31", [B31]>, 
DwarfRegAlias; } +let SubRegIndices = [hsub] in { +def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias; +def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias; +def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias; +def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias; +def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias; +def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias; +def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias; +def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias; +def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias; +def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias; +def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias; +def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias; +def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias; +def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias; +def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias; +def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias; +def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias; +def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias; +def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias; +def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias; +def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias; +def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias; +def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias; +def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias; +def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias; +def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias; +def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias; +def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias; +def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias; +def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias; +def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias; +def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias; +} -// Certain addressing-useful instructions accept sp directly. Again the order of -// registers is important to the Disassembler. 
-def GPR32wsp : RegisterClass<"AArch64", [i32], 32, - (add (sequence "W%u", 0, 30), WSP)> { +let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { +def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; +def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; +def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; +def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; +def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; +def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; +def D6 : AArch64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias; +def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; +def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; +def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias; +def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; +def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; +def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; +def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; +def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; +def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; +def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; +def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; +def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; +def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; +def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; +def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; +def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; +def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; +def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; +def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; +def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; +def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; +def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; +def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; +def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; +def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; } -def GPR64xsp : RegisterClass<"AArch64", [i64], 64, - (add (sequence "X%u", 0, 30), XSP)> { +let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { +def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; +def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; +def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; +def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; +def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; +def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; +def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; +def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; +def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; +def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; +def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; +def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; +def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; +def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; +def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; +def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; +def Q16 : 
AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; +def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; +def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; +def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; +def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; +def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; +def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; +def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; +def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; +def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias; +def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; +def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; +def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; +def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; +def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias; +def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; } -// Some aliases *only* apply to SP (e.g. MOV uses different encoding for SP and -// non-SP variants). We can't use a bare register in those patterns because -// TableGen doesn't like it, so we need a class containing just stack registers -def Rxsp : RegisterClass<"AArch64", [i64], 64, - (add XSP)> { +def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { + let Size = 8; } +def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} +def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; +def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, + v1i64], + 64, (sequence "D%u", 0, 31)>; +// We don't (yet) have an f128 legal type, so don't use that here. We +// normalize 128-bit vectors to v2f64 for arg passing and such, so use +// that here. +def FPR128 : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + 128, (sequence "Q%u", 0, 31)>; -def Rwsp : RegisterClass<"AArch64", [i32], 32, - (add WSP)> { +// The lower 16 vector registers. Some instructions can only take registers +// in this range. +def FPR128_lo : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (trunc FPR128, 16)>; + +// Pairs, triples, and quads of 64-bit vector registers. +def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; +def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; +def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; +def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> { + let Size = 128; +} +def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> { + let Size = 196; +} +def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> { + let Size = 256; } -//===----------------------------------------------------------------------===// -// Scalar registers in the vector unit: -// b0-b31, h0-h31, s0-s31, d0-d31, q0-q31 -//===----------------------------------------------------------------------===// +// Pairs, triples, and quads of 128-bit vector registers. 
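The RegisterTuples/rotl idiom used for the D-register sequences above is worth spelling out: pairing the FPR64 sequence with itself rotated by one produces the consecutive pairs D0_D1, D1_D2, ..., wrapping around to D31_D0, and the triple and quad variants extend the same trick with further rotations. A quick sketch of the enumeration it expands to; the names are generated here only for illustration.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // The 32 double registers, in encoding order.
  std::vector<std::string> D;
  for (int i = 0; i < 32; ++i)
    D.push_back("D" + std::to_string(i));

  // RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]> pairs
  // each register with its rotated-by-one neighbour: D0_D1 ... D31_D0.
  for (int i = 0; i < 32; ++i)
    std::cout << D[i] << "_" << D[(i + 1) % 32] << (i == 31 ? "\n" : " ");
}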
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; +def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; +def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; +def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> { + let Size = 256; +} +def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> { + let Size = 384; +} +def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { + let Size = 512; +} -foreach Index = 0-31 in { - def B # Index : AArch64Reg< Index, "b" # Index>, - DwarfRegNum<[!add(Index, 64)]>; - def H # Index : AArch64RegWithSubs("B" # Index)], [sub_8]>, - DwarfRegNum<[!add(Index, 64)]>; +// Vector operand versions of the FP registers. Alternate name printing and +// assmebler matching. +def VectorReg64AsmOperand : AsmOperandClass { + let Name = "VectorReg64"; + let PredicateMethod = "isVectorReg"; +} +def VectorReg128AsmOperand : AsmOperandClass { + let Name = "VectorReg128"; + let PredicateMethod = "isVectorReg"; +} - def S # Index : AArch64RegWithSubs("H" # Index)], [sub_16]>, - DwarfRegNum<[!add(Index, 64)]>; +def V64 : RegisterOperand { + let ParserMatchClass = VectorReg64AsmOperand; +} - def D # Index : AArch64RegWithSubs("S" # Index)], [sub_32]>, - DwarfRegNum<[!add(Index, 64)]>; +def V128 : RegisterOperand { + let ParserMatchClass = VectorReg128AsmOperand; +} - def Q # Index : AArch64RegWithSubs("D" # Index)], [sub_64]>, - DwarfRegNum<[!add(Index, 64)]>; +def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } +def V128_lo : RegisterOperand { + let ParserMatchClass = VectorRegLoAsmOperand; } +class TypedVecListAsmOperand + : AsmOperandClass { + let Name = "TypedVectorList" # count # "_" # lanes # kind; -def FPR8 : RegisterClass<"AArch64", [v1i8], 8, - (sequence "B%u", 0, 31)> { + let PredicateMethod + = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; + let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; } -def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16, - (sequence "H%u", 0, 31)> { -} +class TypedVecListRegOperand + : RegisterOperand">; -def FPR32 : RegisterClass<"AArch64", [f32, v1i32], 32, - (sequence "S%u", 0, 31)> { -} +multiclass VectorList { + // With implicit types (probably on instruction instead). E.g. { v0, v1 } + def _64AsmOperand : AsmOperandClass { + let Name = NAME # "64"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList64Operands<" # count # ">"; + } -def FPR64 : RegisterClass<"AArch64", - [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], - 64, (sequence "D%u", 0, 31)>; + def "64" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_64AsmOperand"); + } -def FPR128 : RegisterClass<"AArch64", - [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], - 128, (sequence "Q%u", 0, 31)>; + def _128AsmOperand : AsmOperandClass { + let Name = NAME # "128"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList128Operands<" # count # ">"; + } + + def "128" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_128AsmOperand"); + } -def FPR64Lo : RegisterClass<"AArch64", - [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], - 64, (sequence "D%u", 0, 15)>; + // 64-bit register lists with explicit type. 
-def FPR128Lo : RegisterClass<"AArch64", - [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], - 128, (sequence "Q%u", 0, 15)>; + // { v0.8b, v1.8b } + def _8bAsmOperand : TypedVecListAsmOperand; + def "8b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); + } -//===----------------------------------------------------------------------===// -// Vector registers: -//===----------------------------------------------------------------------===// + // { v0.4h, v1.4h } + def _4hAsmOperand : TypedVecListAsmOperand; + def "4h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); + } -def VPR64AsmOperand : AsmOperandClass { - let Name = "VPR"; - let PredicateMethod = "isReg"; - let RenderMethod = "addRegOperands"; -} + // { v0.2s, v1.2s } + def _2sAsmOperand : TypedVecListAsmOperand; + def "2s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); + } + + // { v0.1d, v1.1d } + def _1dAsmOperand : TypedVecListAsmOperand; + def "1d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); + } -def VPR64 : RegisterOperand; + // 128-bit register lists with explicit type -def VPR128 : RegisterOperand; + // { v0.16b, v1.16b } + def _16bAsmOperand : TypedVecListAsmOperand; + def "16b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); + } -def VPR64Lo : RegisterOperand; + // { v0.8h, v1.8h } + def _8hAsmOperand : TypedVecListAsmOperand; + def "8h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); + } -def VPR128Lo : RegisterOperand; + // { v0.4s, v1.4s } + def _4sAsmOperand : TypedVecListAsmOperand; + def "4s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); + } -// Flags register -def NZCV : Register<"nzcv"> { - let Namespace = "AArch64"; -} + // { v0.2d, v1.2d } + def _2dAsmOperand : TypedVecListAsmOperand; + def "2d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); + } -def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { - let CopyCost = -1; - let isAllocatable = 0; -} + // { v0.b, v1.b } + def _bAsmOperand : TypedVecListAsmOperand; + def "b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_bAsmOperand"); + } -//===----------------------------------------------------------------------===// -// Consecutive vector registers -//===----------------------------------------------------------------------===// -// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0 -def Tuples2D : RegisterTuples<[dsub_0, dsub_1], - [(rotl FPR64, 0), (rotl FPR64, 1)]>; - -// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1 -def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2)]>; - -// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2 -def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2), (rotl FPR64, 3)]>; - -// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31 -def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], - [(rotl FPR128, 0), (rotl FPR128, 1)]>; - -// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1 -def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2)]>; - -// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2 -def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3], - [(rotl FPR128, 0), (rotl 
FPR128, 1), - (rotl FPR128, 2), (rotl FPR128, 3)]>; - -// The followings are super register classes to model 2/3/4 consecutive -// 64-bit/128-bit registers. - -def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>; - -def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> { - let Size = 192; // 3 x 64 bits, we have no predefined type of that size. -} - -def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>; - -def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>; - -def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> { - let Size = 384; // 3 x 128 bits, we have no predefined type of that size. -} - -def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>; - - -// The followings are vector list operands -multiclass VectorList_operands { - def _asmoperand : AsmOperandClass { - let Name = PREFIX # LAYOUT # Count; - let RenderMethod = "addVectorListOperands"; - let PredicateMethod = - "isVectorList"; - let ParserMethod = "ParseVectorList"; + // { v0.h, v1.h } + def _hAsmOperand : TypedVecListAsmOperand; + def "h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_hAsmOperand"); } - def _operand : RegisterOperand"> { - let ParserMatchClass = - !cast(PREFIX # LAYOUT # "_asmoperand"); + // { v0.s, v1.s } + def _sAsmOperand : TypedVecListAsmOperand; + def "s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_sAsmOperand"); } -} -multiclass VectorList_BHSD { - defm 8B : VectorList_operands; - defm 4H : VectorList_operands; - defm 2S : VectorList_operands; - defm 1D : VectorList_operands; - defm 16B : VectorList_operands; - defm 8H : VectorList_operands; - defm 4S : VectorList_operands; - defm 2D : VectorList_operands; + // { v0.d, v1.d } + def _dAsmOperand : TypedVecListAsmOperand; + def "d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_dAsmOperand"); + } + + } -// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand -defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>; -defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>; -defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>; -defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>; +defm VecListOne : VectorList<1, FPR64, FPR128>; +defm VecListTwo : VectorList<2, DD, QQ>; +defm VecListThree : VectorList<3, DDD, QQQ>; +defm VecListFour : VectorList<4, DDDD, QQQQ>; + + +// Register operand versions of the scalar FP registers. +def FPR16Op : RegisterOperand; +def FPR32Op : RegisterOperand; +def FPR64Op : RegisterOperand; +def FPR128Op : RegisterOperand; diff --git a/lib/Target/ARM64/ARM64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td similarity index 98% rename from lib/Target/ARM64/ARM64SchedA53.td rename to lib/Target/AArch64/AArch64SchedA53.td index cf1a82027642..d709bee7b9eb 100644 --- a/lib/Target/ARM64/ARM64SchedA53.td +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -1,4 +1,4 @@ -//=- ARM64SchedA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=// +//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -148,9 +148,9 @@ def : ReadAdvance; // ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable // operands are needed one cycle later if and only if they are to be -// shifted. Otherwise, they too are needed two cycle later. This same +// shifted. Otherwise, they too are needed two cycles later. 
This same // ReadAdvance applies to Extended registers as well, even though there is -// a seperate SchedPredicate for them. +// a separate SchedPredicate for them. def : ReadAdvance { let BufferSize = 8; } // Type B micro-ops +def A57UnitI : ProcResource<2> { let BufferSize = 8; } // Type I micro-ops +def A57UnitM : ProcResource<1> { let BufferSize = 8; } // Type M micro-ops +def A57UnitL : ProcResource<1> { let BufferSize = 8; } // Type L micro-ops +def A57UnitS : ProcResource<1> { let BufferSize = 8; } // Type S micro-ops +def A57UnitX : ProcResource<1> { let BufferSize = 8; } // Type X micro-ops +def A57UnitW : ProcResource<1> { let BufferSize = 8; } // Type W micro-ops +let SchedModel = CortexA57Model in { + def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops +} + + +let SchedModel = CortexA57Model in { + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Cortex-A57. + +include "AArch64SchedA57WriteRes.td" + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Cortex-A57. The Cortex-A57 types are directly associated with resources, so +// defining the aliases precludes the need for mapping them using WriteRes. The +// aliases are sufficient for creating a coarse, working model. As the model +// evolves, InstRWs will be used to override these SchedAliases. + +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { let Latency = 4; } + +// Forwarding logic is not [yet] explicitly modeled beyond what is captured +// in the latencies of the A57 Generic SchedWriteRes's. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the modeled is refined, this will override most +// of the above ShchedAlias mappings. 
+ +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + + +// Branch Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>; +def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>; + + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +// Multiply high +def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>; + + +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1I], (instrs EXTRWrri)>; +def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>; +def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>; + + +// Cryptography Extensions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>; + + +// Vector Load +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1i(64)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Rv(1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_7cyc_3L], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_7cyc_3L, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2i(8|16)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD2i(32)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2i(64)$")>; +def : 
InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD2Rv(1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD2Twov(2d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3i(8|16)$")>; +def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3i(32)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD3i(64)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD3Rv(1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD3Rv(2d)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_10cyc_3L_4V], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(8|16)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4i(32)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(64)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD4Rv(1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_4V], (instregex "LD4Rv(2d)$")>; +def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_2V], 
(instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_11cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +// Vector Store +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1S], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST1i(64)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[A57Write_1cyc_1S], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_3cyc_3S], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_6cyc_6S], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_8cyc_8S], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST2i(64)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST2Twov(2d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST3i(8|16)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[A57Write_3cyc_3S], (instregex "ST3i(32)$")>; +def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST3i(64)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[A57Write_3cyc_3S_2V], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_6S_4V], (instregex "ST3Threev(16b|8h|4s)$")>; +def : 
InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST4i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST4i(32)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST4i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S_4V], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+} // SchedModel = CortexA57Model
diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
new file mode 100644
index 000000000000..a8f421bf38d2
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -0,0 +1,512 @@
+//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming conventions is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: A57Write
+// Latency: #cyc
+// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
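To make the naming scheme concrete: a type such as A57Write_9cyc_2L_2V (defined further down in this file) is a 9-cycle write that issues two micro-ops to the L pipes and two to the V pipes. A minimal sketch of how such a type is declared and then attached to instructions; the instregex pattern and the _Example_ name are placeholders, the real mappings live in AArch64SchedA57.td above.

def A57Write_Example_9cyc_2L_2V : SchedWriteRes<[A57UnitL, A57UnitL,
                                                 A57UnitV, A57UnitV]> {
  let Latency     = 9;   // total latency in cycles
  let NumMicroOps = 4;   // two micro-ops on L pipes plus two on V pipes
}
def : InstRW<[A57Write_Example_9cyc_2L_2V], (instregex "SOME_VECTOR_LD_PATTERN$")>;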
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define Generic 1 micro-op types + +def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } +def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } +def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } +def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; } +def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; } +def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; } +def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; } +def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; } +def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; } +def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; } +def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; } +def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; } +def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; } +def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } +def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } +def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } +def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } +def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } + + +//===----------------------------------------------------------------------===// +// Define Generic 2 micro-op types + +def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 64; + let NumMicroOps = 2; +} +def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV, + A57UnitX]> { + let Latency = 7; + let NumMicroOps = 2; +} +def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 2; +} +def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 1; + let NumMicroOps = 2; +} +def 
A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 36; + let NumMicroOps = 2; +} +def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} +def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 4; + let NumMicroOps = 2; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 3; +} +def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} +def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_1M_2S : SchedWriteRes<[A57UnitM, + A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_3S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_2S_1V : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_5cyc_1I_2L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} +def A57Write_6cyc_1I_2L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} +def A57Write_6cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 3; +} +def A57Write_7cyc_3L : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> { + let Latency = 7; + let NumMicroOps = 3; +} +def A57Write_8cyc_1I_1L_1V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_8cyc_1L_2V : SchedWriteRes<[A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_8cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_9cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 3; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 4 micro-op types + +def A57Write_2cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 4; +} +def A57Write_3cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 4; +} +def A57Write_3cyc_1I_3S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 4; +} +def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitV]> { + let Latency = 3; + let 
NumMicroOps = 4; +} +def A57Write_4cyc_4S : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 4; +} +def A57Write_7cyc_1I_3L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, A57UnitL]> { + let Latency = 7; + let NumMicroOps = 4; +} +def A57Write_5cyc_2I_2L : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 5; + let NumMicroOps = 4; +} +def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} +def A57Write_8cyc_4L : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitL, A57UnitL]> { + let Latency = 8; + let NumMicroOps = 4; +} +def A57Write_9cyc_2L_2V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} +def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 5 micro-op types + +def A57Write_3cyc_3S_2V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 5; +} +def A57Write_8cyc_1I_4L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitL, A57UnitL]> { + let Latency = 8; + let NumMicroOps = 5; +} +def A57Write_4cyc_1I_4S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 5; +} +def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 6 micro-op types + +def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 6; +} +def A57Write_4cyc_2I_4S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 6; +} +def A57Write_4cyc_4S_2V : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} +def A57Write_6cyc_6S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 6; + let NumMicroOps = 6; +} +def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} +def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} +def A57Write_9cyc_2L_4V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 7 micro-op types + +def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 7; +} +def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { 
+ let Latency = 4; + let NumMicroOps = 7; +} +def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 6; + let NumMicroOps = 7; +} +def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 7; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 8 micro-op types + +def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 8; +} +def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 11; + let NumMicroOps = 8; +} +def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 8; + let NumMicroOps = 8; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 9 micro-op types + +def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 8; + let NumMicroOps = 9; +} +def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 11; + let NumMicroOps = 9; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 10 micro-op types + +def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 10; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 11 micro-op types + +def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 11; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 12 micro-op types + +def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 12; +} + +//===----------------------------------------------------------------------===// +// Define Generic 13 micro-op types + +def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 13; +} + diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td similarity index 98% rename from lib/Target/ARM64/ARM64SchedCyclone.td rename to lib/Target/AArch64/AArch64SchedCyclone.td index c04a7bb8bafa..a2a180237789 100644 --- a/lib/Target/ARM64/ARM64SchedCyclone.td +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -1,4 +1,4 @@ -//=- ARMSchedCyclone.td - ARM64 Cyclone Scheduling Defs ------*- tablegen -*-=// +//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // 
//===----------------------------------------------------------------------===// // -// This file defines the machine model for ARM64 Cyclone to support +// This file defines the machine model for AArch64 Cyclone to support // instruction scheduling and other instruction cost heuristics. // //===----------------------------------------------------------------------===// @@ -239,13 +239,13 @@ def : WriteRes { def CyWriteLDIdx : SchedWriteVariant<[ SchedVar, // Load from scaled register. SchedVar]>; // Load from register offset. -def : SchedAlias; // Map ARM64->Cyclone type. +def : SchedAlias; // Map AArch64->Cyclone type. // EXAMPLE: STR Xn, Xm [, lsl 3] def CyWriteSTIdx : SchedWriteVariant<[ SchedVar, // Store to scaled register. SchedVar]>; // Store to register offset. -def : SchedAlias; // Map ARM64->Cyclone type. +def : SchedAlias; // Map AArch64->Cyclone type. // Read the (unshifted) base register Xn in the second micro-op one cycle later. // EXAMPLE: LDR Xn, Xm [, lsl 3] @@ -253,7 +253,7 @@ def ReadBaseRS : SchedReadAdvance<1>; def CyReadAdrBase : SchedReadVariant<[ SchedVar, // Read base reg after shifting offset. SchedVar]>; // Read base reg with no shift. -def : SchedAlias; // Map ARM64->Cyclone type. +def : SchedAlias; // Map AArch64->Cyclone type. //--- // 7.8.9,7.8.11. Load/Store, paired diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index 6ec47dbaa589..eaa9110ab1bc 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -1,4 +1,4 @@ -//===- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// +//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,74 +7,98 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Generic processor itineraries for legacy compatibility. - -def GenericItineraries : ProcessorItineraries<[], [], []>; - - -//===----------------------------------------------------------------------===// -// Base SchedReadWrite types - -// Basic ALU -def WriteALU : SchedWrite; // Generic: may contain shift and/or ALU operation -def WriteALUs : SchedWrite; // Shift only with no ALU operation -def ReadALU : SchedRead; // Operand not needed for shifting -def ReadALUs : SchedRead; // Operand needed for shifting - -// Multiply with optional accumulate -def WriteMAC : SchedWrite; -def ReadMAC : SchedRead; - -// Compares -def WriteCMP : SchedWrite; -def ReadCMP : SchedRead; - -// Division -def WriteDiv : SchedWrite; -def ReadDiv : SchedRead; - -// Loads -def WriteLd : SchedWrite; -def WritePreLd : SchedWrite; -def WriteVecLd : SchedWrite; -def ReadLd : SchedRead; -def ReadPreLd : SchedRead; -def ReadVecLd : SchedRead; - -// Stores -def WriteSt : SchedWrite; -def WriteVecSt : SchedWrite; -def ReadSt : SchedRead; -def ReadVecSt : SchedRead; - -// Branches -def WriteBr : SchedWrite; -def WriteBrL : SchedWrite; -def ReadBr : SchedRead; - -// Floating Point ALU -def WriteFPALU : SchedWrite; -def ReadFPALU : SchedRead; - -// Floating Point MAC, Mul, Div, Sqrt -// Most processors will simply send all of these down a dedicated pipe, but -// they're explicitly separated here for flexibility of modeling later. May -// consider consolidating them into a single WriteFPXXXX type in the future. 
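The Cyclone hunks above resolve the generic load/store-index write types through per-CPU SchedWriteVariants bound with a SchedAlias. A rough sketch of that pattern, with illustrative (not Cyclone's actual) predicate and write choices:

def ExampleWriteLDIdx : SchedWriteVariant<[
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,   // scaled register offset: extra shift uop
  SchedVar<NoSchedPred,   [WriteLD]>]>;          // plain register offset
def : SchedAlias<WriteLDIdx, ExampleWriteLDIdx>; // map the generic type onto this CPU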
-def WriteFPMAC : SchedWrite; -def WriteFPMul : SchedWrite; -def WriteFPDiv : SchedWrite; -def WriteFPSqrt : SchedWrite; -def ReadFPMAC : SchedRead; -def ReadFPMul : SchedRead; -def ReadFPDiv : SchedRead; -def ReadFPSqrt : SchedRead; - -// Noop -def WriteNoop : SchedWrite; - - -//===----------------------------------------------------------------------===// -// Subtarget specific Machine Models. - -include "AArch64ScheduleA53.td" +// Define TII for use in SchedVariant Predicates. +// const MachineInstr *MI and const TargetSchedModel *SchedModel +// are defined by default. +def : PredicateProlog<[{ + const AArch64InstrInfo *TII = + static_cast(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +// AArch64 Scheduler Definitions + +def WriteImm : SchedWrite; // MOVN, MOVZ +// TODO: Provide variants for MOV32/64imm Pseudos that dynamically +// select the correct sequence of WriteImms. + +def WriteI : SchedWrite; // ALU +def WriteISReg : SchedWrite; // ALU of Shifted-Reg +def WriteIEReg : SchedWrite; // ALU of Extended-Reg +def ReadI : SchedRead; // ALU +def ReadISReg : SchedRead; // ALU of Shifted-Reg +def ReadIEReg : SchedRead; // ALU of Extended-Reg +def WriteExtr : SchedWrite; // EXTR shifts a reg pair +def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair +def WriteIS : SchedWrite; // Shift/Scale +def WriteID32 : SchedWrite; // 32-bit Divide +def WriteID64 : SchedWrite; // 64-bit Divide +def ReadID : SchedRead; // 32/64-bit Divide +def WriteIM32 : SchedWrite; // 32-bit Multiply +def WriteIM64 : SchedWrite; // 64-bit Multiply +def ReadIM : SchedRead; // 32/64-bit Multiply +def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate +def WriteBr : SchedWrite; // Branch +def WriteBrReg : SchedWrite; // Indirect Branch + +def WriteLD : SchedWrite; // Load from base addr plus immediate offset +def WriteST : SchedWrite; // Store to base addr plus immediate offset +def WriteSTP : SchedWrite; // Store a register pair. +def WriteAdr : SchedWrite; // Address pre/post increment. + +def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). +def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). +def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. + +// Predicate for determining when a shiftable register is shifted. +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; + +// Predicate for determining when a extendedable register is extended. +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; + +// ScaledIdxPred is true if a WriteLDIdx operand will be +// scaled. Subtargets can use this to dynamically select resources and +// latency for WriteLDIdx and ReadAdrBase. +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; + +// Serialized two-level address load. +// EXAMPLE: LOADGot +def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; + +// Serialized two-level address lookup. +// EXAMPLE: MOVaddr... +def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; + +// The second register of a load-pair. +// LDP,LDPSW,LDNP,LDXP,LDAXP +def WriteLDHi : SchedWrite; + +// Store-exclusive is a store followed by a dependent load. +def WriteSTX : WriteSequence<[WriteST, WriteLD]>; + +def WriteSys : SchedWrite; // Long, variable latency system ops. +def WriteBarrier : SchedWrite; // Memory barrier. +def WriteHint : SchedWrite; // Hint instruction. + +def WriteF : SchedWrite; // General floating-point ops. +def WriteFCmp : SchedWrite; // Floating-point compare. +def WriteFCvt : SchedWrite; // Float conversion. 
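The WriteSequence types above (WriteLDAdr, WriteAdrAdr, WriteSTX) and the vector shuffle sequences a few lines further down are expanded into their component writes, so a subtarget model only has to supply WriteRes entries for the leaf types. A hedged sketch of what that looks like inside a subtarget's SchedModel block; the ExampleUnit names and latencies are placeholders:

// Inside a subtarget's "let SchedModel = ... in { ... }" block:
def : WriteRes<WriteAdr, [ExampleUnitI]> { let Latency = 1; }
def : WriteRes<WriteLD,  [ExampleUnitL]> { let Latency = 4; }
// WriteLDAdr = WriteSequence<[WriteAdr, WriteLD]> then needs no WriteRes of its
// own: the sequence contributes both micro-ops and chains their latencies.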
+def WriteFCopy : SchedWrite; // Float-int register copy. +def WriteFImm : SchedWrite; // Floating-point immediate. +def WriteFMul : SchedWrite; // Floating-point multiply. +def WriteFDiv : SchedWrite; // Floating-point division. + +def WriteV : SchedWrite; // Vector ops. +def WriteVLD : SchedWrite; // Vector loads. +def WriteVST : SchedWrite; // Vector stores. + +// Read the unwritten lanes of the VLD's destination registers. +def ReadVLD : SchedRead; + +// Sequential vector load and shuffle. +def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; +def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; + +// Store a shuffled vector. +def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; +def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/AArch64/AArch64ScheduleA53.td b/lib/Target/AArch64/AArch64ScheduleA53.td deleted file mode 100644 index 20a14e79228a..000000000000 --- a/lib/Target/AArch64/AArch64ScheduleA53.td +++ /dev/null @@ -1,144 +0,0 @@ -//=- AArch64ScheduleA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the itinerary class data for the ARM Cortex A53 processors. -// -//===----------------------------------------------------------------------===// - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. - -// Cortex-A53 machine model for scheduling and other instruction cost heuristics. -def CortexA53Model : SchedMachineModel { - let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. - let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. - let LoadLatency = 2; // Optimistic load latency assuming bypass. - // This is overriden by OperandCycles if the - // Itineraries are queried instead. - let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation - // Specification - Instruction Timings" - // v 1.0 Spreadsheet -} - - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available. - -// Modeling each pipeline as a ProcResource using the default BufferSize = -1. -// Cortex-A53 is in-order and therefore should be using BufferSize = 0. The -// current configuration performs better with the basic latencies provided so -// far. Will revisit BufferSize once the latency information is more accurate. - -let SchedModel = CortexA53Model in { - -def A53UnitALU : ProcResource<2>; // Int ALU -def A53UnitMAC : ProcResource<1>; // Int MAC -def A53UnitDiv : ProcResource<1>; // Int Division -def A53UnitLdSt : ProcResource<1>; // Load/Store -def A53UnitB : ProcResource<1>; // Branch -def A53UnitFPALU : ProcResource<1>; // FP ALU -def A53UnitFPMDS : ProcResource<1>; // FP Mult/Div/Sqrt - - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedWrite types which both map the ProcResources and -// set the latency. - -// Issue - Every instruction must consume an A53WriteIssue. Optionally, -// instructions that cannot be dual-issued will also include the -// A53WriteIssue2nd in their SchedRW list. 
That second WriteRes will -// ensure that a second issue slot is consumed. -def A53WriteIssue : SchedWriteRes<[]>; -def A53WriteIssue2nd : SchedWriteRes<[]> { let Latency = 0; } - -// ALU - These are reduced to 1 despite a true latency of 4 in order to easily -// model forwarding logic. Once forwarding is properly modelled, then -// they'll be corrected. -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -// MAC -def : WriteRes { let Latency = 4; } - -// Div -def : WriteRes { let Latency = 4; } - -// Load - Note: Vector loads take 1-5 cycles to issue. For the WriteVecLd below, -// choosing the median of 3 which makes the latency 6. May model this more -// carefully in the future. -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 6; } - -// Store - Note: Vector stores take 1-3 cycles to issue. For the ReadVecSt below, -// choosing the median of 2 which makes the latency 5. May model this more -// carefully in the future. -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 5; } - -// Branch -def : WriteRes; -def : WriteRes; - -// FP ALU -def : WriteRes {let Latency = 6; } - -// FP MAC, Mul, Div, Sqrt -// Using Double Precision numbers for now as a worst case. Additionally, not -// modeling the exact hazard but instead treating the whole pipe as a hazard. -// As an example VMUL, VMLA, and others are actually pipelined. VDIV and VSQRT -// have a total latency of 33 and 32 respectively but only a hazard of 29 and -// 28 (double-prescion example). -def : WriteRes { let Latency = 10; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 33; - let ResourceCycles = [29]; } -def : WriteRes { let Latency = 32; - let ResourceCycles = [28]; } - - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedRead types. - -// No forwarding defined for ReadALU yet. -def : ReadAdvance; - -// No forwarding defined for ReadCMP yet. -def : ReadAdvance; - -// No forwarding defined for ReadBr yet. -def : ReadAdvance; - -// No forwarding defined for ReadMAC yet. -def : ReadAdvance; - -// No forwarding defined for ReadDiv yet. -def : ReadAdvance; - -// No forwarding defined for ReadLd, ReadPreLd, ReadVecLd yet. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -// No forwarding defined for ReadSt and ReadVecSt yet. -def : ReadAdvance; -def : ReadAdvance; - -// No forwarding defined for ReadFPALU yet. -def : ReadAdvance; - -// No forwarding defined for ReadFPMAC/Mul/Div/Sqrt yet. 
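The deleted A53 model left all of its forwarding adjustments at zero ("No forwarding defined ... yet"). When forwarding is modelled, a non-zero ReadAdvance shortens the latency a consumer sees from its producer; a small sketch using read types from the old file, with made-up cycle counts:

def : ReadAdvance<ReadALU, 0>;   // no bypass modelled: consumer waits the full latency
def : ReadAdvance<ReadMAC, 2>;   // accumulator forwarding: the operand is read 2 cycles
                                 // after issue, so effective producer latency drops by 2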
-def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 17010d41ed42..19c9e6451fe1 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -12,15 +12,49 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetMachine.h" -#include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; -#define DEBUG_TYPE "arm-selectiondag-info" +#define DEBUG_TYPE "aarch64-selectiondag-info" -AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const AArch64TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) { -} +AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL) + : TargetSelectionDAGInfo(DL) {} + +AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast(Src); + ConstantSDNode *SizeValue = dyn_cast(Size); + const char *bzeroEntry = + (V && V->isNullValue()) + ? DAG.getTarget().getSubtarget().getBZeroEntry() + : nullptr; + // For small size (< 256), it is not beneficial to use bzero + // instead of memset. + if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { + const AArch64TargetLowering &TLI = + *static_cast( + DAG.getTarget().getTargetLowering()); -AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() { + EVT IntPtr = TLI.getPointerTy(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + .setDiscardResult(); + std::pair CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + return SDValue(); } diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index d412ed2be180..1180eea6f4c1 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -11,22 +11,23 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64SELECTIONDAGINFO_H -#define LLVM_AARCH64SELECTIONDAGINFO_H +#ifndef AArch64SELECTIONDAGINFO_H +#define AArch64SELECTIONDAGINFO_H #include "llvm/Target/TargetSelectionDAGInfo.h" namespace llvm { -class AArch64TargetMachine; - class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { - const AArch64Subtarget *Subtarget; public: - explicit AArch64SelectionDAGInfo(const AArch64TargetMachine &TM); + explicit AArch64SelectionDAGInfo(const DataLayout *DL); ~AArch64SelectionDAGInfo(); -}; + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; +}; } #endif diff --git a/lib/Target/ARM64/ARM64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp 
similarity index 82% rename from lib/Target/ARM64/ARM64StorePairSuppress.cpp rename to lib/Target/AArch64/AArch64StorePairSuppress.cpp index 5416f11510c3..45f8ddbd2d85 100644 --- a/lib/Target/ARM64/ARM64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -1,4 +1,4 @@ -//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===// +//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===// // // The LLVM Compiler Infrastructure // @@ -11,23 +11,23 @@ // store pairs. Later we may do the same for floating point loads. // ===---------------------------------------------------------------------===// -#include "ARM64InstrInfo.h" +#include "AArch64InstrInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -#define DEBUG_TYPE "arm64-stp-suppress" +#define DEBUG_TYPE "aarch64-stp-suppress" namespace { -class ARM64StorePairSuppress : public MachineFunctionPass { - const ARM64InstrInfo *TII; +class AArch64StorePairSuppress : public MachineFunctionPass { + const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; const MachineRegisterInfo *MRI; MachineFunction *MF; @@ -37,10 +37,10 @@ class ARM64StorePairSuppress : public MachineFunctionPass { public: static char ID; - ARM64StorePairSuppress() : MachineFunctionPass(ID) {} + AArch64StorePairSuppress() : MachineFunctionPass(ID) {} virtual const char *getPassName() const override { - return "ARM64 Store Pair Suppression"; + return "AArch64 Store Pair Suppression"; } bool runOnMachineFunction(MachineFunction &F) override; @@ -57,11 +57,11 @@ class ARM64StorePairSuppress : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } }; -char ARM64StorePairSuppress::ID = 0; +char AArch64StorePairSuppress::ID = 0; } // anonymous -FunctionPass *llvm::createARM64StorePairSuppressPass() { - return new ARM64StorePairSuppress(); +FunctionPass *llvm::createAArch64StorePairSuppressPass() { + return new AArch64StorePairSuppress(); } /// Return true if an STP can be added to this block without increasing the @@ -70,7 +70,7 @@ FunctionPass *llvm::createARM64StorePairSuppressPass() { /// critical path. If the critical path is longer than the resource height, the /// extra vector ops can limit physreg renaming. Otherwise, it could simply /// oversaturate the vector units. -bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { +bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { if (!MinInstr) MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); @@ -79,7 +79,7 @@ bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { // Get the machine model's scheduling class for STPQi. // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. - unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass(); + unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass(); const MCSchedClassDesc *SCDesc = SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); @@ -103,22 +103,22 @@ bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { /// tell us if it's profitable with no cpu knowledge here. 
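shouldAddSTPToBlock above boils down to asking whether the block is resource-bound or latency-bound: pairing is only allowed when the paired store's extra micro-ops do not raise the trace's resource length. A schematic version of that decision, deliberately kept away from the MachineTraceMetrics API:

// Schematic only: the pass computes these lengths via MachineTraceMetrics.
bool mayFormStorePair(unsigned ResLength, unsigned ResLengthWithSTP) {
  // If issuing the STP's micro-ops would lengthen the resource-limited height,
  // the block is already saturating its units (or needs the renaming freedom
  // along its critical path), so keep the stores separate.
  return ResLengthWithSTP <= ResLength;
}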
/// /// FIXME: We plan to develop a decent Target abstraction for simple loads and -/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer. -bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { +/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer. +bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STURSi: - case ARM64::STURDi: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STURSi: + case AArch64::STURDi: return true; } } -bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { +bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { MF = &mf; - TII = static_cast(MF->getTarget().getInstrInfo()); + TII = static_cast(MF->getTarget().getInstrInfo()); TRI = MF->getTarget().getRegisterInfo(); MRI = &MF->getRegInfo(); const TargetSubtargetInfo &ST = diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f88c899cc9c1..bb0b72c585b8 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -1,4 +1,4 @@ -//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information --------------===// +//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,93 +7,124 @@ // //===----------------------------------------------------------------------===// // -// This file implements the AArch64 specific subclass of TargetSubtargetInfo. +// This file implements the AArch64 specific subclass of TargetSubtarget. // //===----------------------------------------------------------------------===// +#include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" -#include "AArch64RegisterInfo.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" using namespace llvm; #define DEBUG_TYPE "aarch64-subtarget" -#define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC #include "AArch64GenSubtargetInfo.inc" -enum AlignMode { - DefaultAlign, - StrictAlign, - NoStrictAlign -}; - -static cl::opt -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(DefaultAlign), - cl::values( - clEnumValN(DefaultAlign, "aarch64-default-align", - "Generate unaligned accesses only on hardware/OS " - "combinations that are known to support them"), - clEnumValN(StrictAlign, "aarch64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "aarch64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - -// Pin the vtable to this file. 
-void AArch64Subtarget::anchor() {} - -AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS, - bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), TargetTriple(TT), - CPUString(CPU), IsLittleEndian(LittleEndian) { +static cl::opt +EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " + "converter pass"), cl::init(true), cl::Hidden); - initializeSubtargetFeatures(CPU, FS); -} - -void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU, - StringRef FS) { - AllowsUnalignedMem = false; +AArch64Subtarget & +AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { + // Determine default and user-specified characteristics - if (CPU.empty()) + if (CPUString.empty()) CPUString = "generic"; - std::string FullFS = FS; - if (CPUString == "generic") { - // Enable FP by default. - if (FullFS.empty()) - FullFS = "+fp-armv8"; + ParseSubtargetFeatures(CPUString, FS); + return *this; +} + +AArch64Subtarget::AArch64Subtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, TargetMachine &TM, + bool LittleEndian) + : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), + TargetTriple(TT), + // This nested ternary is horrible, but DL needs to be properly + // initialized + // before TLInfo is constructed. + DL(isTargetMachO() + ? "e-m:o-i64:64-i128:128-n32:64-S128" + : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128" + : "E-m:e-i64:64-i128:128-n32:64-S128")), + FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), + TSInfo(&DL), TLInfo(TM) {} + +/// ClassifyGlobalReference - Find the target operand flags that describe +/// how a global value should be referenced for the current subtarget. +unsigned char +AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const { + + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + // MachO large model always goes via a GOT, simply to get a single 8-byte + // absolute relocation on all global addresses. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) + return AArch64II::MO_GOT; + + // The small code mode's direct accesses use ADRP, which cannot necessarily + // produce the value 0 (if the code is above 4GB). Therefore they must use the + // GOT. + if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) + return AArch64II::MO_GOT; + + // If symbol visibility is hidden, the extra load is not needed if + // the symbol is definitely defined in the current translation unit. + + // The handling of non-hidden symbols in PIC mode is rather target-dependent: + // + On MachO, if the symbol is defined in this module the GOT can be + // skipped. + // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually + // defined could end up in unexpected places. Use a GOT. + if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { + if (isTargetMachO()) + return (isDecl || GV->isWeakForLinker()) ? 
AArch64II::MO_GOT + : AArch64II::MO_NO_FLAG; else - FullFS = "+fp-armv8," + FullFS; + // No need to go through the GOT for local symbols on ELF. + return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; } - ParseSubtargetFeatures(CPU, FullFS); - - switch (Align) { - case DefaultAlign: - // Linux targets support unaligned accesses on AARCH64 - AllowsUnalignedMem = isTargetLinux(); - break; - case StrictAlign: - AllowsUnalignedMem = false; - break; - case NoStrictAlign: - AllowsUnalignedMem = true; - break; - } + return AArch64II::MO_NO_FLAG; } -bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV, - Reloc::Model RelocM) const { - if (RelocM == Reloc::Static) - return false; +/// This function returns the name of a function which has an interface +/// like the non-standard bzero function, if such a function exists on +/// the current subtarget and it is considered prefereable over +/// memset with zero passed as the second argument. Otherwise it +/// returns null. +const char *AArch64Subtarget::getBZeroEntry() const { + // Prefer bzero on Darwin only. + if(isTargetDarwin()) + return "bzero"; + + return nullptr; +} + +void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const { + // LNT run (at least on Cyclone) showed reasonably significant gains for + // bi-directional scheduling. 253.perlbmk. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; +} - return !GV->hasLocalLinkage() && !GV->hasHiddenVisibility(); +bool AArch64Subtarget::enableEarlyIfConversion() const { + return EnableEarlyIfConvert; } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index dd2b4d211f2d..52124f650c7a 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -1,4 +1,4 @@ -//==-- AArch64Subtarget.h - Define Subtarget for the AArch64 ---*- C++ -*--===// +//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,19 @@ // //===----------------------------------------------------------------------===// // -// This file declares the AArch64 specific subclass of TargetSubtargetInfo. +// This file declares the AArch64 specific subclass of TargetSubtarget. // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_SUBTARGET_H -#define LLVM_TARGET_AARCH64_SUBTARGET_H +#ifndef AArch64SUBTARGET_H +#define AArch64SUBTARGET_H -#include "llvm/ADT/Triple.h" +#include "AArch64InstrInfo.h" +#include "AArch64FrameLowering.h" +#include "AArch64ISelLowering.h" +#include "AArch64RegisterInfo.h" +#include "AArch64SelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include @@ -22,13 +27,12 @@ #include "AArch64GenSubtargetInfo.inc" namespace llvm { -class StringRef; class GlobalValue; +class StringRef; class AArch64Subtarget : public AArch64GenSubtargetInfo { - virtual void anchor(); protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57}; + enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. 
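getBZeroEntry above is the subtarget half of the memset lowering added earlier in this patch: EmitTargetCodeForMemset only forwards a zero-memset to bzero when the subtarget names an entry point and the size is non-constant or above 256 bytes. Condensed into a standalone predicate (a sketch, not an actual helper in the patch):

// Should memset(dst, 0, size) be emitted as a call to bzero(dst, size)?
static bool shouldUseBZero(const char *BZeroEntry,
                           const ConstantSDNode *SizeValue) {
  if (!BZeroEntry)   // subtarget (non-Darwin here) provides no bzero entry point
    return false;
  // Small constant sizes (<= 256 bytes) are better served by plain memset.
  return !SizeValue || SizeValue->getZExtValue() > 256;
}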
ARMProcFamilyEnum ARMProcFamily; @@ -36,54 +40,93 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool HasFPARMv8; bool HasNEON; bool HasCrypto; + bool HasCRC; - /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory - /// accesses for some types. For details, see - /// AArch64TargetLowering::allowsUnalignedMemoryAccesses(). - bool AllowsUnalignedMem; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. + bool HasZeroCycleRegMove; - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; + // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. + bool HasZeroCycleZeroing; /// CPUString - String name of used CPU. std::string CPUString; - /// IsLittleEndian - The target is Little Endian - bool IsLittleEndian; + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + const DataLayout DL; + AArch64FrameLowering FrameLowering; + AArch64InstrInfo InstrInfo; + AArch64SelectionDAGInfo TSInfo; + AArch64TargetLowering TLInfo; private: - void initializeSubtargetFeatures(StringRef CPU, StringRef FS); + /// initializeSubtargetDependencies - Initializes using CPUString and the + /// passed in feature string so that we can use initializer lists for + /// subtarget initialization. + AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); public: /// This constructor initializes the data members to match that /// of the specified triple. - /// - AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS, - bool LittleEndian); + AArch64Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, TargetMachine &TM, bool LittleEndian); - bool enableMachineScheduler() const override { - return true; + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + const AArch64FrameLowering *getFrameLowering() const { + return &FrameLowering; } + const AArch64TargetLowering *getTargetLowering() const { + return &TLInfo; + } + const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + bool enableMachineScheduler() const override { return true; } - /// ParseSubtargetFeatures - Parses features string setting specified - /// subtarget options. Definition of function is auto generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } + bool hasCRC() const { return HasCRC; } + + bool isLittleEndian() const { return DL.isLittleEndian(); } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isCyclone() const { return CPUString == "cyclone"; } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return 64; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. 
Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// ClassifyGlobalReference - Find the target operand flags that describe + /// how a global value should be referenced for the current subtarget. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; - bool allowsUnalignedMem() const { return AllowsUnalignedMem; } + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; - bool isLittle() const { return IsLittleEndian; } + void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const override; - const std::string & getCPUString() const { return CPUString; } + bool enableEarlyIfConversion() const override; }; } // End llvm namespace -#endif // LLVM_TARGET_AARCH64_SUBTARGET_H +#endif // AArch64SUBTARGET_H diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6bd6f5912d79..f99b90b800f4 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -7,42 +7,77 @@ // //===----------------------------------------------------------------------===// // -// This file contains the implementation of the AArch64TargetMachine -// methods. Principally just setting up the passes needed to generate correct -// code on this architecture. // //===----------------------------------------------------------------------===// #include "AArch64.h" #include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" - using namespace llvm; +static cl::opt +EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" + " integer instructions"), cl::init(false), cl::Hidden); + +static cl::opt +EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote " + "constant pass"), cl::init(true), cl::Hidden); + +static cl::opt +EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the" + " linker optimization hints (LOH)"), cl::init(true), + cl::Hidden); + +static cl::opt +EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, + cl::desc("Enable the pass that removes dead" + " definitons and replaces stores to" + " them with stores to the zero" + " register"), + cl::init(true)); + +static cl::opt +EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" + " optimization pass"), cl::init(true), cl::Hidden); + +static cl::opt +EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden, + cl::desc("Run SimplifyCFG after expanding atomic operations" + " to make use of cmpxchg flow-based information"), + cl::init(true)); + extern "C" void LLVMInitializeAArch64Target() { + 
// Register the target. RegisterTargetMachine X(TheAArch64leTarget); RegisterTargetMachine Y(TheAArch64beTarget); + + RegisterTargetMachine Z(TheARM64leTarget); + RegisterTargetMachine W(TheARM64beTarget); } +/// TargetMachine ctor - Create an AArch64 architecture model. +/// AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, LittleEndian), - InstrInfo(Subtarget), - DL(LittleEndian ? - "e-m:e-i64:64-i128:128-n32:64-S128" : - "E-m:e-i64:64-i128:128-n32:64-S128"), - TLInfo(*this), - TSInfo(*this), - FrameLowering(Subtarget) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, LittleEndian) { initAsmInfo(); } @@ -64,58 +99,118 @@ AArch64beTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} -void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our AArch64 pass. This - // allows the AArch64 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createAArch64TargetTransformInfoPass(this)); -} - namespace { /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) {} AArch64TargetMachine &getAArch64TargetMachine() const { return getTM(); } - const AArch64Subtarget &getAArch64Subtarget() const { - return *getAArch64TargetMachine().getSubtargetImpl(); - } - + void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; + bool addILPOpts() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; bool addPreEmitPass() override; }; } // namespace +void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our AArch64 pass. This + // allows the AArch64 pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(this)); + PM.add(createAArch64TargetTransformInfoPass(this)); +} + +TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { + return new AArch64PassConfig(this, PM); +} + +void AArch64PassConfig::addIRPasses() { + // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg + // ourselves. + addPass(createAtomicExpandLoadLinkedPass(TM)); + + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. 
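The guarded addPass call that follows this comment is one instance of the idiom used throughout this file: a static cl::opt flag combined with an optimization-level check before the pass is added. In isolation the pattern looks like this; the flag and factory names are placeholders:

static cl::opt<bool>
EnableExampleOpt("aarch64-example-opt",
                 cl::desc("Enable the (hypothetical) example cleanup pass"),
                 cl::init(true), cl::Hidden);

// ...and inside the relevant AArch64PassConfig hook:
//   if (TM->getOptLevel() != CodeGenOpt::None && EnableExampleOpt)
//     addPass(createAArch64ExampleCleanupPass());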
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) + addPass(createCFGSimplificationPass()); + + TargetPassConfig::addIRPasses(); +} + +// Pass Pipeline Configuration bool AArch64PassConfig::addPreISel() { + // Run promote constant before global merge, so that the promoted constants + // get a chance to be merged + if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) + addPass(createAArch64PromoteConstantPass()); if (TM->getOptLevel() != CodeGenOpt::None) addPass(createGlobalMergePass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64AddressTypePromotionPass()); return false; } -TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new AArch64PassConfig(this, PM); +bool AArch64PassConfig::addInstSelector() { + addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many + // references to _TLS_MODULE_BASE_ as possible. + if (TM->getSubtarget().isTargetELF() && + getOptLevel() != CodeGenOpt::None) + addPass(createAArch64CleanupLocalDynamicTLSPass()); + + return false; } -bool AArch64PassConfig::addPreEmitPass() { - addPass(&UnpackMachineBundlesID); - addPass(createAArch64BranchFixupPass()); +bool AArch64PassConfig::addILPOpts() { + if (EnableCCMP) + addPass(createAArch64ConditionalCompares()); + addPass(&EarlyIfConverterID); + if (EnableStPairSuppress) + addPass(createAArch64StorePairSuppressPass()); return true; } -bool AArch64PassConfig::addInstSelector() { - addPass(createAArch64ISelDAG(getAArch64TargetMachine(), getOptLevel())); +bool AArch64PassConfig::addPreRegAlloc() { + // Use AdvSIMD scalar instructions whenever profitable. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) + addPass(createAArch64AdvSIMDScalar()); + return true; +} - // For ELF, cleanup any local-dynamic TLS accesses. - if (getAArch64Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) - addPass(createAArch64CleanupLocalDynamicTLSPass()); +bool AArch64PassConfig::addPostRegAlloc() { + // Change dead register definitions to refer to the zero register. + if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) + addPass(createAArch64DeadRegisterDefinitions()); + return true; +} - return false; +bool AArch64PassConfig::addPreSched2() { + // Expand some pseudo instructions to allow proper scheduling. + addPass(createAArch64ExpandPseudoPass()); + // Use load/store pair instructions when possible. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + return true; +} + +bool AArch64PassConfig::addPreEmitPass() { + // Relax conditional branch instructions if they're otherwise out of + // range of their destination. 
+ addPass(createAArch64BranchRelaxation()); + if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && + TM->getSubtarget().isTargetMachO()) + addPass(createAArch64CollectLOHPass()); + return true; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 3800635e0fac..6a159e91e22f 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -1,4 +1,4 @@ -//=== AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-===// +//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -11,59 +11,56 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64TARGETMACHINE_H -#define LLVM_AARCH64TARGETMACHINE_H +#ifndef AArch64TARGETMACHINE_H +#define AArch64TARGETMACHINE_H -#include "AArch64FrameLowering.h" -#include "AArch64ISelLowering.h" #include "AArch64InstrInfo.h" -#include "AArch64SelectionDAGInfo.h" +#include "AArch64ISelLowering.h" #include "AArch64Subtarget.h" +#include "AArch64FrameLowering.h" +#include "AArch64SelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCStreamer.h" namespace llvm { class AArch64TargetMachine : public LLVMTargetMachine { - AArch64Subtarget Subtarget; - AArch64InstrInfo InstrInfo; - const DataLayout DL; - AArch64TargetLowering TLInfo; - AArch64SelectionDAGInfo TSInfo; - AArch64FrameLowering FrameLowering; +protected: + AArch64Subtarget Subtarget; public: AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian); + CodeGenOpt::Level OL, bool IsLittleEndian); - const AArch64InstrInfo *getInstrInfo() const override { - return &InstrInfo; + const AArch64Subtarget *getSubtargetImpl() const override { + return &Subtarget; + } + const AArch64TargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const AArch64FrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); } - - const AArch64TargetLowering *getTargetLowering() const override { - return &TLInfo; + const AArch64InstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const AArch64RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); } - const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - const AArch64Subtarget *getSubtargetImpl() const override { return &Subtarget; } - - const DataLayout *getDataLayout() const override { return &DL; } - - const TargetRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } + // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + /// \brief Register AArch64 analysis passes with a pass manager. 
void addAnalysisPasses(PassManagerBase &PM) override; }; @@ -72,8 +69,8 @@ class AArch64TargetMachine : public LLVMTargetMachine { class AArch64leTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: - AArch64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -83,12 +80,12 @@ class AArch64leTargetMachine : public AArch64TargetMachine { class AArch64beTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: - AArch64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; -} // End llvm namespace +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 663d61944a8f..4069038dffe7 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -6,19 +6,47 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file deals with any AArch64 specific requirements on object files. -// -//===----------------------------------------------------------------------===// - #include "AArch64TargetObjectFile.h" - +#include "AArch64TargetMachine.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Dwarf.h" using namespace llvm; +using namespace dwarf; -void -AArch64ElfTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { +void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } + +const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. 
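  // Worked example (illustrative): for a typeinfo symbol such as __ZTI3Foo the
  // code below emits a temporary label Ltmp at the current location and builds
  // the expression
  //   __ZTI3Foo@GOT - Ltmp
  // i.e. a PC-relative offset to the symbol's GOT slot, which is what the
  // DW_EH_PE_indirect | DW_EH_PE_pcrel encoding calls for.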
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); + return MCBinaryExpr::CreateSub(Res, PC, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 6e57103a426a..de63cb42542a 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -1,4 +1,4 @@ -//===-- AArch64TargetObjectFile.h - AArch64 Object Info ---------*- C++ -*-===// +//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===// // // The LLVM Compiler Infrastructure // @@ -6,25 +6,34 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file deals with any AArch64 specific requirements on object files. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H -#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H +#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H +#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" namespace llvm { +class AArch64TargetMachine; + +/// This implementation is used for AArch64 ELF targets (Linux in particular). +class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; +}; + +/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. +class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { +public: + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, + unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; - /// AArch64ElfTargetObjectFile - This implementation is used for ELF - /// AArch64 targets. 
- class AArch64ElfTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; + MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; +}; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0228d123bc6f..1dac14b96af9 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1,4 +1,4 @@ -//===- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass ---------===// +//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===// // // The LLVM Compiler Infrastructure // @@ -16,10 +16,12 @@ #include "AArch64.h" #include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" +#include using namespace llvm; #define DEBUG_TYPE "aarch64tti" @@ -34,25 +36,28 @@ void initializeAArch64TTIPass(PassRegistry &); namespace { class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { + const AArch64TargetMachine *TM; const AArch64Subtarget *ST; const AArch64TargetLowering *TLI; + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + public: - AArch64TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { + AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } AArch64TTI(const AArch64TargetMachine *TM) - : ImmutablePass(ID), ST(TM->getSubtargetImpl()), + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), TLI(TM->getTargetLowering()) { initializeAArch64TTIPass(*PassRegistry::getPassRegistry()); } - virtual void initializePass() override { - pushTTIStack(this); - } + void initializePass() override { pushTTIStack(this); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { TargetTransformInfo::getAnalysisUsage(AU); } @@ -60,18 +65,24 @@ class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { static char ID; /// Provide necessary pointer adjustments for the two base classes. 
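  // How this pass is reached: AArch64TargetMachine::addAnalysisPasses() adds
  // the generic BasicTargetTransformInfo pass first and this AArch64 pass
  // second, so TTI queries (getIntImmCost, getMemoryOpCost, ...) are answered
  // by the overrides below and everything else falls back to the target
  // independent layer, as the comment in addAnalysisPasses() describes.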
- virtual void *getAdjustedAnalysisPointer(const void *ID) override { + void *getAdjustedAnalysisPointer(const void *ID) override { if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo*)this; + return (TargetTransformInfo *)this; return this; } /// \name Scalar TTI Implementations /// @{ + unsigned getIntImmCost(int64_t Val) const; + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; + unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; /// @} - /// \name Vector TTI Implementations /// @{ @@ -81,7 +92,7 @@ class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { return 32; return 0; } - return 32; + return 31; } unsigned getRegisterBitWidth(bool Vector) const override { @@ -94,6 +105,25 @@ class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { } unsigned getMaximumUnrollFactor() const override { return 2; } + + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const + override; + + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const + override; + + unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue) const + override; + + unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; + + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const + override; + + unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const override; /// @} }; @@ -107,3 +137,364 @@ ImmutablePass * llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) { return new AArch64TTI(TM); } + +/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// method might only calculate a fraction of a larger immediate. Therefore it +/// is valid to return a cost of ZERO. +unsigned AArch64TTI::getIntImmCost(int64_t Val) const { + // Check if the immediate can be encoded within an instruction. + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) + return 0; + + if (Val < 0) + Val = ~Val; + + // Calculate how many moves we will need to materialize this constant. + unsigned LZ = countLeadingZeros((uint64_t)Val); + return (64 - LZ + 15) / 16; +} + +/// \brief Calculate the cost of materializing the given constant. +unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + // Sign-extend all constants to a multiple of 64-bit. + APInt ImmVal = Imm; + if (BitSize & 0x3f) + ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + + // Split the constant into 64-bit chunks and calculate the cost for each + // chunk. + unsigned Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { + APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); + int64_t Val = Tmp.getSExtValue(); + Cost += getIntImmCost(Val); + } + // We need at least one instruction to materialze the constant. + return std::max(1U, Cost); +} + +unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. 
Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TCC_Free; + + unsigned ImmIdx = ~0U; + switch (Opcode) { + default: + return TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. + if (Idx == 0) + return 2 * TCC_Basic; + return TCC_Free; + case Instruction::Store: + ImmIdx = 0; + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + ImmIdx = 1; + break; + // Always return TCC_Free for the shift value of a shift instruction. + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (Idx == 1) + return TCC_Free; + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::IntToPtr: + case Instruction::PtrToInt: + case Instruction::BitCast: + case Instruction::PHI: + case Instruction::Call: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + break; + } + + if (Idx == ImmIdx) { + unsigned NumConstants = (BitSize + 63) / 64; + unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TCC_Basic) + ? static_cast(TCC_Free) : Cost; + } + return AArch64TTI::getIntImmCost(Imm, Ty); +} + +unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TCC_Free; + + switch (IID) { + default: + return TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + if (Idx == 1) { + unsigned NumConstants = (BitSize + 63) / 64; + unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TCC_Basic) + ? static_cast(TCC_Free) : Cost; + } + break; + case Intrinsic::experimental_stackmap: + if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TCC_Free; + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TCC_Free; + break; + } + return AArch64TTI::getIntImmCost(Imm, Ty); +} + +AArch64TTI::PopcntSupportKind +AArch64TTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + if (TyWidth == 32 || TyWidth == 64) + return PSK_FastHardware; + // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 
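  // Worked examples for the getIntImmCost() helpers above:
  //  * 0xffff is a valid 64-bit logical immediate, so its cost is 0 (it can be
  //    encoded directly in an AND/ORR/EOR style instruction).
  //  * 0x1234567812345678 is not: countLeadingZeros gives 3, so the estimate
  //    is (64 - 3 + 15) / 16 = 4 chunks, roughly one MOVZ plus three MOVKs.
  //  * An i128 constant is sign-extended to 128 bits and costed as two such
  //    64-bit chunks, with a floor of one instruction overall.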
+ return PSK_Software; +} + +unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + + static const TypeConversionCostTblEntry ConversionTbl[] = { + // LowerVectorINT_TO_FP: + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + + // Complex: to v2f32 + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, + + // Complex: to v4f32 + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + + // Complex: to v2f64 + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + + + // LowerVectorFP_TO_INT + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + + // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, + + // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, + + // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, + }; + + int Idx = ConvertCostTableLookup( + ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT()); + if (Idx != -1) + return ConversionTbl[Idx].Cost; + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} + +unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. 
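  // Example of what the normalization below does: a v4i64 value is split by
  // legalization into two v2i64 registers, so extracting element 2 maps to
  // lane 0 of the second half and is treated as free, while extracting element
  // 1 or 3 still pays the cross-lane cost of 2.  Likewise, a lookup in the
  // conversion table above prices a v4i16 -> v4f32 sint_to_fp at 2 (a widen
  // plus a convert) versus 1 for the already-legal v4i32 -> v4f32 case.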
+ std::pair LT = TLI->getTypeLegalizationCost(Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // The element at index zero is already inside the vector. + if (Index == 0) + return 0; + } + + // All other insert/extracts cost this much. + return 2; +} + +unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Opd1Info, + OperandValueKind Opd2Info) const { + // Legalize the type. + std::pair LT = TLI->getTypeLegalizationCost(Ty); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + + switch (ISD) { + default: + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, + Opd2Info); + case ISD::ADD: + case ISD::MUL: + case ISD::XOR: + case ISD::OR: + case ISD::AND: + // These nodes are marked as 'custom' for combining purposes only. + // We know that they are legal. See LowerAdd in ISelLowering. + return 1 * LT.first; + } +} + +unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + // In many cases the address computation is not merged into the instruction + // addressing mode. + return 1; +} + +unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + // We don't lower vector selects well that are wider than the register width. + if (ValTy->isVectorTy() && ISD == ISD::SELECT) { + // We would need this many instructions to hide the scalarization happening. + unsigned AmortizationCost = 20; + static const TypeConversionCostTblEntry + VectorSelectTbl[] = { + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, + { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } + }; + + EVT SelCondTy = TLI->getValueType(CondTy); + EVT SelValTy = TLI->getValueType(ValTy); + if (SelCondTy.isSimple() && SelValTy.isSimple()) { + int Idx = + ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT()); + if (Idx != -1) + return VectorSelectTbl[Idx].Cost; + } + } + return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + std::pair LT = TLI->getTypeLegalizationCost(Src); + + if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && + Src->getVectorElementType()->isIntegerTy(64)) { + // Unaligned stores are extremely inefficient. We don't split + // unaligned v2i64 stores because the negative impact that has shown in + // practice on inlined memcpy code. + // We make v2i64 stores expensive so that we will only vectorize if there + // are 6 other instructions getting vectorized. 
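  // Concretely: an under-aligned v2i64 store is priced at LT.first * 2 * 6,
  // i.e. 12 for a single legal v2i64, versus 1 when it is 16-byte aligned.
  // The small-i8-vector case further down is harsher still: a v4i8 load or
  // store costs (4 * 2) * 4 * 2 = 64, reflecting full scalarization through
  // v.4h since there is no v.4b register.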
+ unsigned AmortizationCost = 6; + + return LT.first * 2 * AmortizationCost; + } + + if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && + Src->getVectorNumElements() < 8) { + // We scalarize the loads/stores because there is not v.4b register and we + // have to promote the elements to v.4h. + unsigned NumVecElts = Src->getVectorNumElements(); + unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; + // We generate 2 instructions per vector element. + return NumVectorizableInstsToAmortize * NumVecElts * 2; + } + + return LT.first; +} diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 9fe3497c6a1c..f861df0bf99f 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -6,34 +6,31 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the (GNU-style) assembly parser for the AArch64 -// architecture. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" - +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include using namespace llvm; namespace { @@ -41,22 +38,71 @@ namespace { class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { +private: + StringRef Mnemonic; ///< Instruction mnemonic. 
MCSubtargetInfo &STI; MCAsmParser &Parser; + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + SMLoc getLoc() const { return Parser.getTok().getLoc(); } + + bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + AArch64CC::CondCode parseCondCodeString(StringRef Cond); + bool parseCondCode(OperandVector &Operands, bool invertCondCode); + int tryParseRegister(); + int tryMatchVectorRegister(StringRef &Kind, bool expected); + bool parseRegister(OperandVector &Operands); + bool parseSymbolicImmVal(const MCExpr *&ImmVal); + bool parseVectorList(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode); + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool showMatchError(SMLoc Loc, unsigned ErrCode); + + bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveTLSDescCall(SMLoc L); + + bool parseDirectiveLOH(StringRef LOH, SMLoc L); + + bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) override; +/// @name Auto-generated Match Functions +/// { + #define GET_ASSEMBLER_HEADER #include "AArch64GenAsmMatcher.inc" + /// } + + OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); + OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); + OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); + OperandMatchResultTy tryParseSysReg(OperandVector &Operands); + OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); + OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); + OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); + OperandMatchResultTy tryParseFPImm(OperandVector &Operands); + OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); + OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); + bool tryParseVectorRegister(OperandVector &Operands); + public: enum AArch64MatchResultTy { - Match_FirstAArch64 = FIRST_TARGET_MATCH_RESULT_TY, + Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) + const MCInstrInfo &MII, + const MCTargetOptions &Options) : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { MCAsmParserExtension::Initialize(_Parser); @@ -64,191 +110,197 @@ class AArch64AsmParser : public MCTargetAsmParser { setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } - // These are the public interface of the MCTargetAsmParser - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands) override; - + SMLoc NameLoc, OperandVector &Operands) override; + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseDirective(AsmToken DirectiveID) override; - bool ParseDirectiveTLSDescCall(SMLoc L); - bool ParseDirectiveWord(unsigned Size, SMLoc L); - - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - 
MCStreamer&Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) override; - - // The rest of the sub-parsers have more freedom over interface: they return - // an OperandMatchResultTy because it's less ambiguous than true/false or - // -1/0/1 even if it is more verbose - OperandMatchResultTy - ParseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic); - - OperandMatchResultTy ParseImmediate(const MCExpr *&ExprVal); - - OperandMatchResultTy ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind); - - OperandMatchResultTy - ParseNEONLane(SmallVectorImpl &Operands, - uint32_t NumLanes); - - OperandMatchResultTy - ParseRegister(SmallVectorImpl &Operands, - uint32_t &NumLanes); - - OperandMatchResultTy - ParseImmWithLSLOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseCondCodeOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseCRxOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseFPImmOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseFPImm0AndImm0Operand( SmallVectorImpl &Operands); - - template OperandMatchResultTy - ParseNamedImmOperand(SmallVectorImpl &Operands) { - return ParseNamedImmOperand(SomeNamedImmMapper(), Operands); - } - - OperandMatchResultTy - ParseNamedImmOperand(const NamedImmMapper &Mapper, - SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseLSXAddressOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseShiftExtend(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseSysRegOperand(SmallVectorImpl &Operands); - - bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout, - SMLoc &LayoutLoc); - - OperandMatchResultTy ParseVectorList(SmallVectorImpl &); - - bool validateInstruction(MCInst &Inst, - const SmallVectorImpl &Operands); - - /// Scan the next token (which had better be an identifier) and determine - /// whether it represents a general-purpose or vector register. It returns - /// true if an identifier was found and populates its reference arguments. It - /// does not consume the token. - bool - IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, StringRef &LayoutSpec, - SMLoc &LayoutLoc) const; + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; + static bool classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend); }; - -} +} // end anonymous namespace namespace { -/// Instances of this class represent a parsed AArch64 machine instruction. +/// AArch64Operand - Instances of this class represent a parsed AArch64 machine +/// instruction. class AArch64Operand : public MCParsedAsmOperand { private: enum KindTy { - k_ImmWithLSL, // #uimm {, LSL #amt } - k_CondCode, // eq/ne/... - k_FPImmediate, // Limited-precision floating-point imm - k_Immediate, // Including expressions referencing symbols + k_Immediate, + k_ShiftedImm, + k_CondCode, k_Register, + k_VectorList, + k_VectorIndex, + k_Token, + k_SysReg, + k_SysCR, + k_Prefetch, k_ShiftExtend, - k_VectorList, // A sequential list of 1 to 4 registers. - k_SysReg, // The register operand of MRS and MSR instructions - k_Token, // The mnemonic; other raw tokens the auto-generated - k_WrappedRegister // Load/store exclusive permit a wrapped register. 
+ k_FPImm, + k_Barrier } Kind; SMLoc StartLoc, EndLoc; - struct ImmWithLSLOp { - const MCExpr *Val; - unsigned ShiftAmount; - bool ImplicitAmount; + struct TokOp { + const char *Data; + unsigned Length; + bool IsSuffix; // Is the operand actually a suffix on the mnemonic. }; - struct CondCodeOp { - A64CC::CondCodes Code; + struct RegOp { + unsigned RegNum; + bool isVector; }; - struct FPImmOp { - double Val; + struct VectorListOp { + unsigned RegNum; + unsigned Count; + unsigned NumElements; + unsigned ElementKind; + }; + + struct VectorIndexOp { + unsigned Val; }; struct ImmOp { const MCExpr *Val; }; - struct RegOp { - unsigned RegNum; + struct ShiftedImmOp { + const MCExpr *Val; + unsigned ShiftAmount; }; - struct ShiftExtendOp { - A64SE::ShiftExtSpecifiers ShiftType; - unsigned Amount; - bool ImplicitAmount; + struct CondCodeOp { + AArch64CC::CondCode Code; }; - // A vector register list is a sequential list of 1 to 4 registers. - struct VectorListOp { - unsigned RegNum; - unsigned Count; - A64Layout::VectorLayout Layout; + struct FPImmOp { + unsigned Val; // Encoded 8-bit representation. + }; + + struct BarrierOp { + unsigned Val; // Not the enum since not all values have names. }; struct SysRegOp { const char *Data; unsigned Length; + uint64_t FeatureBits; // We need to pass through information about which + // core we are compiling for so that the SysReg + // Mappers can appropriately conditionalize. }; - struct TokOp { - const char *Data; - unsigned Length; + struct SysCRImmOp { + unsigned Val; + }; + + struct PrefetchOp { + unsigned Val; + }; + + struct ShiftExtendOp { + AArch64_AM::ShiftExtendType Type; + unsigned Amount; + bool HasExplicitAmount; + }; + + struct ExtendOp { + unsigned Val; }; union { - struct ImmWithLSLOp ImmWithLSL; - struct CondCodeOp CondCode; - struct FPImmOp FPImm; - struct ImmOp Imm; + struct TokOp Tok; struct RegOp Reg; - struct ShiftExtendOp ShiftExtend; struct VectorListOp VectorList; + struct VectorIndexOp VectorIndex; + struct ImmOp Imm; + struct ShiftedImmOp ShiftedImm; + struct CondCodeOp CondCode; + struct FPImmOp FPImm; + struct BarrierOp Barrier; struct SysRegOp SysReg; - struct TokOp Tok; + struct SysCRImmOp SysCRImm; + struct PrefetchOp Prefetch; + struct ShiftExtendOp ShiftExtend; }; - AArch64Operand(KindTy K, SMLoc S, SMLoc E) - : MCParsedAsmOperand(), Kind(K), StartLoc(S), EndLoc(E) {} + // Keep the MCContext around as the MCExprs may need manipulated during + // the add<>Operands() calls. 
+ MCContext &Ctx; public: - AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand() { + AArch64Operand(KindTy K, MCContext &_Ctx) + : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} + + AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case k_Token: + Tok = o.Tok; + break; + case k_Immediate: + Imm = o.Imm; + break; + case k_ShiftedImm: + ShiftedImm = o.ShiftedImm; + break; + case k_CondCode: + CondCode = o.CondCode; + break; + case k_FPImm: + FPImm = o.FPImm; + break; + case k_Barrier: + Barrier = o.Barrier; + break; + case k_Register: + Reg = o.Reg; + break; + case k_VectorList: + VectorList = o.VectorList; + break; + case k_VectorIndex: + VectorIndex = o.VectorIndex; + break; + case k_SysReg: + SysReg = o.SysReg; + break; + case k_SysCR: + SysCRImm = o.SysCRImm; + break; + case k_Prefetch: + Prefetch = o.Prefetch; + break; + case k_ShiftExtend: + ShiftExtend = o.ShiftExtend; + break; + } } + /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const override { return EndLoc; } - void print(raw_ostream&) const override; - void dump() const override; StringRef getToken() const { assert(Kind == k_Token && "Invalid access!"); return StringRef(Tok.Data, Tok.Length); } - unsigned getReg() const override { - assert((Kind == k_Register || Kind == k_WrappedRegister) - && "Invalid access!"); - return Reg.RegNum; + bool isTokenSuffix() const { + assert(Kind == k_Token && "Invalid access!"); + return Tok.IsSuffix; } const MCExpr *getImm() const { @@ -256,731 +308,779 @@ class AArch64Operand : public MCParsedAsmOperand { return Imm.Val; } - A64CC::CondCodes getCondCode() const { - assert(Kind == k_CondCode && "Invalid access!"); - return CondCode.Code; + const MCExpr *getShiftedImmVal() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.Val; } - static bool isNonConstantExpr(const MCExpr *E, - AArch64MCExpr::VariantKind &Variant) { - if (const AArch64MCExpr *A64E = dyn_cast(E)) { - Variant = A64E->getKind(); - return true; - } else if (!isa(E)) { - Variant = AArch64MCExpr::VK_AARCH64_None; - return true; - } - - return false; + unsigned getShiftedImmShift() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.ShiftAmount; } - bool isCondCode() const { return Kind == k_CondCode; } - bool isToken() const override { return Kind == k_Token; } - bool isReg() const override { return Kind == k_Register; } - bool isImm() const override { return Kind == k_Immediate; } - bool isMem() const override { return false; } - bool isFPImm() const { return Kind == k_FPImmediate; } - bool isShiftOrExtend() const { return Kind == k_ShiftExtend; } - bool isSysReg() const { return Kind == k_SysReg; } - bool isImmWithLSL() const { return Kind == k_ImmWithLSL; } - bool isWrappedReg() const { return Kind == k_WrappedRegister; } - - bool isAddSubImmLSL0() const { - if (!isImmWithLSL()) return false; - if (ImmWithLSL.ShiftAmount != 0) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(ImmWithLSL.Val, Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_LO12 - || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12 - || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC - || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12 - || Variant == 
AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC - || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC_LO12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(ImmWithLSL.Val); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + AArch64CC::CondCode getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CondCode.Code; } - bool isAddSubImmLSL12() const { - if (!isImmWithLSL()) return false; - if (ImmWithLSL.ShiftAmount != 12) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(ImmWithLSL.Val, Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_DTPREL_HI12 - || Variant == AArch64MCExpr::VK_AARCH64_TPREL_HI12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(ImmWithLSL.Val); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + unsigned getFPImm() const { + assert(Kind == k_FPImm && "Invalid access!"); + return FPImm.Val; } - template bool isAddrRegExtend() const { - if (!isShiftOrExtend()) return false; - - A64SE::ShiftExtSpecifiers Ext = ShiftExtend.ShiftType; - if (RmSize == 32 && !(Ext == A64SE::UXTW || Ext == A64SE::SXTW)) - return false; - - if (RmSize == 64 && !(Ext == A64SE::LSL || Ext == A64SE::SXTX)) - return false; - - return ShiftExtend.Amount == Log2_32(MemSize) || ShiftExtend.Amount == 0; + unsigned getBarrier() const { + assert(Kind == k_Barrier && "Invalid access!"); + return Barrier.Val; } - bool isAdrpLabel() const { - if (!isImm()) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(getImm(), Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_None - || Variant == AArch64MCExpr::VK_AARCH64_GOT - || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL - || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC; - } - - return isLabel<21, 4096>(); + unsigned getReg() const override { + assert(Kind == k_Register && "Invalid access!"); + return Reg.RegNum; } - template bool isBitfieldWidth() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - - return CE->getValue() >= 1 && CE->getValue() <= RegWidth; + unsigned getVectorListStart() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.RegNum; } - template - bool isCVTFixedPos() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - - return CE->getValue() >= 1 && CE->getValue() <= RegWidth; + unsigned getVectorListCount() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.Count; } - bool isFMOVImm() const { - if (!isFPImm()) return false; - - APFloat RealVal(FPImm.Val); - uint32_t ImmVal; - return A64Imms::isFPImm(RealVal, ImmVal); + unsigned getVectorIndex() const { + assert(Kind == k_VectorIndex && "Invalid access!"); + return VectorIndex.Val; } - bool isFPZero() const { - if (!isFPImm()) return false; - - APFloat RealVal(FPImm.Val); - return RealVal.isPosZero(); + StringRef getSysReg() const { + assert(Kind == k_SysReg && "Invalid access!"); + return StringRef(SysReg.Data, SysReg.Length); } - template - bool isLabel() const { - if (!isImm()) return false; - - if (dyn_cast(Imm.Val)) { - return true; - } else if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (scale * (1LL << (field_width - 1))); - int64_t Max = scale * ((1LL << (field_width - 1)) - 1); - return (Val % scale) == 0 && Val >= Min && Val <= Max; - } - - // N.b. 
this disallows explicit relocation specifications via an - // AArch64MCExpr. Users needing that behaviour - return false; + uint64_t getSysRegFeatureBits() const { + assert(Kind == k_SysReg && "Invalid access!"); + return SysReg.FeatureBits; } - bool isLane1() const { - if (!isImm()) return false; - - // Because it's come through custom assembly parsing, it must always be a - // constant expression. - return cast(getImm())->getValue() == 1; + unsigned getSysCR() const { + assert(Kind == k_SysCR && "Invalid access!"); + return SysCRImm.Val; } - bool isLoadLitLabel() const { - if (!isImm()) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(getImm(), Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_None - || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL; - } - - return isLabel<19, 4>(); + unsigned getPrefetch() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return Prefetch.Val; } - template bool isLogicalImm() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(Imm.Val); - if (!CE) return false; - - uint32_t Bits; - return A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits); + AArch64_AM::ShiftExtendType getShiftExtendType() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Type; } - template bool isLogicalImmMOV() const { - if (!isLogicalImm()) return false; - - const MCConstantExpr *CE = cast(Imm.Val); - - // The move alias for ORR is only valid if the immediate cannot be - // represented with a move (immediate) instruction; they take priority. - int UImm16, Shift; - return !A64Imms::isMOVZImm(RegWidth, CE->getValue(), UImm16, Shift) - && !A64Imms::isMOVNImm(RegWidth, CE->getValue(), UImm16, Shift); + unsigned getShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Amount; } - template - bool isOffsetUImm12() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - - // Assume they know what they're doing for now if they've given us a - // non-constant expression. In principle we could check for ridiculous - // things that can't possibly work or relocations that would almost - // certainly break resulting code. - if (!CE) - return true; - - int64_t Val = CE->getValue(); - - // Must be a multiple of the access size in bytes. - if ((Val & (MemSize - 1)) != 0) return false; - - // Must be 12-bit unsigned - return Val >= 0 && Val <= 0xfff * MemSize; + bool hasShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.HasExplicitAmount; } - template - bool isShift() const { - if (!isShiftOrExtend()) return false; - - if (ShiftExtend.ShiftType != SHKind) + bool isImm() const override { return Kind == k_Immediate; } + bool isMem() const override { return false; } + bool isSImm9() const { + if (!isImm()) return false; - - return is64Bit ? 
ShiftExtend.Amount <= 63 : ShiftExtend.Amount <= 31; - } - - bool isMOVN32Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(32, PermittedModifiers, NumModifiers); - } - - bool isMOVN64Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_SABS_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G2, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(64, PermittedModifiers, NumModifiers); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val < 256); } - - - bool isMOVZ32Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0, - AArch64MCExpr::VK_AARCH64_ABS_G1, - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(32, PermittedModifiers, NumModifiers); - } - - bool isMOVZ64Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0, - AArch64MCExpr::VK_AARCH64_ABS_G1, - AArch64MCExpr::VK_AARCH64_ABS_G2, - AArch64MCExpr::VK_AARCH64_ABS_G3, - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_SABS_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G2, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(64, PermittedModifiers, NumModifiers); + bool isSImm7s4() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val <= 252 && (Val & 3) == 0); } - - bool isMOVK32Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0_NC, - AArch64MCExpr::VK_AARCH64_ABS_G1_NC, - AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(32, PermittedModifiers, NumModifiers); - } - - bool isMOVK64Imm() const { - static const AArch64MCExpr::VariantKind 
PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0_NC, - AArch64MCExpr::VK_AARCH64_ABS_G1_NC, - AArch64MCExpr::VK_AARCH64_ABS_G2_NC, - AArch64MCExpr::VK_AARCH64_ABS_G3, - AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(64, PermittedModifiers, NumModifiers); + bool isSImm7s8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -512 && Val <= 504 && (Val & 7) == 0); } - - bool isMoveWideImm(unsigned RegWidth, - const AArch64MCExpr::VariantKind *PermittedModifiers, - unsigned NumModifiers) const { - if (!isImmWithLSL()) return false; - - if (ImmWithLSL.ShiftAmount % 16 != 0) return false; - if (ImmWithLSL.ShiftAmount >= RegWidth) return false; - - AArch64MCExpr::VariantKind Modifier; - if (isNonConstantExpr(ImmWithLSL.Val, Modifier)) { - // E.g. "#:abs_g0:sym, lsl #16" makes no sense. - if (!ImmWithLSL.ImplicitAmount) return false; - - for (unsigned i = 0; i < NumModifiers; ++i) - if (PermittedModifiers[i] == Modifier) return true; - + bool isSImm7s16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; + int64_t Val = MCE->getValue(); + return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); + } + + bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, + Addend)) { + // If we don't understand the expression, assume the best and + // let the fixup and relocation code deal with it. + return true; } - const MCConstantExpr *CE = dyn_cast(ImmWithLSL.Val); - return CE && CE->getValue() >= 0 && CE->getValue() <= 0xffff; - } - - template - bool isMoveWideMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - - int UImm16, Shift; - uint64_t Value = CE->getValue(); - - // If this is a 32-bit instruction then all bits above 32 should be the - // same: either of these is fine because signed/unsigned values should be - // permitted. - if (RegWidth == 32) { - if ((Value >> 32) != 0 && (Value >> 32) != 0xffffffff) - return false; - - Value &= 0xffffffffULL; + if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) { + // Note that we don't range-check the addend. It's adjusted modulo page + // size when converted, so there is no "out of range" condition when using + // @pageoff. + return Addend >= 0 && (Addend % Scale) == 0; + } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { + // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
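  // Illustrative uses of these forms, assuming the usual AArch64 spellings:
  //   adrp x0, var                 // or: adrp x0, var@PAGE on Darwin
  //   ldr  x1, [x0, :lo12:var]     // or: ldr x1, [x0, var@PAGEOFF]
  // For plain constant offsets, the Scale template parameter below is the
  // access size, so isUImm12Offset<8>() accepts byte offsets 0, 8, ..., 32760
  // (0xfff * 8), the scaled unsigned-immediate range of a 64-bit LDR/STR.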
+ return Addend == 0; } - return isValidImm(RegWidth, Value, UImm16, Shift); - } - - bool isMSRWithReg() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - StringRef Name(SysReg.Data, SysReg.Length); - A64SysReg::MSRMapper().fromString(Name, IsKnownRegister); - - return IsKnownRegister; + return false; } - bool isMSRPState() const { - if (!isSysReg()) return false; + template bool isUImm12Offset() const { + if (!isImm()) + return false; - bool IsKnownRegister; - StringRef Name(SysReg.Data, SysReg.Length); - A64PState::PStateMapper().fromString(Name, IsKnownRegister); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return isSymbolicUImm12Offset(getImm(), Scale); - return IsKnownRegister; + int64_t Val = MCE->getValue(); + return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } - bool isMRS() const { - if (!isSysReg()) return false; - - // First check against specific MSR-only (write-only) registers - bool IsKnownRegister; - StringRef Name(SysReg.Data, SysReg.Length); - A64SysReg::MRSMapper().fromString(Name, IsKnownRegister); - - return IsKnownRegister; + bool isImm0_7() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 8); } - - bool isPRFM() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - - if (!CE) + bool isImm1_8() const { + if (!isImm()) return false; - - return CE->getValue() >= 0 && CE->getValue() <= 31; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 9); } - - template bool isRegExtend() const { - if (!isShiftOrExtend()) return false; - - if (ShiftExtend.ShiftType != SHKind) + bool isImm0_15() const { + if (!isImm()) return false; - - return ShiftExtend.Amount <= 4; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 16); } - - bool isRegExtendLSL() const { - if (!isShiftOrExtend()) return false; - - if (ShiftExtend.ShiftType != A64SE::LSL) + bool isImm1_16() const { + if (!isImm()) return false; - - return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 17); } - - // if 0 < value <= w, return true - bool isShrFixedWidth(int w) const { + bool isImm0_31() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; - int64_t Value = CE->getValue(); - return Value > 0 && Value <= w; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 32); } - - bool isShrImm8() const { return isShrFixedWidth(8); } - - bool isShrImm16() const { return isShrFixedWidth(16); } - - bool isShrImm32() const { return isShrFixedWidth(32); } - - bool isShrImm64() const { return isShrFixedWidth(64); } - - // if 0 <= value < w, return true - bool isShlFixedWidth(int w) const { + bool isImm1_31() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; - int64_t Value = CE->getValue(); - return Value >= 0 && Value < w; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 32); } + bool isImm1_32() const { + if (!isImm()) + return false; + const 
MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 33); + } + bool isImm0_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 64); + } + bool isImm1_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 64); + } + bool isImm1_64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 65); + } + bool isImm0_127() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 128); + } + bool isImm0_255() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 256); + } + bool isImm0_65535() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 65536); + } + bool isImm32_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 32 && Val < 64); + } + bool isLogicalImm32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32); + } + bool isLogicalImm64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); + } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { + if (!isShiftedImm() && !isImm()) + return false; - bool isShlImm8() const { return isShlFixedWidth(8); } - - bool isShlImm16() const { return isShlFixedWidth(16); } + const MCExpr *Expr; - bool isShlImm32() const { return isShlFixedWidth(32); } + // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. 
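+    // (The add/sub immediate encoding holds a 12-bit value plus a single
+    // optional left shift by 12, so no other shift amount is representable.)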
+ if (isShiftedImm()) { + unsigned Shift = ShiftedImm.ShiftAmount; + Expr = ShiftedImm.Val; + if (Shift != 0 && Shift != 12) + return false; + } else { + Expr = getImm(); + } - bool isShlImm64() const { return isShlFixedWidth(64); } + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, + DarwinRefKind, Addend)) { + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF + || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF + || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) + || ELFRefKind == AArch64MCExpr::VK_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12; + } - bool isNeonMovImmShiftLSL() const { - if (!isShiftOrExtend()) + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast(Expr); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { + if (!isImm()) return false; - - if (ShiftExtend.ShiftType != A64SE::LSL) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; - - // Valid shift amount is 0, 8, 16 and 24. - return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24; + return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } - - bool isNeonMovImmShiftLSLH() const { - if (!isShiftOrExtend()) + bool isBranchTarget26() const { + if (!isImm()) return false; - - if (ShiftExtend.ShiftType != A64SE::LSL) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) return false; - - // Valid shift amount is 0 and 8. - return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8; + return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); + } + bool isPCRelLabel19() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); + } + bool isBranchTarget14() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); } - bool isNeonMovImmShiftMSL() const { - if (!isShiftOrExtend()) + bool + isMovWSymbol(ArrayRef AllowedModifiers) const { + if (!isImm()) return false; - if (ShiftExtend.ShiftType != A64SE::MSL) + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind, + DarwinRefKind, Addend)) { + return false; + } + if (DarwinRefKind != MCSymbolRefExpr::VK_None) return false; - // Valid shift amount is 8 and 16. 
- return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16; - } + for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { + if (ELFRefKind == AllowedModifiers[i]) + return Addend == 0; + } - template - bool isVectorList() const { - return Kind == k_VectorList && VectorList.Layout == Layout && - VectorList.Count == Count; + return false; } - template bool isSImm7Scaled() const { - if (!isImm()) - return false; + bool isMovZSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); + } - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + bool isMovZSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2}; + return isMovWSymbol(Variants); + } - int64_t Val = CE->getValue(); - if (Val % MemSize != 0) return false; + bool isMovZSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, + AArch64MCExpr::VK_DTPREL_G1, + }; + return isMovWSymbol(Variants); + } - Val /= MemSize; + bool isMovZSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0}; + return isMovWSymbol(Variants); + } - return Val >= -64 && Val < 64; + bool isMovKSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); } - template - bool isSImm() const { - if (!isImm()) return false; + bool isMovKSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2_NC}; + return isMovWSymbol(Variants); + } - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + bool isMovKSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1_NC + }; + return isMovWSymbol(Variants); + } - return CE->getValue() >= -(1LL << (BitWidth - 1)) - && CE->getValue() < (1LL << (BitWidth - 1)); + bool isMovKSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC + }; + return isMovWSymbol(Variants); } - template - bool isUImm() const { + template + bool isMOVZMovAlias() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast(getImm()); if (!CE) return false; + uint64_t Value = CE->getValue(); - return CE->getValue() >= 0 && CE->getValue() < (1LL << bitWidth); - } + if (RegWidth == 32) + Value &= 0xffffffffULL; - bool isUImm() const { - if (!isImm()) return false; + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; - return isa(getImm()); + return (Value & ~(0xffffULL << Shift)) == 0; } - bool isNeonUImm64Mask() const { - if (!isImm()) - return false; + template + bool isMOVNMovAlias() const { + if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) - return false; - + if (!CE) return false; uint64_t Value = CE->getValue(); - // i64 value with each byte being either 0x00 or 0xff. 
- for (unsigned i = 0; i < 8; ++i, Value >>= 8) - if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) + // MOVZ takes precedence over MOVN. + for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) + if ((Value & ~(0xffffULL << MOVZShift)) == 0) return false; - return true; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return (Value & ~(0xffffULL << Shift)) == 0; } - // if value == N, return true - template - bool isExactImm() const { - if (!isImm()) return false; + bool isFPImm() const { return Kind == k_FPImm; } + bool isBarrier() const { return Kind == k_Barrier; } + bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { + if (!isSysReg()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); - return CE->getValue() == N; + return IsKnownRegister; } + bool isMSRSystemRegister() const { + if (!isSysReg()) return false; - bool isFPZeroIZero() const { - return isFPZero(); + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; } + bool isSystemPStateField() const { + if (!isSysReg()) return false; - static AArch64Operand *CreateImmWithLSL(const MCExpr *Val, - unsigned ShiftAmount, - bool ImplicitAmount, - SMLoc S,SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E); - Op->ImmWithLSL.Val = Val; - Op->ImmWithLSL.ShiftAmount = ShiftAmount; - Op->ImmWithLSL.ImplicitAmount = ImplicitAmount; - return Op; + bool IsKnownRegister; + AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } + bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { + return Kind == k_Register && Reg.isVector && + AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum); + } + bool isGPR32as64() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } - static AArch64Operand *CreateCondCode(A64CC::CondCodes Code, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_CondCode, S, E); - Op->CondCode.Code = Code; - return Op; + bool isGPR64sp0() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); } - static AArch64Operand *CreateFPImm(double Val, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_FPImmediate, S, E); - Op->FPImm.Val = Val; - return Op; + /// Is this a vector list with the type implicit (presumably attached to the + /// instruction itself)? 
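+  /// That is, the list was written without a layout suffix such as ".16b";
+  /// the check is just the register count (NumRegs) and the absence of an
+  /// element kind.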
+ template bool isImplicitlyTypedVectorList() const { + return Kind == k_VectorList && VectorList.Count == NumRegs && + !VectorList.ElementKind; } - static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_Immediate, S, E); - Op->Imm.Val = Val; - return Op; + template + bool isTypedVectorList() const { + if (Kind != k_VectorList) + return false; + if (VectorList.Count != NumRegs) + return false; + if (VectorList.ElementKind != ElementKind) + return false; + return VectorList.NumElements == NumElements; } - static AArch64Operand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_Register, S, E); - Op->Reg.RegNum = RegNum; - return Op; + bool isVectorIndex1() const { + return Kind == k_VectorIndex && VectorIndex.Val == 1; + } + bool isVectorIndexB() const { + return Kind == k_VectorIndex && VectorIndex.Val < 16; + } + bool isVectorIndexH() const { + return Kind == k_VectorIndex && VectorIndex.Val < 8; + } + bool isVectorIndexS() const { + return Kind == k_VectorIndex && VectorIndex.Val < 4; } + bool isVectorIndexD() const { + return Kind == k_VectorIndex && VectorIndex.Val < 2; + } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { + return Kind == k_Token && getToken() == Str; + } + bool isSysCR() const { return Kind == k_SysCR; } + bool isPrefetch() const { return Kind == k_Prefetch; } + bool isShiftExtend() const { return Kind == k_ShiftExtend; } + bool isShifter() const { + if (!isShiftExtend()) + return false; - static AArch64Operand *CreateWrappedReg(unsigned RegNum, SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_WrappedRegister, S, E); - Op->Reg.RegNum = RegNum; - return Op; + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR || + ST == AArch64_AM::MSL); } + bool isExtend() const { + if (!isShiftExtend()) + return false; - static AArch64Operand *CreateShiftExtend(A64SE::ShiftExtSpecifiers ShiftTyp, - unsigned Amount, - bool ImplicitAmount, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, S, E); - Op->ShiftExtend.ShiftType = ShiftTyp; - Op->ShiftExtend.Amount = Amount; - Op->ShiftExtend.ImplicitAmount = ImplicitAmount; - return Op; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB || + ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH || + ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW || + ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; } - static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S) { - AArch64Operand *Op = new AArch64Operand(k_SysReg, S, S); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - return Op; + bool isExtend64() const { + if (!isExtend()) + return false; + // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). 
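+    // They are therefore rejected here and accepted by isExtendLSL64() below.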
+ AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; + } + bool isExtendLSL64() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; } - static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, - A64Layout::VectorLayout Layout, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E); - Op->VectorList.RegNum = RegNum; - Op->VectorList.Count = Count; - Op->VectorList.Layout = Layout; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; + template bool isMemXExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); } - static AArch64Operand *CreateToken(StringRef Str, SMLoc S) { - AArch64Operand *Op = new AArch64Operand(k_Token, S, S); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - return Op; + template bool isMemWExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); } + template + bool isArithmeticShifter() const { + if (!isShifter()) + return false; - void addExpr(MCInst &Inst, const MCExpr *Expr) const { - // Add as immediates when possible. - if (const MCConstantExpr *CE = dyn_cast(Expr)) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - else - Inst.addOperand(MCOperand::CreateExpr(Expr)); + // An arithmetic shifter is LSL, LSR, or ASR. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR) && getShiftExtendAmount() < width; } - template - void addBFILSBOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = cast(getImm()); - unsigned EncodedVal = (RegWidth - CE->getValue()) % RegWidth; - Inst.addOperand(MCOperand::CreateImm(EncodedVal)); - } + template + bool isLogicalShifter() const { + if (!isShifter()) + return false; - void addBFIWidthOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *CE = cast(getImm()); - Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1)); + // A logical shifter is LSL, LSR, ASR or ROR. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) && + getShiftExtendAmount() < width; } - void addBFXWidthOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); + bool isMovImm32Shifter() const { + if (!isShifter()) + return false; - uint64_t LSB = Inst.getOperand(Inst.getNumOperands()-1).getImm(); - const MCConstantExpr *CE = cast(getImm()); + // A MOVi shifter is LSL of 0, 16, 32, or 48. 
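+    // Only #0 and #16 are accepted for the 32-bit MOVi handled here; the
+    // 64-bit variant below additionally allows #32 and #48.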
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16); + } + + bool isMovImm64Shifter() const { + if (!isShifter()) + return false; - Inst.addOperand(MCOperand::CreateImm(LSB + CE->getValue() - 1)); + // A MOVi shifter is LSL of 0 or 16. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16 || Val == 32 || Val == 48); } - void addCondCodeOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getCondCode())); + bool isLogicalVecShifter() const { + if (!isShifter()) + return false; + + // A logical vector shifter is a left shift by 0, 8, 16, or 24. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); } - void addCVTFixedPosOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); + bool isLogicalVecHalfWordShifter() const { + if (!isLogicalVecShifter()) + return false; - const MCConstantExpr *CE = cast(getImm()); - Inst.addOperand(MCOperand::CreateImm(64 - CE->getValue())); + // A logical vector shifter is a left shift by 0 or 8. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8); } - void addFMOVImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); + bool isMoveVecShifter() const { + if (!isShiftExtend()) + return false; - APFloat RealVal(FPImm.Val); - uint32_t ImmVal; - A64Imms::isFPImm(RealVal, ImmVal); + // A logical vector shifter is a left shift by 8 or 16. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::MSL && + (Shift == 8 || Shift == 16); + } - Inst.addOperand(MCOperand::CreateImm(ImmVal)); + // Fallback unscaled operands are for aliases of LDR/STR that fall back + // to LDUR/STUR when the offset is not legal for the former but is for + // the latter. As such, in addition to checking for being a legal unscaled + // address, also check that it is not a legal scaled address. This avoids + // ambiguity in the matcher. + template + bool isSImm9OffsetFB() const { + return isSImm9() && !isUImm12Offset(); } - void addFPZeroOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands"); - Inst.addOperand(MCOperand::CreateImm(0)); + bool isAdrpLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (4096 * (1LL << (21 - 1))); + int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); + return (Val % 4096) == 0 && Val >= Min && Val <= Max; + } + + return true; } - void addFPZeroIZeroOperands(MCInst &Inst, unsigned N) const { - addFPZeroOperands(Inst, N); + bool isAdrLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. 
+ if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (1LL << (21 - 1)); + int64_t Max = ((1LL << (21 - 1)) - 1); + return Val >= Min && Val <= Max; + } + + return true; } - void addInvCondCodeOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - unsigned Encoded = A64InvertCondCode(getCondCode()); - Inst.addOperand(MCOperand::CreateImm(Encoded)); + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. + if (!Expr) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); } void addRegOperands(MCInst &Inst, unsigned N) const { @@ -988,575 +1088,796 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateReg(getReg())); } - void addImmOperands(MCInst &Inst, unsigned N) const { + void addGPR32as64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - addExpr(Inst, getImm()); - } + assert( + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg())); - template - void addSImm7ScaledOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister( + RI->getEncodingValue(getReg())); - const MCConstantExpr *CE = cast(getImm()); - uint64_t Val = CE->getValue() / MemSize; - Inst.addOperand(MCOperand::CreateImm(Val & 0x7f)); + Inst.addOperand(MCOperand::CreateReg(Reg)); } - template - void addSImmOperands(MCInst &Inst, unsigned N) const { + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Val = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm(Val & ((1ULL << BitWidth) - 1))); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0)); } - void addImmWithLSLOperands(MCInst &Inst, unsigned N) const { - assert (N == 1 && "Invalid number of operands!"); - - addExpr(Inst, ImmWithLSL.Val); + void addVectorReg128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(getReg())); } - template - void addLabelOperands(MCInst &Inst, unsigned N) const { + void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = dyn_cast(Imm.Val); - - if (!CE) { - addExpr(Inst, Imm.Val); - return; - } - - int64_t Val = CE->getValue(); - assert(Val % scale == 0 && "Unaligned immediate in instruction"); - Val /= scale; - - Inst.addOperand(MCOperand::CreateImm(Val & ((1LL << field_width) - 1))); + Inst.addOperand(MCOperand::CreateReg(getReg())); } - template - void addOffsetUImm12Operands(MCInst &Inst, unsigned N) const { + template + void addVectorList64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, + AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; - if 
(const MCConstantExpr *CE = dyn_cast(getImm())) { - Inst.addOperand(MCOperand::CreateImm(CE->getValue() / MemSize)); - } else { - Inst.addOperand(MCOperand::CreateExpr(getImm())); - } + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); } - template - void addLogicalImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands"); - const MCConstantExpr *CE = cast(Imm.Val); - - uint32_t Bits; - A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits); + template + void addVectorList128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; - Inst.addOperand(MCOperand::CreateImm(Bits)); + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); } - void addMRSOperands(MCInst &Inst, unsigned N) const { + void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - StringRef Name(SysReg.Data, SysReg.Length); - uint32_t Bits = A64SysReg::MRSMapper().fromString(Name, Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); } - void addMSRWithRegOperands(MCInst &Inst, unsigned N) const { + void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } - bool Valid; - StringRef Name(SysReg.Data, SysReg.Length); - uint32_t Bits = A64SysReg::MSRMapper().fromString(Name, Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); + void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); } - void addMSRPStateOperands(MCInst &Inst, unsigned N) const { + void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } - bool Valid; - StringRef Name(SysReg.Data, SysReg.Length); - uint32_t Bits = A64PState::PStateMapper().fromString(Name, Valid); + void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } - Inst.addOperand(MCOperand::CreateImm(Bits)); + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // If this is a pageoff symrefexpr with an addend, adjust the addend + // to be only the page-offset portion. Otherwise, just add the expr + // as-is. 
+ addExpr(Inst, getImm()); } - void addMoveWideImmOperands(MCInst &Inst, unsigned N) const { + void addAddSubImmOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); - - addExpr(Inst, ImmWithLSL.Val); - - AArch64MCExpr::VariantKind Variant; - if (!isNonConstantExpr(ImmWithLSL.Val, Variant)) { - Inst.addOperand(MCOperand::CreateImm(ImmWithLSL.ShiftAmount / 16)); - return; - } - - // We know it's relocated - switch (Variant) { - case AArch64MCExpr::VK_AARCH64_ABS_G0: - case AArch64MCExpr::VK_AARCH64_ABS_G0_NC: - case AArch64MCExpr::VK_AARCH64_SABS_G0: - case AArch64MCExpr::VK_AARCH64_DTPREL_G0: - case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC: - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC: - case AArch64MCExpr::VK_AARCH64_TPREL_G0: - case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC: + if (isShiftedImm()) { + addExpr(Inst, getShiftedImmVal()); + Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); + } else { + addExpr(Inst, getImm()); Inst.addOperand(MCOperand::CreateImm(0)); - break; - case AArch64MCExpr::VK_AARCH64_ABS_G1: - case AArch64MCExpr::VK_AARCH64_ABS_G1_NC: - case AArch64MCExpr::VK_AARCH64_SABS_G1: - case AArch64MCExpr::VK_AARCH64_DTPREL_G1: - case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC: - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC: - Inst.addOperand(MCOperand::CreateImm(1)); - break; - case AArch64MCExpr::VK_AARCH64_ABS_G2: - case AArch64MCExpr::VK_AARCH64_ABS_G2_NC: - case AArch64MCExpr::VK_AARCH64_SABS_G2: - case AArch64MCExpr::VK_AARCH64_DTPREL_G2: - case AArch64MCExpr::VK_AARCH64_TPREL_G2: - Inst.addOperand(MCOperand::CreateImm(2)); - break; - case AArch64MCExpr::VK_AARCH64_ABS_G3: - Inst.addOperand(MCOperand::CreateImm(3)); - break; - default: llvm_unreachable("Inappropriate move wide relocation"); } } - template - void addMoveWideMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - int UImm16, Shift; + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCondCode())); + } - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); + void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + addExpr(Inst, getImm()); + else + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); + } - if (RegWidth == 32) { - Value &= 0xffffffffULL; - } + void addAdrLabelOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } - bool Valid = isValidImm(RegWidth, Value, UImm16, Shift); - (void)Valid; - assert(Valid && "Invalid immediates should have been weeded out by now"); + template + void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); - Inst.addOperand(MCOperand::CreateImm(UImm16)); - Inst.addOperand(MCOperand::CreateImm(Shift)); + if (!MCE) { + Inst.addOperand(MCOperand::CreateExpr(getImm())); + return; + } + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale)); } - void addPRFMOperands(MCInst &Inst, unsigned N) const { + void addSImm9Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + 
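+    // The 9-bit signed offset is unscaled (e.g. the LDUR/STUR forms), so the
+    // value is emitted as-is; the simm7 pair offsets below are divided by
+    // the access size (4, 8 or 16 bytes) before being encoded.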
Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - const MCConstantExpr *CE = cast(getImm()); - assert(CE->getValue() >= 0 && CE->getValue() <= 31 - && "PRFM operand should be 5-bits"); - - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + void addSImm7s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); } - // For Add-sub (extended register) operands. - void addRegExtendOperands(MCInst &Inst, unsigned N) const { + void addSImm7s8Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); + } - Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); + void addSImm7s16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); } - // For Vector Immediates shifted imm operands. - void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const { + void addImm0_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24) - llvm_unreachable("Invalid shift amount for vector immediate inst."); + void addImm1_8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3. - int64_t Imm = ShiftExtend.Amount / 8; - Inst.addOperand(MCOperand::CreateImm(Imm)); + void addImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const { + void addImm1_16Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8) - llvm_unreachable("Invalid shift amount for vector immediate inst."); + void addImm0_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - // Encode LSLH shift amount 0, 8 as 0, 1. 
- int64_t Imm = ShiftExtend.Amount / 8; - Inst.addOperand(MCOperand::CreateImm(Imm)); + void addImm1_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const { + void addImm1_32Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16) - llvm_unreachable("Invalid shift amount for vector immediate inst."); + void addImm0_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - // Encode MSL shift amount 8, 16 as 0, 1. - int64_t Imm = ShiftExtend.Amount / 8 - 1; - Inst.addOperand(MCOperand::CreateImm(Imm)); + void addImm1_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - // For the extend in load-store (register offset) instructions. - template - void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const { - addAddrRegExtendOperands(Inst, N, MemSize); + void addImm1_64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addAddrRegExtendOperands(MCInst &Inst, unsigned N, - unsigned MemSize) const { + void addImm0_127Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - // First bit of Option is set in instruction classes, the high two bits are - // as follows: - unsigned OptionHi = 0; - switch (ShiftExtend.ShiftType) { - case A64SE::UXTW: - case A64SE::LSL: - OptionHi = 1; - break; - case A64SE::SXTW: - case A64SE::SXTX: - OptionHi = 3; - break; - default: - llvm_unreachable("Invalid extend type for register offset"); - } + void addImm0_255Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - unsigned S = 0; - if (MemSize == 1 && !ShiftExtend.ImplicitAmount) - S = 1; - else if (MemSize != 1 && ShiftExtend.Amount != 0) - S = 1; + void addImm0_65535Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - Inst.addOperand(MCOperand::CreateImm((OptionHi << 1) | S)); + void addImm32_63Operands(MCInst &Inst, unsigned N) const { + 
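+    // As with the other addImm* helpers above, the constant is emitted
+    // unchanged; range checking already happened in the corresponding
+    // isImm32_63() predicate when the operand was matched.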
assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addShiftOperands(MCInst &Inst, unsigned N) const { + + void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } - Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); + void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64); + Inst.addOperand(MCOperand::CreateImm(encoding)); } - void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const { + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid immediate operand!"); + uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } - // A bit from each byte in the constant forms the encoded immediate - const MCConstantExpr *CE = dyn_cast(getImm()); - uint64_t Value = CE->getValue(); + void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } - unsigned Imm = 0; - for (unsigned i = 0; i < 8; ++i, Value >>= 8) { - Imm |= (Value & 1) << i; + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; } - Inst.addOperand(MCOperand::CreateImm(Imm)); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); } - void addVectorListOperands(MCInst &Inst, unsigned N) const { + void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); } -}; -} // end anonymous namespace. 
+ void addFPImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getFPImm())); + } -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic) { + void addBarrierOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getBarrier())); + } - // See if the operand has a custom parser - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); - // It could either succeed, fail or just not care. - if (ResTy != MatchOperand_NoMatch) - return ResTy; + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - switch (getLexer().getKind()) { - default: - Error(Parser.getTok().getLoc(), "unexpected token in operand"); - return MatchOperand_ParseFail; - case AsmToken::Identifier: { - // It might be in the LSL/UXTB family ... - OperandMatchResultTy GotShift = ParseShiftExtend(Operands); + Inst.addOperand(MCOperand::CreateImm(Bits)); + } - // We can only continue if no tokens were eaten. - if (GotShift != MatchOperand_NoMatch) - return GotShift; + void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); - // ... or it might be a register ... - uint32_t NumLanes = 0; - OperandMatchResultTy GotReg = ParseRegister(Operands, NumLanes); - assert(GotReg != MatchOperand_ParseFail - && "register parsing shouldn't partially succeed"); - - if (GotReg == MatchOperand_Success) { - if (Parser.getTok().is(AsmToken::LBrac)) - return ParseNEONLane(Operands, NumLanes); - else - return MatchOperand_Success; - } - // ... or it might be a symbolish thing - } - // Fall through - case AsmToken::LParen: // E.g. (strcmp-4) - case AsmToken::Integer: // 1f, 2b labels - case AsmToken::String: // quoted labels - case AsmToken::Dot: // . 
is Current location - case AsmToken::Dollar: // $ is PC - case AsmToken::Colon: { - SMLoc StartLoc = Parser.getTok().getLoc(); - SMLoc EndLoc; - const MCExpr *ImmVal = nullptr; - - if (ParseImmediate(ImmVal) != MatchOperand_Success) - return MatchOperand_ParseFail; + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc)); - return MatchOperand_Success; + Inst.addOperand(MCOperand::CreateImm(Bits)); } - case AsmToken::Hash: { // Immediates - SMLoc StartLoc = Parser.getTok().getLoc(); - SMLoc EndLoc; - const MCExpr *ImmVal = nullptr; - Parser.Lex(); - if (ParseImmediate(ImmVal) != MatchOperand_Success) - return MatchOperand_ParseFail; + void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); - EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc)); - return MatchOperand_Success; - } - case AsmToken::LBrac: { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateToken("[", Loc)); - Parser.Lex(); // Eat '[' + bool Valid; + uint32_t Bits = + AArch64PState::PStateMapper().fromString(getSysReg(), Valid); - // There's no comma after a '[', so we can parse the next operand - // immediately. - return ParseOperand(Operands, Mnemonic); + Inst.addOperand(MCOperand::CreateImm(Bits)); } - // The following will likely be useful later, but not in very early cases - case AsmToken::LCurly: // SIMD vector list is not parsed here - llvm_unreachable("Don't know how to deal with '{' in operand"); - return MatchOperand_ParseFail; + + void addSysCROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getSysCR())); } -} -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseImmediate(const MCExpr *&ExprVal) { - if (getLexer().is(AsmToken::Colon)) { - AArch64MCExpr::VariantKind RefKind; + void addPrefetchOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getPrefetch())); + } - OperandMatchResultTy ResTy = ParseRelocPrefix(RefKind); - if (ResTy != MatchOperand_Success) - return ResTy; + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Imm = + AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } - const MCExpr *SubExprVal; - if (getParser().parseExpression(SubExprVal)) - return MatchOperand_ParseFail; + void addExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } - ExprVal = AArch64MCExpr::Create(RefKind, SubExprVal, getContext()); - return MatchOperand_Success; + void addExtend64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX; + unsigned Imm = 
AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); } - // No weird AArch64MCExpr prefix - return getParser().parseExpression(ExprVal) - ? MatchOperand_ParseFail : MatchOperand_Success; -} + void addMemExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0)); + } -// A lane attached to a NEON register. "[N]", which should yield three tokens: -// '[', N, ']'. A hash is not allowed to precede the immediate here. -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseNEONLane(SmallVectorImpl &Operands, - uint32_t NumLanes) { - SMLoc Loc = Parser.getTok().getLoc(); + // For 8-bit load/store instructions with a register offset, both the + // "DoShift" and "NoShift" variants have a shift of 0. Because of this, + // they're disambiguated by whether the shift was explicit or implicit rather + // than its size. + void addMemExtend8Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount())); + } - assert(Parser.getTok().is(AsmToken::LBrac) && "inappropriate operand"); - Operands.push_back(AArch64Operand::CreateToken("[", Loc)); - Parser.Lex(); // Eat '[' + template + void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), "expected lane number"); - return MatchOperand_ParseFail; + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); } - if (Parser.getTok().getIntVal() >= NumLanes) { - Error(Parser.getTok().getLoc(), "lane number incompatible with layout"); - return MatchOperand_ParseFail; + template + void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); } - const MCExpr *Lane = MCConstantExpr::Create(Parser.getTok().getIntVal(), - getContext()); - SMLoc S = Parser.getTok().getLoc(); - Parser.Lex(); // Eat actual lane - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateImm(Lane, S, E)); + void print(raw_ostream &OS) const override; + static std::unique_ptr + CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) { + auto Op = make_unique(k_Token, Ctx); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->Tok.IsSuffix = IsSuffix; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(Parser.getTok().getLoc(), "expected ']' after lane"); - return MatchOperand_ParseFail; + static std::unique_ptr + CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_Register, Ctx); + Op->Reg.RegNum = RegNum; + Op->Reg.isVector = isVector; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; } - 
Operands.push_back(AArch64Operand::CreateToken("]", Loc)); - Parser.Lex(); // Eat ']' + static std::unique_ptr + CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements, + char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_VectorList, Ctx); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.NumElements = NumElements; + Op->VectorList.ElementKind = ElementKind; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - return MatchOperand_Success; -} + static std::unique_ptr + CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_VectorIndex, Ctx); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) { - assert(getLexer().is(AsmToken::Colon) && "expected a ':'"); - Parser.Lex(); + static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_Immediate, Ctx); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - if (getLexer().isNot(AsmToken::Identifier)) { - Error(Parser.getTok().getLoc(), - "expected relocation specifier in operand after ':'"); - return MatchOperand_ParseFail; + static std::unique_ptr CreateShiftedImm(const MCExpr *Val, + unsigned ShiftAmount, + SMLoc S, SMLoc E, + MCContext &Ctx) { + auto Op = make_unique(k_ShiftedImm, Ctx); + Op->ShiftedImm .Val = Val; + Op->ShiftedImm.ShiftAmount = ShiftAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; } - std::string LowerCase = Parser.getTok().getIdentifier().lower(); - RefKind = StringSwitch(LowerCase) - .Case("got", AArch64MCExpr::VK_AARCH64_GOT) - .Case("got_lo12", AArch64MCExpr::VK_AARCH64_GOT_LO12) - .Case("lo12", AArch64MCExpr::VK_AARCH64_LO12) - .Case("abs_g0", AArch64MCExpr::VK_AARCH64_ABS_G0) - .Case("abs_g0_nc", AArch64MCExpr::VK_AARCH64_ABS_G0_NC) - .Case("abs_g1", AArch64MCExpr::VK_AARCH64_ABS_G1) - .Case("abs_g1_nc", AArch64MCExpr::VK_AARCH64_ABS_G1_NC) - .Case("abs_g2", AArch64MCExpr::VK_AARCH64_ABS_G2) - .Case("abs_g2_nc", AArch64MCExpr::VK_AARCH64_ABS_G2_NC) - .Case("abs_g3", AArch64MCExpr::VK_AARCH64_ABS_G3) - .Case("abs_g0_s", AArch64MCExpr::VK_AARCH64_SABS_G0) - .Case("abs_g1_s", AArch64MCExpr::VK_AARCH64_SABS_G1) - .Case("abs_g2_s", AArch64MCExpr::VK_AARCH64_SABS_G2) - .Case("dtprel_g2", AArch64MCExpr::VK_AARCH64_DTPREL_G2) - .Case("dtprel_g1", AArch64MCExpr::VK_AARCH64_DTPREL_G1) - .Case("dtprel_g1_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC) - .Case("dtprel_g0", AArch64MCExpr::VK_AARCH64_DTPREL_G0) - .Case("dtprel_g0_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC) - .Case("dtprel_hi12", AArch64MCExpr::VK_AARCH64_DTPREL_HI12) - .Case("dtprel_lo12", AArch64MCExpr::VK_AARCH64_DTPREL_LO12) - .Case("dtprel_lo12_nc", AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC) - .Case("gottprel_g1", AArch64MCExpr::VK_AARCH64_GOTTPREL_G1) - .Case("gottprel_g0_nc", AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC) - .Case("gottprel", AArch64MCExpr::VK_AARCH64_GOTTPREL) - .Case("gottprel_lo12", AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12) - .Case("tprel_g2", AArch64MCExpr::VK_AARCH64_TPREL_G2) - .Case("tprel_g1", AArch64MCExpr::VK_AARCH64_TPREL_G1) - .Case("tprel_g1_nc", AArch64MCExpr::VK_AARCH64_TPREL_G1_NC) - .Case("tprel_g0", AArch64MCExpr::VK_AARCH64_TPREL_G0) - .Case("tprel_g0_nc", AArch64MCExpr::VK_AARCH64_TPREL_G0_NC) - .Case("tprel_hi12", AArch64MCExpr::VK_AARCH64_TPREL_HI12) - .Case("tprel_lo12", 
AArch64MCExpr::VK_AARCH64_TPREL_LO12) - .Case("tprel_lo12_nc", AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC) - .Case("tlsdesc", AArch64MCExpr::VK_AARCH64_TLSDESC) - .Case("tlsdesc_lo12", AArch64MCExpr::VK_AARCH64_TLSDESC_LO12) - .Default(AArch64MCExpr::VK_AARCH64_None); - - if (RefKind == AArch64MCExpr::VK_AARCH64_None) { - Error(Parser.getTok().getLoc(), - "expected relocation specifier in operand after ':'"); - return MatchOperand_ParseFail; + static std::unique_ptr + CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_CondCode, Ctx); + Op->CondCode.Code = Code; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; } - Parser.Lex(); // Eat identifier - if (getLexer().isNot(AsmToken::Colon)) { - Error(Parser.getTok().getLoc(), - "expected ':' after relocation specifier"); - return MatchOperand_ParseFail; + static std::unique_ptr CreateFPImm(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique(k_FPImm, Ctx); + Op->FPImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; } - Parser.Lex(); - return MatchOperand_Success; -} -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseImmWithLSLOperand( - SmallVectorImpl &Operands) { + static std::unique_ptr CreateBarrier(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique(k_Barrier, Ctx); + Op->Barrier.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } - SMLoc S = Parser.getTok().getLoc(); + static std::unique_ptr + CreateSysReg(StringRef Str, SMLoc S, uint64_t FeatureBits, MCContext &Ctx) { + auto Op = make_unique(k_SysReg, Ctx); + Op->SysReg.Data = Str.data(); + Op->SysReg.Length = Str.size(); + Op->SysReg.FeatureBits = FeatureBits; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat '#' - else if (Parser.getTok().isNot(AsmToken::Integer)) - // Operand should start from # or should be integer, emit error otherwise. - return MatchOperand_NoMatch; + static std::unique_ptr CreateSysCR(unsigned Val, SMLoc S, + SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_SysCR, Ctx); + Op->SysCRImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - const MCExpr *Imm; - if (ParseImmediate(Imm) != MatchOperand_Success) - return MatchOperand_ParseFail; - else if (Parser.getTok().isNot(AsmToken::Comma)) { - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, 0, true, S, E)); - return MatchOperand_Success; + static std::unique_ptr CreatePrefetch(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique(k_Prefetch, Ctx); + Op->Prefetch.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; } - // Eat ',' - Parser.Lex(); + static std::unique_ptr + CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, + bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_ShiftExtend, Ctx); + Op->ShiftExtend.Type = ShOp; + Op->ShiftExtend.Amount = Val; + Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } +}; - // The optional operand must be "lsl #N" where N is non-negative. - if (Parser.getTok().is(AsmToken::Identifier) - && Parser.getTok().getIdentifier().equals_lower("lsl")) { - Parser.Lex(); +} // end anonymous namespace. 
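// Illustrative sketch, not part of this patch: how the isMOVZMovAlias /
// isMOVNMovAlias predicates in the class above partition a "mov Rd, #imm"
// immediate. MOVZ is preferred when the (possibly truncated) value fits a
// single 16-bit chunk at the given shift; otherwise MOVN is tried on the
// inverted value. The helper restates the MOVZ test with plain parameters in
// place of the template arguments; its name is invented.
static bool fitsMOVZ(uint64_t Value, unsigned Shift, unsigned RegWidth) {
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  // As in isMOVZMovAlias(): for the value 0, only the "lsl #0" form matches.
  if (Value == 0 && Shift != 0)
    return false;
  return (Value & ~(0xffffULL << Shift)) == 0;
}
// For example, "mov w0, #0xfffffffe" fits no MOVZ shift, but the inverted
// 32-bit value is 0x1, so the MOVN alias is selected and
// addMOVNMovAliasOperands() emits the 16-bit chunk 0x0001 with shift 0.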
- if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); - - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - } +void AArch64Operand::print(raw_ostream &OS) const { + switch (Kind) { + case k_FPImm: + OS << ""; + break; + case k_Barrier: { + bool Valid; + StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid); + if (Valid) + OS << ""; + else + OS << ""; + break; + } + case k_Immediate: + getImm()->print(OS); + break; + case k_ShiftedImm: { + unsigned Shift = getShiftedImmShift(); + OS << "print(OS); + OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">"; + break; + } + case k_CondCode: + OS << ""; + break; + case k_Register: + OS << ""; + break; + case k_VectorList: { + OS << ""; + break; + } + case k_VectorIndex: + OS << ""; + break; + case k_SysReg: + OS << "'; + break; + case k_Token: + OS << "'" << getToken() << "'"; + break; + case k_SysCR: + OS << "c" << getSysCR(); + break; + case k_Prefetch: { + bool Valid; + StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid); + if (Valid) + OS << ""; + else + OS << ""; + break; + } + case k_ShiftExtend: { + OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" + << getShiftExtendAmount(); + if (!hasShiftExtendAmount()) + OS << ""; + OS << '>'; + break; + } } +} - int64_t ShiftAmount = Parser.getTok().getIntVal(); +/// @name Auto-generated Match Functions +/// { - if (ShiftAmount < 0) { - Error(Parser.getTok().getLoc(), "positive shift amount required"); - return MatchOperand_ParseFail; - } - Parser.Lex(); // Eat the number +static unsigned MatchRegisterName(StringRef Name); - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, ShiftAmount, - false, S, E)); - return MatchOperand_Success; +/// } + +static unsigned matchVectorRegName(StringRef Name) { + return StringSwitch(Name) + .Case("v0", AArch64::Q0) + .Case("v1", AArch64::Q1) + .Case("v2", AArch64::Q2) + .Case("v3", AArch64::Q3) + .Case("v4", AArch64::Q4) + .Case("v5", AArch64::Q5) + .Case("v6", AArch64::Q6) + .Case("v7", AArch64::Q7) + .Case("v8", AArch64::Q8) + .Case("v9", AArch64::Q9) + .Case("v10", AArch64::Q10) + .Case("v11", AArch64::Q11) + .Case("v12", AArch64::Q12) + .Case("v13", AArch64::Q13) + .Case("v14", AArch64::Q14) + .Case("v15", AArch64::Q15) + .Case("v16", AArch64::Q16) + .Case("v17", AArch64::Q17) + .Case("v18", AArch64::Q18) + .Case("v19", AArch64::Q19) + .Case("v20", AArch64::Q20) + .Case("v21", AArch64::Q21) + .Case("v22", AArch64::Q22) + .Case("v23", AArch64::Q23) + .Case("v24", AArch64::Q24) + .Case("v25", AArch64::Q25) + .Case("v26", AArch64::Q26) + .Case("v27", AArch64::Q27) + .Case("v28", AArch64::Q28) + .Case("v29", AArch64::Q29) + .Case("v30", AArch64::Q30) + .Case("v31", AArch64::Q31) + .Default(0); } +static bool isValidVectorKind(StringRef Name) { + return StringSwitch(Name.lower()) + .Case(".8b", true) + .Case(".16b", true) + .Case(".4h", true) + .Case(".8h", true) + .Case(".2s", true) + .Case(".4s", true) + .Case(".1d", true) + .Case(".2d", true) + .Case(".1q", true) + // Accept the width neutral ones, too, for verbose syntax. If those + // aren't used in the right places, the token operand won't match so + // all will work out. 
+ .Case(".b", true) + .Case(".h", true) + .Case(".s", true) + .Case(".d", true) + .Default(false); +} -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseCondCodeOperand( - SmallVectorImpl &Operands) { - if (Parser.getTok().isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; +static void parseValidVectorKind(StringRef Name, unsigned &NumElements, + char &ElementKind) { + assert(isValidVectorKind(Name)); - StringRef Tok = Parser.getTok().getIdentifier(); - A64CC::CondCodes CondCode = A64StringToCondCode(Tok); + ElementKind = Name.lower()[Name.size() - 1]; + NumElements = 0; - if (CondCode == A64CC::Invalid) - return MatchOperand_NoMatch; + if (Name.size() == 2) + return; - SMLoc S = Parser.getTok().getLoc(); - Parser.Lex(); // Eat condition code - SMLoc E = Parser.getTok().getLoc(); + // Parse the lane count + Name = Name.drop_front(); + while (isdigit(Name.front())) { + NumElements = 10 * NumElements + (Name.front() - '0'); + Name = Name.drop_front(); + } +} - Operands.push_back(AArch64Operand::CreateCondCode(CondCode, S, E)); - return MatchOperand_Success; +bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + StartLoc = getLoc(); + RegNo = tryParseRegister(); + EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); + return (RegNo == (unsigned)-1); +} + +/// tryParseRegister - Try to parse a register name. The token must be an +/// Identifier when called, and if it is a register name the token is eaten and +/// the register is added to the operand list. +int AArch64AsmParser::tryParseRegister() { + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + std::string lowerCase = Tok.getString().lower(); + unsigned RegNum = MatchRegisterName(lowerCase); + // Also handle a few aliases of registers. + if (RegNum == 0) + RegNum = StringSwitch(lowerCase) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0); + + if (RegNum == 0) + return -1; + + Parser.Lex(); // Eat identifier token. + return RegNum; +} + +/// tryMatchVectorRegister - Try to parse a vector register name with optional +/// kind specifier. If it is a register specifier, eat the token and return it. +int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { + if (Parser.getTok().isNot(AsmToken::Identifier)) { + TokError("vector register expected"); + return -1; + } + + StringRef Name = Parser.getTok().getString(); + // If there is a kind specifier, it's separated from the register name by + // a '.'. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + unsigned RegNum = matchVectorRegName(Head); + if (RegNum) { + if (Next != StringRef::npos) { + Kind = Name.slice(Next, StringRef::npos); + if (!isValidVectorKind(Kind)) { + TokError("invalid vector kind qualifier"); + return -1; + } + } + Parser.Lex(); // Eat the register token. + return RegNum; + } + + if (expected) + TokError("vector register expected"); + return -1; } +/// tryParseSysCROperand - Try to parse a system instruction CR operand name. 
AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseCRxOperand( - SmallVectorImpl &Operands) { - SMLoc S = Parser.getTok().getLoc(); +AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { + SMLoc S = getLoc(); + if (Parser.getTok().isNot(AsmToken::Identifier)) { Error(S, "Expected cN operand where 0 <= N <= 15"); return MatchOperand_ParseFail; @@ -1575,743 +1896,1225 @@ AArch64AsmParser::ParseCRxOperand( return MatchOperand_ParseFail; } - const MCExpr *CRImm = MCConstantExpr::Create(CRNum, getContext()); - - Parser.Lex(); - SMLoc E = Parser.getTok().getLoc(); - - Operands.push_back(AArch64Operand::CreateImm(CRImm, S, E)); + Parser.Lex(); // Eat identifier token. + Operands.push_back( + AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); return MatchOperand_Success; } +/// tryParsePrefetch - Try to parse a prefetch operand. AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseFPImmOperand( - SmallVectorImpl &Operands) { +AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + // Either an identifier for named values or a 5-bit immediate. + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + if (Hash) + Parser.Lex(); // Eat hash token. + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; - SMLoc S = Parser.getTok().getLoc(); + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for prefetch operand"); + return MatchOperand_ParseFail; + } + unsigned prfop = MCE->getValue(); + if (prfop > 31) { + TokError("prefetch operand out of range, [0,31] expected"); + return MatchOperand_ParseFail; + } - bool Hash = false; - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat '#' - Hash = true; + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + return MatchOperand_Success; } - bool Negative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - Negative = true; - Parser.Lex(); // Eat '-' - } else if (Parser.getTok().is(AsmToken::Plus)) { - Parser.Lex(); // Eat '+' + if (Tok.isNot(AsmToken::Identifier)) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; } - if (Parser.getTok().isNot(AsmToken::Real)) { - if (!Hash) - return MatchOperand_NoMatch; - Error(S, "Expected floating-point immediate"); + bool Valid; + unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); + if (!Valid) { + TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; } - APFloat RealVal(APFloat::IEEEdouble, Parser.getTok().getString()); - if (Negative) RealVal.changeSign(); - double DblVal = RealVal.convertToDouble(); - - Parser.Lex(); // Eat real number - SMLoc E = Parser.getTok().getLoc(); - - Operands.push_back(AArch64Operand::CreateFPImm(DblVal, S, E)); + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); return MatchOperand_Success; } +/// tryParseAdrpLabel - Parse and validate a source label for the ADRP +/// instruction. AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseFPImm0AndImm0Operand( - SmallVectorImpl &Operands) { - - SMLoc S = Parser.getTok().getLoc(); +AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { + SMLoc S = getLoc(); + const MCExpr *Expr; - bool Hash = false; if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat '#' - Hash = true; + Parser.Lex(); // Eat hash token. 
} - APFloat RealVal(0.0); - if (Parser.getTok().is(AsmToken::Real)) { - if(Parser.getTok().getString() != "0.0") { - Error(S, "only #0.0 is acceptable as immediate"); + if (parseSymbolicImmVal(Expr)) + return MatchOperand_ParseFail; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + if (DarwinRefKind == MCSymbolRefExpr::VK_None && + ELFRefKind == AArch64MCExpr::VK_INVALID) { + // No modifier was specified at all; this is the syntax for an ELF basic + // ADRP relocation (unfortunately). + Expr = + AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext()); + } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && + Addend != 0) { + Error(S, "gotpage label reference not allowed an addend"); return MatchOperand_ParseFail; - } - } - else if (Parser.getTok().is(AsmToken::Integer)) { - if(Parser.getTok().getIntVal() != 0) { - Error(S, "only #0.0 is acceptable as immediate"); + } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && + DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && + DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && + ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && + ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && + ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { + // The operand must be an @page or @gotpage qualified symbolref. + Error(S, "page or gotpage label reference expected"); return MatchOperand_ParseFail; } } - else { - if (!Hash) - return MatchOperand_NoMatch; - Error(S, "only #0.0 is acceptable as immediate"); - return MatchOperand_ParseFail; - } - Parser.Lex(); // Eat real number - SMLoc E = Parser.getTok().getLoc(); + // We have either a label reference possibly with addend or an immediate. The + // addend is a raw value here. The linker will adjust it to only reference the + // page. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); - Operands.push_back(AArch64Operand::CreateFPImm(0.0, S, E)); return MatchOperand_Success; } -// Automatically generated -static unsigned MatchRegisterName(StringRef Name); +/// tryParseAdrLabel - Parse and validate a source label for the ADR +/// instruction. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { + SMLoc S = getLoc(); + const MCExpr *Expr; -bool -AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, - StringRef &Layout, - SMLoc &LayoutLoc) const { - const AsmToken &Tok = Parser.getTok(); + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } - if (Tok.isNot(AsmToken::Identifier)) - return false; + if (getParser().parseExpression(Expr)) + return MatchOperand_ParseFail; - std::string LowerReg = Tok.getString().lower(); - size_t DotPos = LowerReg.find('.'); + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); - bool IsVec128 = false; - SMLoc S = Tok.getLoc(); - RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos); + return MatchOperand_Success; +} - if (DotPos == std::string::npos) { - Layout = StringRef(); - } else { - // Everything afterwards needs to be a literal token, expected to be - // '.2d','.b' etc for vector registers. 
- - // This StringSwitch validates the input and (perhaps more importantly) - // gives us a permanent string to use in the token (a pointer into LowerReg - // would go out of scope when we return). - LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1); - StringRef LayoutText = StringRef(LowerReg).substr(DotPos); - - // See if it's a 128-bit layout first. - Layout = StringSwitch(LayoutText) - .Case(".q", ".q").Case(".1q", ".1q") - .Case(".d", ".d").Case(".2d", ".2d") - .Case(".s", ".s").Case(".4s", ".4s") - .Case(".h", ".h").Case(".8h", ".8h") - .Case(".b", ".b").Case(".16b", ".16b") - .Default(""); - - if (Layout.size() != 0) - IsVec128 = true; - else { - Layout = StringSwitch(LayoutText) - .Case(".1d", ".1d") - .Case(".2s", ".2s") - .Case(".4h", ".4h") - .Case(".8b", ".8b") - .Default(""); - } +/// tryParseFPImm - A floating point immediate expression operand. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { + SMLoc S = getLoc(); - if (Layout.size() == 0) { - // If we've still not pinned it down the register is malformed. - return false; + bool Hash = false; + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat '#' + Hash = true; + } + + // Handle negation, as that still comes through as a separate token. + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + Parser.Lex(); + } + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + Parser.Lex(); // Eat the token. + // Check for out of range values. As an exception, we let Zero through, + // as we handle that special case in post-processing before matching in + // order to use the zero register for it. + if (Val == -1 && !RealVal.isZero()) { + TokError("expected compatible register or floating-point constant"); + return MatchOperand_ParseFail; + } + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + if (Tok.is(AsmToken::Integer)) { + int64_t Val; + if (!isNegative && Tok.getString().startswith("0x")) { + Val = Tok.getIntVal(); + if (Val > 255 || Val < 0) { + TokError("encoded floating point value out of range"); + return MatchOperand_ParseFail; + } + } else { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); } + Parser.Lex(); // Eat the token. + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; } - RegNum = MatchRegisterName(LowerReg.substr(0, DotPos)); - if (RegNum == AArch64::NoRegister) { - RegNum = StringSwitch(LowerReg.substr(0, DotPos)) - .Case("ip0", AArch64::X16) - .Case("ip1", AArch64::X17) - .Case("fp", AArch64::X29) - .Case("lr", AArch64::X30) - .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0) - .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1) - .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2) - .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3) - .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4) - .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5) - .Case("v6", IsVec128 ? 
AArch64::Q6 : AArch64::D6) - .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7) - .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8) - .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9) - .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10) - .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11) - .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12) - .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13) - .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14) - .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15) - .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16) - .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17) - .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18) - .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19) - .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20) - .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21) - .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22) - .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23) - .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24) - .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25) - .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26) - .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27) - .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28) - .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29) - .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30) - .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31) - .Default(AArch64::NoRegister); - } - if (RegNum == AArch64::NoRegister) - return false; + if (!Hash) + return MatchOperand_NoMatch; - return true; + TokError("invalid floating point immediate"); + return MatchOperand_ParseFail; } +/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseRegister(SmallVectorImpl &Operands, - uint32_t &NumLanes) { - unsigned RegNum; - StringRef Layout; - SMLoc RegEndLoc, LayoutLoc; - SMLoc S = Parser.getTok().getLoc(); - - if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)) - return MatchOperand_NoMatch; +AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) { + SMLoc S = getLoc(); - Operands.push_back(AArch64Operand::CreateReg(RegNum, S, RegEndLoc)); + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat '#' + else if (Parser.getTok().isNot(AsmToken::Integer)) + // Operand should start from # or should be integer, emit error otherwise. + return MatchOperand_NoMatch; - if (Layout.size() != 0) { - unsigned long long TmpLanes = 0; - llvm::getAsUnsignedInteger(Layout.substr(1), 10, TmpLanes); - if (TmpLanes != 0) { - NumLanes = TmpLanes; - } else { - // If the number of lanes isn't specified explicitly, a valid instruction - // will have an element specifier and be capable of acting on the entire - // vector register. 
- switch (Layout.back()) { - default: llvm_unreachable("Invalid layout specifier"); - case 'b': NumLanes = 16; break; - case 'h': NumLanes = 8; break; - case 's': NumLanes = 4; break; - case 'd': NumLanes = 2; break; - case 'q': NumLanes = 1; break; + const MCExpr *Imm; + if (parseSymbolicImmVal(Imm)) + return MatchOperand_ParseFail; + else if (Parser.getTok().isNot(AsmToken::Comma)) { + uint64_t ShiftAmount = 0; + const MCConstantExpr *MCE = dyn_cast(Imm); + if (MCE) { + int64_t Val = MCE->getValue(); + if (Val > 0xfff && (Val & 0xfff) == 0) { + Imm = MCConstantExpr::Create(Val >> 12, getContext()); + ShiftAmount = 12; } } - - Operands.push_back(AArch64Operand::CreateToken(Layout, LayoutLoc)); + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, + getContext())); + return MatchOperand_Success; } + // Eat ',' Parser.Lex(); - return MatchOperand_Success; -} -bool -AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) { - // This callback is used for things like DWARF frame directives in - // assembly. They don't care about things like NEON layouts or lanes, they - // just want to be able to produce the DWARF register number. - StringRef LayoutSpec; - SMLoc RegEndLoc, LayoutLoc; - StartLoc = Parser.getTok().getLoc(); - - if (!IdentifyRegister(RegNo, RegEndLoc, LayoutSpec, LayoutLoc)) - return true; + // The optional operand must be "lsl #N" where N is non-negative. + if (!Parser.getTok().is(AsmToken::Identifier) || + !Parser.getTok().getIdentifier().equals_lower("lsl")) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + // Eat 'lsl' Parser.Lex(); - EndLoc = Parser.getTok().getLoc(); - return false; -} + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); + } -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseNamedImmOperand(const NamedImmMapper &Mapper, - SmallVectorImpl &Operands) { - // Since these operands occur in very limited circumstances, without - // alternatives, we actually signal an error if there is no match. If relaxing - // this, beware of unintended consequences: an immediate will be accepted - // during matching, no matter how it gets into the AArch64Operand. - const AsmToken &Tok = Parser.getTok(); - SMLoc S = Tok.getLoc(); + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } - if (Tok.is(AsmToken::Identifier)) { - bool ValidName; - uint32_t Code = Mapper.fromString(Tok.getString().lower(), ValidName); + int64_t ShiftAmount = Parser.getTok().getIntVal(); - if (!ValidName) { - Error(S, "operand specifier not recognised"); - return MatchOperand_ParseFail; - } + if (ShiftAmount < 0) { + Error(Parser.getTok().getLoc(), "positive shift amount required"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the number - Parser.Lex(); // We're done with the identifier. Eat it + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, + S, E, getContext())); + return MatchOperand_Success; +} - SMLoc E = Parser.getTok().getLoc(); - const MCExpr *Imm = MCConstantExpr::Create(Code, getContext()); - Operands.push_back(AArch64Operand::CreateImm(Imm, S, E)); - return MatchOperand_Success; - } else if (Tok.is(AsmToken::Hash)) { - Parser.Lex(); +/// parseCondCodeString - Parse a Condition Code string. 
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { + AArch64CC::CondCode CC = StringSwitch(Cond.lower()) + .Case("eq", AArch64CC::EQ) + .Case("ne", AArch64CC::NE) + .Case("cs", AArch64CC::HS) + .Case("hs", AArch64CC::HS) + .Case("cc", AArch64CC::LO) + .Case("lo", AArch64CC::LO) + .Case("mi", AArch64CC::MI) + .Case("pl", AArch64CC::PL) + .Case("vs", AArch64CC::VS) + .Case("vc", AArch64CC::VC) + .Case("hi", AArch64CC::HI) + .Case("ls", AArch64CC::LS) + .Case("ge", AArch64CC::GE) + .Case("lt", AArch64CC::LT) + .Case("gt", AArch64CC::GT) + .Case("le", AArch64CC::LE) + .Case("al", AArch64CC::AL) + .Case("nv", AArch64CC::NV) + .Default(AArch64CC::Invalid); + return CC; +} - const MCExpr *ImmVal; - if (ParseImmediate(ImmVal) != MatchOperand_Success) - return MatchOperand_ParseFail; +/// parseCondCode - Parse a Condition Code operand. +bool AArch64AsmParser::parseCondCode(OperandVector &Operands, + bool invertCondCode) { + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - const MCConstantExpr *CE = dyn_cast(ImmVal); - if (!CE || CE->getValue() < 0 || !Mapper.validImm(CE->getValue())) { - Error(S, "Invalid immediate for instruction"); - return MatchOperand_ParseFail; - } + StringRef Cond = Tok.getString(); + AArch64CC::CondCode CC = parseCondCodeString(Cond); + if (CC == AArch64CC::Invalid) + return TokError("invalid condition code"); + Parser.Lex(); // Eat identifier token. - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E)); - return MatchOperand_Success; + if (invertCondCode) { + if (CC == AArch64CC::AL || CC == AArch64CC::NV) + return TokError("condition codes AL and NV are invalid for this instruction"); + CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC)); } - Error(S, "unexpected operand for instruction"); - return MatchOperand_ParseFail; + Operands.push_back( + AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext())); + return false; } +/// tryParseOptionalShift - Some operands take an optional shift argument. Parse +/// them if present. AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseSysRegOperand( - SmallVectorImpl &Operands) { +AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); - - // Any MSR/MRS operand will be an identifier, and we want to store it as some - // kind of string: SPSel is valid for two different forms of MSR with two - // different encodings. There's no collision at the moment, but the potential - // is there. 
- if (!Tok.is(AsmToken::Identifier)) { + std::string LowerID = Tok.getString().lower(); + AArch64_AM::ShiftExtendType ShOp = + StringSwitch(LowerID) + .Case("lsl", AArch64_AM::LSL) + .Case("lsr", AArch64_AM::LSR) + .Case("asr", AArch64_AM::ASR) + .Case("ror", AArch64_AM::ROR) + .Case("msl", AArch64_AM::MSL) + .Case("uxtb", AArch64_AM::UXTB) + .Case("uxth", AArch64_AM::UXTH) + .Case("uxtw", AArch64_AM::UXTW) + .Case("uxtx", AArch64_AM::UXTX) + .Case("sxtb", AArch64_AM::SXTB) + .Case("sxth", AArch64_AM::SXTH) + .Case("sxtw", AArch64_AM::SXTW) + .Case("sxtx", AArch64_AM::SXTX) + .Default(AArch64_AM::InvalidShiftExtend); + + if (ShOp == AArch64_AM::InvalidShiftExtend) return MatchOperand_NoMatch; - } SMLoc S = Tok.getLoc(); - Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), S)); - Parser.Lex(); // Eat identifier - - return MatchOperand_Success; -} + Parser.Lex(); -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseLSXAddressOperand( - SmallVectorImpl &Operands) { - SMLoc S = Parser.getTok().getLoc(); - - unsigned RegNum; - SMLoc RegEndLoc, LayoutLoc; - StringRef Layout; - if(!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc) - || !AArch64MCRegisterClasses[AArch64::GPR64xspRegClassID].contains(RegNum) - || Layout.size() != 0) { - // Check Layout.size because we don't want to let "x3.4s" or similar - // through. - return MatchOperand_NoMatch; - } - Parser.Lex(); // Eat register + bool Hash = getLexer().is(AsmToken::Hash); + if (!Hash && getLexer().isNot(AsmToken::Integer)) { + if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR || + ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR || + ShOp == AArch64_AM::MSL) { + // We expect a number here. + TokError("expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } - if (Parser.getTok().is(AsmToken::RBrac)) { - // We're done - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E)); + // "extend" type operatoins don't need an immediate, #0 is implicit. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back( + AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext())); return MatchOperand_Success; } - // Otherwise, only ", #0" is valid + if (Hash) + Parser.Lex(); // Eat the '#'. 
- if (Parser.getTok().isNot(AsmToken::Comma)) { - Error(Parser.getTok().getLoc(), "expected ',' or ']' after register"); + // Make sure we do actually have a number + if (!Parser.getTok().is(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), + "expected integer shift amount"); return MatchOperand_ParseFail; } - Parser.Lex(); // Eat ',' - if (Parser.getTok().isNot(AsmToken::Hash)) { - Error(Parser.getTok().getLoc(), "expected '#0'"); + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) return MatchOperand_ParseFail; - } - Parser.Lex(); // Eat '#' - if (Parser.getTok().isNot(AsmToken::Integer) - || Parser.getTok().getIntVal() != 0 ) { - Error(Parser.getTok().getLoc(), "expected '#0'"); + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("expected #imm after shift specifier"); return MatchOperand_ParseFail; } - Parser.Lex(); // Eat '0' - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E)); + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateShiftExtend( + ShOp, MCE->getValue(), true, S, E, getContext())); return MatchOperand_Success; } -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseShiftExtend( - SmallVectorImpl &Operands) { - StringRef IDVal = Parser.getTok().getIdentifier(); - std::string LowerID = IDVal.lower(); - - A64SE::ShiftExtSpecifiers Spec = - StringSwitch(LowerID) - .Case("lsl", A64SE::LSL) - .Case("msl", A64SE::MSL) - .Case("lsr", A64SE::LSR) - .Case("asr", A64SE::ASR) - .Case("ror", A64SE::ROR) - .Case("uxtb", A64SE::UXTB) - .Case("uxth", A64SE::UXTH) - .Case("uxtw", A64SE::UXTW) - .Case("uxtx", A64SE::UXTX) - .Case("sxtb", A64SE::SXTB) - .Case("sxth", A64SE::SXTH) - .Case("sxtw", A64SE::SXTW) - .Case("sxtx", A64SE::SXTX) - .Default(A64SE::Invalid); - - if (Spec == A64SE::Invalid) - return MatchOperand_NoMatch; +/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for +/// the SYS instruction. Parse them specially so that we create a SYS MCInst. 
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + if (Name.find('.') != StringRef::npos) + return TokError("invalid operand"); - // Eat the shift - SMLoc S, E; - S = Parser.getTok().getLoc(); - Parser.Lex(); + Mnemonic = Name; + Operands.push_back( + AArch64Operand::CreateToken("sys", false, NameLoc, getContext())); - if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR && - Spec != A64SE::ROR && Spec != A64SE::MSL) { - // The shift amount can be omitted for the extending versions, but not real - // shifts: - // add x0, x0, x0, uxtb - // is valid, and equivalent to - // add x0, x0, x0, uxtb #0 - - if (Parser.getTok().is(AsmToken::Comma) || - Parser.getTok().is(AsmToken::EndOfStatement) || - Parser.getTok().is(AsmToken::RBrac)) { - Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, 0, true, - S, E)); - return MatchOperand_Success; - } - } + const AsmToken &Tok = Parser.getTok(); + StringRef Op = Tok.getString(); + SMLoc S = Tok.getLoc(); - // Eat # at beginning of immediate - if (!Parser.getTok().is(AsmToken::Hash)) { - Error(Parser.getTok().getLoc(), - "expected #imm after shift specifier"); - return MatchOperand_ParseFail; + const MCExpr *Expr = nullptr; + +#define SYS_ALIAS(op1, Cn, Cm, op2) \ + do { \ + Expr = MCConstantExpr::Create(op1, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ + Expr = MCConstantExpr::Create(op2, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + } while (0) + + if (Mnemonic == "ic") { + if (!Op.compare_lower("ialluis")) { + // SYS #0, C7, C1, #0 + SYS_ALIAS(0, 7, 1, 0); + } else if (!Op.compare_lower("iallu")) { + // SYS #0, C7, C5, #0 + SYS_ALIAS(0, 7, 5, 0); + } else if (!Op.compare_lower("ivau")) { + // SYS #3, C7, C5, #1 + SYS_ALIAS(3, 7, 5, 1); + } else { + return TokError("invalid operand for IC instruction"); + } + } else if (Mnemonic == "dc") { + if (!Op.compare_lower("zva")) { + // SYS #3, C7, C4, #1 + SYS_ALIAS(3, 7, 4, 1); + } else if (!Op.compare_lower("ivac")) { + // SYS #3, C7, C6, #1 + SYS_ALIAS(0, 7, 6, 1); + } else if (!Op.compare_lower("isw")) { + // SYS #0, C7, C6, #2 + SYS_ALIAS(0, 7, 6, 2); + } else if (!Op.compare_lower("cvac")) { + // SYS #3, C7, C10, #1 + SYS_ALIAS(3, 7, 10, 1); + } else if (!Op.compare_lower("csw")) { + // SYS #0, C7, C10, #2 + SYS_ALIAS(0, 7, 10, 2); + } else if (!Op.compare_lower("cvau")) { + // SYS #3, C7, C11, #1 + SYS_ALIAS(3, 7, 11, 1); + } else if (!Op.compare_lower("civac")) { + // SYS #3, C7, C14, #1 + SYS_ALIAS(3, 7, 14, 1); + } else if (!Op.compare_lower("cisw")) { + // SYS #0, C7, C14, #2 + SYS_ALIAS(0, 7, 14, 2); + } else { + return TokError("invalid operand for DC instruction"); + } + } else if (Mnemonic == "at") { + if (!Op.compare_lower("s1e1r")) { + // SYS #0, C7, C8, #0 + SYS_ALIAS(0, 7, 8, 0); + } else if (!Op.compare_lower("s1e2r")) { + // SYS #4, C7, C8, #0 + SYS_ALIAS(4, 7, 8, 0); + } else if (!Op.compare_lower("s1e3r")) { + // SYS #6, C7, C8, #0 + SYS_ALIAS(6, 7, 8, 0); + } else if (!Op.compare_lower("s1e1w")) { + // SYS #0, C7, C8, #1 + SYS_ALIAS(0, 7, 8, 1); + } else if (!Op.compare_lower("s1e2w")) { + // SYS #4, C7, C8, #1 + SYS_ALIAS(4, 7, 8, 1); + } else if (!Op.compare_lower("s1e3w")) { + // SYS #6, C7, C8, #1 + SYS_ALIAS(6, 7, 8, 
1); + } else if (!Op.compare_lower("s1e0r")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 2); + } else if (!Op.compare_lower("s1e0w")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 3); + } else if (!Op.compare_lower("s12e1r")) { + // SYS #4, C7, C8, #4 + SYS_ALIAS(4, 7, 8, 4); + } else if (!Op.compare_lower("s12e1w")) { + // SYS #4, C7, C8, #5 + SYS_ALIAS(4, 7, 8, 5); + } else if (!Op.compare_lower("s12e0r")) { + // SYS #4, C7, C8, #6 + SYS_ALIAS(4, 7, 8, 6); + } else if (!Op.compare_lower("s12e0w")) { + // SYS #4, C7, C8, #7 + SYS_ALIAS(4, 7, 8, 7); + } else { + return TokError("invalid operand for AT instruction"); + } + } else if (Mnemonic == "tlbi") { + if (!Op.compare_lower("vmalle1is")) { + // SYS #0, C8, C3, #0 + SYS_ALIAS(0, 8, 3, 0); + } else if (!Op.compare_lower("alle2is")) { + // SYS #4, C8, C3, #0 + SYS_ALIAS(4, 8, 3, 0); + } else if (!Op.compare_lower("alle3is")) { + // SYS #6, C8, C3, #0 + SYS_ALIAS(6, 8, 3, 0); + } else if (!Op.compare_lower("vae1is")) { + // SYS #0, C8, C3, #1 + SYS_ALIAS(0, 8, 3, 1); + } else if (!Op.compare_lower("vae2is")) { + // SYS #4, C8, C3, #1 + SYS_ALIAS(4, 8, 3, 1); + } else if (!Op.compare_lower("vae3is")) { + // SYS #6, C8, C3, #1 + SYS_ALIAS(6, 8, 3, 1); + } else if (!Op.compare_lower("aside1is")) { + // SYS #0, C8, C3, #2 + SYS_ALIAS(0, 8, 3, 2); + } else if (!Op.compare_lower("vaae1is")) { + // SYS #0, C8, C3, #3 + SYS_ALIAS(0, 8, 3, 3); + } else if (!Op.compare_lower("alle1is")) { + // SYS #4, C8, C3, #4 + SYS_ALIAS(4, 8, 3, 4); + } else if (!Op.compare_lower("vale1is")) { + // SYS #0, C8, C3, #5 + SYS_ALIAS(0, 8, 3, 5); + } else if (!Op.compare_lower("vaale1is")) { + // SYS #0, C8, C3, #7 + SYS_ALIAS(0, 8, 3, 7); + } else if (!Op.compare_lower("vmalle1")) { + // SYS #0, C8, C7, #0 + SYS_ALIAS(0, 8, 7, 0); + } else if (!Op.compare_lower("alle2")) { + // SYS #4, C8, C7, #0 + SYS_ALIAS(4, 8, 7, 0); + } else if (!Op.compare_lower("vale2is")) { + // SYS #4, C8, C3, #5 + SYS_ALIAS(4, 8, 3, 5); + } else if (!Op.compare_lower("vale3is")) { + // SYS #6, C8, C3, #5 + SYS_ALIAS(6, 8, 3, 5); + } else if (!Op.compare_lower("alle3")) { + // SYS #6, C8, C7, #0 + SYS_ALIAS(6, 8, 7, 0); + } else if (!Op.compare_lower("vae1")) { + // SYS #0, C8, C7, #1 + SYS_ALIAS(0, 8, 7, 1); + } else if (!Op.compare_lower("vae2")) { + // SYS #4, C8, C7, #1 + SYS_ALIAS(4, 8, 7, 1); + } else if (!Op.compare_lower("vae3")) { + // SYS #6, C8, C7, #1 + SYS_ALIAS(6, 8, 7, 1); + } else if (!Op.compare_lower("aside1")) { + // SYS #0, C8, C7, #2 + SYS_ALIAS(0, 8, 7, 2); + } else if (!Op.compare_lower("vaae1")) { + // SYS #0, C8, C7, #3 + SYS_ALIAS(0, 8, 7, 3); + } else if (!Op.compare_lower("alle1")) { + // SYS #4, C8, C7, #4 + SYS_ALIAS(4, 8, 7, 4); + } else if (!Op.compare_lower("vale1")) { + // SYS #0, C8, C7, #5 + SYS_ALIAS(0, 8, 7, 5); + } else if (!Op.compare_lower("vale2")) { + // SYS #4, C8, C7, #5 + SYS_ALIAS(4, 8, 7, 5); + } else if (!Op.compare_lower("vale3")) { + // SYS #6, C8, C7, #5 + SYS_ALIAS(6, 8, 7, 5); + } else if (!Op.compare_lower("vaale1")) { + // SYS #0, C8, C7, #7 + SYS_ALIAS(0, 8, 7, 7); + } else if (!Op.compare_lower("ipas2e1")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 4, 1); + } else if (!Op.compare_lower("ipas2le1")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 4, 5); + } else if (!Op.compare_lower("ipas2e1is")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 0, 1); + } else if (!Op.compare_lower("ipas2le1is")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 0, 5); + } else if (!Op.compare_lower("vmalls12e1")) { + // SYS #4, C8, C7, #6 + SYS_ALIAS(4, 8, 7, 
6); + } else if (!Op.compare_lower("vmalls12e1is")) { + // SYS #4, C8, C3, #6 + SYS_ALIAS(4, 8, 3, 6); + } else { + return TokError("invalid operand for TLBI instruction"); + } } - Parser.Lex(); - // Make sure we do actually have a number - if (!Parser.getTok().is(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), - "expected integer shift amount"); - return MatchOperand_ParseFail; - } - unsigned Amount = Parser.getTok().getIntVal(); - Parser.Lex(); - E = Parser.getTok().getLoc(); +#undef SYS_ALIAS - Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, Amount, false, - S, E)); + Parser.Lex(); // Eat operand. - return MatchOperand_Success; -} + bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); + bool HasRegister = false; -/// Try to parse a vector register token, If it is a vector register, -/// the token is eaten and return true. Otherwise return false. -bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, - StringRef &Layout, SMLoc &LayoutLoc) { - bool IsVector = true; - - if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)) - IsVector = false; - else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID] - .contains(RegNum) && - !AArch64MCRegisterClasses[AArch64::FPR128RegClassID] - .contains(RegNum)) - IsVector = false; - else if (Layout.size() == 0) - IsVector = false; - - if (!IsVector) - Error(Parser.getTok().getLoc(), "expected vector type register"); - - Parser.Lex(); // Eat this token. - return IsVector; -} + // Check for the optional register operand. + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat comma. + if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) + return TokError("expected register operand"); -// A vector list contains 1-4 consecutive registers. -// Now there are two kinds of vector list when number of vector > 1: -// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} -// (2) {Vn.layout - Vm.layout} -// If the layout is like .b/.h/.s/.d, also parse the lane. -AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList( - SmallVectorImpl &Operands) { - if (Parser.getTok().isNot(AsmToken::LCurly)) { - Error(Parser.getTok().getLoc(), "'{' expected"); - return MatchOperand_ParseFail; + HasRegister = true; } - SMLoc SLoc = Parser.getTok().getLoc(); - Parser.Lex(); // Eat '{' token. - unsigned Reg, Count = 1; - StringRef LayoutStr; - SMLoc RegEndLoc, LayoutLoc; - if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc)) - return MatchOperand_ParseFail; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + return TokError("unexpected token in argument list"); + } - if (Parser.getTok().is(AsmToken::Minus)) { - Parser.Lex(); // Eat the minus. + if (ExpectRegister && !HasRegister) { + return TokError("specified " + Mnemonic + " op requires a register"); + } + else if (!ExpectRegister && HasRegister) { + return TokError("specified " + Mnemonic + " op does not use a register"); + } - unsigned Reg2; - StringRef LayoutStr2; - SMLoc RegEndLoc2, LayoutLoc2; - SMLoc RegLoc2 = Parser.getTok().getLoc(); + Parser.Lex(); // Consume the EndOfStatement + return false; +} - if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) - return MatchOperand_ParseFail; - unsigned Space = (Reg < Reg2) ? 
(Reg2 - Reg) : (Reg2 + 32 - Reg); +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); - if (LayoutStr != LayoutStr2) { - Error(LayoutLoc2, "expected the same vector layout"); + // Can be either a #imm style literal or an option name + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + // Immediate operand. + if (Hash) + Parser.Lex(); // Eat the '#' + const MCExpr *ImmVal; + SMLoc ExprLoc = getLoc(); + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + Error(ExprLoc, "immediate value expected for barrier operand"); return MatchOperand_ParseFail; } - if (Space == 0 || Space > 3) { - Error(RegLoc2, "invalid number of vectors"); + if (MCE->getValue() < 0 || MCE->getValue() > 15) { + Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } + Operands.push_back( + AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); + return MatchOperand_Success; + } - Count += Space; - } else { - unsigned LastReg = Reg; - while (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - unsigned Reg2; - StringRef LayoutStr2; - SMLoc RegEndLoc2, LayoutLoc2; - SMLoc RegLoc2 = Parser.getTok().getLoc(); - - if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) - return MatchOperand_ParseFail; - unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg) - : (Reg2 + 32 - LastReg); - Count++; - - // The space between two vectors should be 1. And they should have the same layout. - // Total count shouldn't be great than 4 - if (Space != 1) { - Error(RegLoc2, "invalid space between two vectors"); - return MatchOperand_ParseFail; - } - if (LayoutStr != LayoutStr2) { - Error(LayoutLoc2, "expected the same vector layout"); - return MatchOperand_ParseFail; - } - if (Count > 4) { - Error(RegLoc2, "invalid number of vectors"); - return MatchOperand_ParseFail; - } - - LastReg = Reg2; - } + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; } - if (Parser.getTok().isNot(AsmToken::RCurly)) { - Error(Parser.getTok().getLoc(), "'}' expected"); + bool Valid; + unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid); + if (!Valid) { + TokError("invalid barrier option name"); return MatchOperand_ParseFail; } - SMLoc ELoc = Parser.getTok().getLoc(); - Parser.Lex(); // Eat '}' token. - A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr); - if (Count > 1) { // If count > 1, create vector list using super register. - bool IsVec64 = (Layout < A64Layout::VL_16B); - static unsigned SupRegIDs[3][2] = { - { AArch64::QPairRegClassID, AArch64::DPairRegClassID }, - { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID }, - { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID } - }; - unsigned SupRegID = SupRegIDs[Count - 2][static_cast(IsVec64)]; - unsigned Sub0 = IsVec64 ? 
AArch64::dsub_0 : AArch64::qsub_0; - const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - Reg = MRI->getMatchingSuperReg(Reg, Sub0, - &AArch64MCRegisterClasses[SupRegID]); + // The only valid named option for ISB is 'sy' + if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + TokError("'sy' or #imm operand expected"); + return MatchOperand_ParseFail; } + Operands.push_back( - AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc)); + AArch64Operand::CreateBarrier(Opt, getLoc(), getContext())); + Parser.Lex(); // Consume the option + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), + STI.getFeatureBits(), getContext())); + Parser.Lex(); // Eat identifier + + return MatchOperand_Success; +} + +/// tryParseVectorRegister - Parse a vector register operand. +bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { + if (Parser.getTok().isNot(AsmToken::Identifier)) + return true; + + SMLoc S = getLoc(); + // Check for a vector register specifier first. + StringRef Kind; + int64_t Reg = tryMatchVectorRegister(Kind, false); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); + // If there was an explicit qualifier, that goes on as a literal text + // operand. + if (!Kind.empty()) + Operands.push_back( + AArch64Operand::CreateToken(Kind, false, S, getContext())); + + // If there is an index specifier following the register, parse that too. if (Parser.getTok().is(AsmToken::LBrac)) { - uint32_t NumLanes = 0; - switch(Layout) { - case A64Layout::VL_B : NumLanes = 16; break; - case A64Layout::VL_H : NumLanes = 8; break; - case A64Layout::VL_S : NumLanes = 4; break; - case A64Layout::VL_D : NumLanes = 2; break; - default: - SMLoc Loc = getLexer().getLoc(); - Error(Loc, "expected comma before next operand"); - return MatchOperand_ParseFail; + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; } - return ParseNEONLane(Operands, NumLanes); - } else { - return MatchOperand_Success; + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); } + + return false; } -// FIXME: We would really like to be able to tablegen'erate this. -bool AArch64AsmParser:: -validateInstruction(MCInst &Inst, - const SmallVectorImpl &Operands) { - switch (Inst.getOpcode()) { - case AArch64::BFIwwii: - case AArch64::BFIxxii: - case AArch64::SBFIZwwii: - case AArch64::SBFIZxxii: - case AArch64::UBFIZwwii: - case AArch64::UBFIZxxii: { - unsigned ImmOps = Inst.getNumOperands() - 2; - int64_t ImmR = Inst.getOperand(ImmOps).getImm(); - int64_t ImmS = Inst.getOperand(ImmOps+1).getImm(); - - if (ImmR != 0 && ImmS >= ImmR) { - return Error(Operands[4]->getStartLoc(), - "requested insert overflows register"); - } +/// parseRegister - Parse a non-vector register operand. 
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) { + SMLoc S = getLoc(); + // Try for a vector register. + if (!tryParseVectorRegister(Operands)) return false; + + // Try for a scalar register. + int64_t Reg = tryParseRegister(); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); + + // A small number of instructions (FMOVXDhighr, for example) have "[1]" + // as a string token in the instruction itself. + if (getLexer().getKind() == AsmToken::LBrac) { + SMLoc LBracS = getLoc(); + Parser.Lex(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Integer)) { + SMLoc IntS = getLoc(); + int64_t Val = Tok.getIntVal(); + if (Val == 1) { + Parser.Lex(); + if (getLexer().getKind() == AsmToken::RBrac) { + SMLoc RBracS = getLoc(); + Parser.Lex(); + Operands.push_back( + AArch64Operand::CreateToken("[", false, LBracS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("1", false, IntS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("]", false, RBracS, getContext())); + return false; + } + } + } } - case AArch64::BFXILwwii: - case AArch64::BFXILxxii: - case AArch64::SBFXwwii: - case AArch64::SBFXxxii: - case AArch64::UBFXwwii: - case AArch64::UBFXxxii: { - unsigned ImmOps = Inst.getNumOperands() - 2; - int64_t ImmR = Inst.getOperand(ImmOps).getImm(); - int64_t ImmS = Inst.getOperand(ImmOps+1).getImm(); - int64_t RegWidth = 0; - switch (Inst.getOpcode()) { - case AArch64::SBFXxxii: case AArch64::UBFXxxii: case AArch64::BFXILxxii: - RegWidth = 64; - break; - case AArch64::SBFXwwii: case AArch64::UBFXwwii: case AArch64::BFXILwwii: - RegWidth = 32; - break; + + return false; +} + +bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { + bool HasELFModifier = false; + AArch64MCExpr::VariantKind RefKind; + + if (Parser.getTok().is(AsmToken::Colon)) { + Parser.Lex(); // Eat ':" + HasELFModifier = true; + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; } - if (ImmS >= RegWidth || ImmS < ImmR) { - return Error(Operands[4]->getStartLoc(), - "requested extract overflows register"); + std::string LowerCase = Parser.getTok().getIdentifier().lower(); + RefKind = StringSwitch(LowerCase) + .Case("lo12", AArch64MCExpr::VK_LO12) + .Case("abs_g3", AArch64MCExpr::VK_ABS_G3) + .Case("abs_g2", AArch64MCExpr::VK_ABS_G2) + .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S) + .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC) + .Case("abs_g1", AArch64MCExpr::VK_ABS_G1) + .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S) + .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC) + .Case("abs_g0", AArch64MCExpr::VK_ABS_G0) + .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) + .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) + .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) + .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) + .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0) + .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC) + .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12) + .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12) + .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC) + .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2) + .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1) + .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC) + .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0) + .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC) + 
.Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12) + .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12) + .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC) + .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12) + .Case("got", AArch64MCExpr::VK_GOT_PAGE) + .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) + .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) + .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) + .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1) + .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC) + .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE) + .Default(AArch64MCExpr::VK_INVALID); + + if (RefKind == AArch64MCExpr::VK_INVALID) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; } - return false; - } - case AArch64::ICix: { - int64_t ImmVal = Inst.getOperand(0).getImm(); - A64IC::ICValues ICOp = static_cast(ImmVal); - if (!A64IC::NeedsRegister(ICOp)) { - return Error(Operands[1]->getStartLoc(), - "specified IC op does not use a register"); + + Parser.Lex(); // Eat identifier + + if (Parser.getTok().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); + return true; } - return false; + Parser.Lex(); // Eat ':' } - case AArch64::ICi: { - int64_t ImmVal = Inst.getOperand(0).getImm(); - A64IC::ICValues ICOp = static_cast(ImmVal); - if (A64IC::NeedsRegister(ICOp)) { - return Error(Operands[1]->getStartLoc(), - "specified IC op requires a register"); + + if (getParser().parseExpression(ImmVal)) + return true; + + if (HasELFModifier) + ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext()); + + return false; +} + +/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. +bool AArch64AsmParser::parseVectorList(OperandVector &Operands) { + assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); + SMLoc S = getLoc(); + Parser.Lex(); // Eat left bracket token. + StringRef Kind; + int64_t FirstReg = tryMatchVectorRegister(Kind, true); + if (FirstReg == -1) + return true; + int64_t PrevReg = FirstReg; + unsigned Count = 1; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); + + if (Space == 0 || Space > 3) { + return Error(Loc, "invalid number of vectors"); } - return false; + + Count += Space; } - case AArch64::TLBIix: { - int64_t ImmVal = Inst.getOperand(0).getImm(); - A64TLBI::TLBIValues TLBIOp = static_cast(ImmVal); - if (!A64TLBI::NeedsRegister(TLBIOp)) { - return Error(Operands[1]->getStartLoc(), - "specified TLBI op does not use a register"); + else { + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma token. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. 
+ if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + // Registers must be incremental (with wraparound at 31) + if (getContext().getRegisterInfo()->getEncodingValue(Reg) != + (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) + return Error(Loc, "registers must be sequential"); + + PrevReg = Reg; + ++Count; } - return false; } - case AArch64::TLBIi: { - int64_t ImmVal = Inst.getOperand(0).getImm(); - A64TLBI::TLBIValues TLBIOp = static_cast(ImmVal); - if (A64TLBI::NeedsRegister(TLBIOp)) { - return Error(Operands[1]->getStartLoc(), - "specified TLBI op requires a register"); + + if (Parser.getTok().isNot(AsmToken::RCurly)) + return Error(getLoc(), "'}' expected"); + Parser.Lex(); // Eat the '}' token. + + if (Count > 4) + return Error(S, "invalid number of vectors"); + + unsigned NumElements = 0; + char ElementKind = 0; + if (!Kind.empty()) + parseValidVectorKind(Kind, NumElements, ElementKind); + + Operands.push_back(AArch64Operand::CreateVectorList( + FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); + + // If there is an index specifier following the list, parse that too. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; } - return false; + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); } + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + unsigned RegNum = MatchRegisterName(Tok.getString().lower()); + + MCContext &Ctx = getContext(); + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum)) + return MatchOperand_NoMatch; + + SMLoc S = getLoc(); + Parser.Lex(); // Eat register + + if (Parser.getTok().isNot(AsmToken::Comma)) { + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; } + Parser.Lex(); // Eat comma. - return false; + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat hash + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (Parser.parseExpression(ImmVal) || !isa(ImmVal) || + cast(ImmVal)->getValue() != 0) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; } +/// parseOperand - Parse a arm instruction operand. For now this parses the +/// operand regardless of the mnemonic. +bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode) { + // Check if the current operand has a custom associated parser, if so, try to + // custom parse the operand, or fallback to the general approach. 
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + + // Nothing custom, so do general case parsing. + SMLoc S, E; + switch (getLexer().getKind()) { + default: { + SMLoc S = getLoc(); + const MCExpr *Expr; + if (parseSymbolicImmVal(Expr)) + return Error(S, "invalid operand"); + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + return false; + } + case AsmToken::LBrac: { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("[", false, Loc, + getContext())); + Parser.Lex(); // Eat '[' -// Parses the instruction *together with* all operands, appending each parsed -// operand to the "Operands" list -bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) { - StringRef PatchedName = StringSwitch(Name.lower()) - .Case("beq", "b.eq") - .Case("bne", "b.ne") - .Case("bhs", "b.hs") - .Case("bcs", "b.cs") - .Case("blo", "b.lo") - .Case("bcc", "b.cc") - .Case("bmi", "b.mi") - .Case("bpl", "b.pl") - .Case("bvs", "b.vs") - .Case("bvc", "b.vc") - .Case("bhi", "b.hi") - .Case("bls", "b.ls") - .Case("bge", "b.ge") - .Case("blt", "b.lt") - .Case("bgt", "b.gt") - .Case("ble", "b.le") - .Case("bal", "b.al") - .Case("bnv", "b.nv") - .Default(Name); - - size_t CondCodePos = PatchedName.find('.'); - - StringRef Mnemonic = PatchedName.substr(0, CondCodePos); - Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc)); - - if (CondCodePos != StringRef::npos) { - // We have a condition code - SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1); - StringRef CondStr = PatchedName.substr(CondCodePos + 1, StringRef::npos); - A64CC::CondCodes Code; - - Code = A64StringToCondCode(CondStr); - - if (Code == A64CC::Invalid) { - Error(S, "invalid condition code"); - Parser.eatToEndOfStatement(); + // There's no comma after a '[', so we can parse the next operand + // immediately. + return parseOperand(Operands, false, false); + } + case AsmToken::LCurly: + return parseVectorList(Operands); + case AsmToken::Identifier: { + // If we're expecting a Condition Code operand, then just parse that. + if (isCondCode) + return parseCondCode(Operands, invertCondCode); + + // If it's a register name, parse it. + if (!parseRegister(Operands)) + return false; + + // This could be an optional "shift" or "extend" operand. + OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); + // We can only continue if no tokens were eaten. + if (GotShift != MatchOperand_NoMatch) + return GotShift; + + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = getLoc(); + if (getParser().parseExpression(IdVal)) return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); + return false; + } + case AsmToken::Integer: + case AsmToken::Real: + case AsmToken::Hash: { + // #42 -> immediate. 
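+    // The leading '#' is optional here; bare integer, floating-point and
+    // symbolic immediates all take this path.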
+ S = getLoc(); + if (getLexer().is(AsmToken::Hash)) + Parser.Lex(); + + // Parse a negative sign + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + // We need to consume this token only when we have a Real, otherwise + // we let parseSymbolicImmVal take care of it + if (Parser.getLexer().peekTok().is(AsmToken::Real)) + Parser.Lex(); } - SMLoc DotL = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos); + // The only Real that should come through here is a literal #0.0 for + // the fcmp[e] r, #0.0 instructions. They expect raw token operands, + // so convert the value. + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && + Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && + Mnemonic != "fcmlt") + return TokError("unexpected floating point literal"); + else if (IntVal != 0 || isNegative) + return TokError("expected floating-point constant #0.0"); + Parser.Lex(); // Eat the token. + + Operands.push_back( + AArch64Operand::CreateToken("#0", false, S, getContext())); + Operands.push_back( + AArch64Operand::CreateToken(".0", false, S, getContext())); + return false; + } - Operands.push_back(AArch64Operand::CreateToken(".", DotL)); - SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 3); - Operands.push_back(AArch64Operand::CreateCondCode(Code, S, E)); + const MCExpr *ImmVal; + if (parseSymbolicImmVal(ImmVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext())); + return false; } + } +} - // Now we parse the operands of this instruction +/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its +/// operands. +bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + Name = StringSwitch(Name.lower()) + .Case("beq", "b.eq") + .Case("bne", "b.ne") + .Case("bhs", "b.hs") + .Case("bcs", "b.cs") + .Case("blo", "b.lo") + .Case("bcc", "b.cc") + .Case("bmi", "b.mi") + .Case("bpl", "b.pl") + .Case("bvs", "b.vs") + .Case("bvc", "b.vc") + .Case("bhi", "b.hi") + .Case("bls", "b.ls") + .Case("bge", "b.ge") + .Case("blt", "b.lt") + .Case("bgt", "b.gt") + .Case("ble", "b.le") + .Case("bal", "b.al") + .Case("bnv", "b.nv") + .Default(Name); + + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + + // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. 
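+  // They are handled up front by parseSysAlias rather than by the generic
+  // operand parsing further down.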
+ if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { + bool IsError = parseSysAlias(Head, NameLoc, Operands); + if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) + Parser.eatToEndOfStatement(); + return IsError; + } + + Operands.push_back( + AArch64Operand::CreateToken(Head, false, NameLoc, getContext())); + Mnemonic = Head; + + // Handle condition codes for a branch mnemonic + if (Head == "b" && Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start + 1, Next); + + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data())); + AArch64CC::CondCode CC = parseCondCodeString(Head); + if (CC == AArch64CC::Invalid) + return Error(SuffixLoc, "invalid condition code"); + Operands.push_back( + AArch64Operand::CreateToken(".", true, SuffixLoc, getContext())); + Operands.push_back( + AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); + } + + // Add the remaining tokens in the mnemonic. + while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start, Next); + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data()) + 1); + Operands.push_back( + AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext())); + } + + // Conditional compare instructions have a Condition Code operand, which needs + // to be parsed and an immediate operand created. + bool condCodeFourthOperand = + (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || + Head == "fccmpe" || Head == "fcsel" || Head == "csel" || + Head == "csinc" || Head == "csinv" || Head == "csneg"); + + // These instructions are aliases to some of the conditional select + // instructions. However, the condition code is inverted in the aliased + // instruction. + // + // FIXME: Is this the correct way to handle these? Or should the parser + // generate the aliased instructions directly? + bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); + bool condCodeThirdOperand = + (Head == "cinc" || Head == "cinv" || Head == "cneg"); + + // Read the remaining operands. if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. - if (ParseOperand(Operands, Mnemonic)) { + if (parseOperand(Operands, false, false)) { Parser.eatToEndOfStatement(); return true; } + unsigned N = 2; while (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. + Parser.Lex(); // Eat the comma. // Parse and remember the operand. - if (ParseOperand(Operands, Mnemonic)) { + if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || + (N == 3 && condCodeThirdOperand) || + (N == 2 && condCodeSecondOperand), + condCodeSecondOperand || condCodeThirdOperand)) { Parser.eatToEndOfStatement(); return true; } - // After successfully parsing some operands there are two special cases to // consider (i.e. notional operands not separated by commas). Both are due // to memory specifiers: @@ -2322,52 +3125,704 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, // in the given context! 
if (Parser.getTok().is(AsmToken::RBrac)) { SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateToken("]", Loc)); + Operands.push_back(AArch64Operand::CreateToken("]", false, Loc, + getContext())); Parser.Lex(); } if (Parser.getTok().is(AsmToken::Exclaim)) { SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateToken("!", Loc)); + Operands.push_back(AArch64Operand::CreateToken("!", false, Loc, + getContext())); Parser.Lex(); } + + ++N; } } if (getLexer().isNot(AsmToken::EndOfStatement)) { - SMLoc Loc = getLexer().getLoc(); + SMLoc Loc = Parser.getTok().getLoc(); Parser.eatToEndOfStatement(); - return Error(Loc, "expected comma before next operand"); + return Error(Loc, "unexpected token in argument list"); } - // Eat the EndOfStatement - Parser.Lex(); - + Parser.Lex(); // Consume the EndOfStatement return false; } +// FIXME: This entire function is a giant hack to provide us with decent +// operand range validation/diagnostics until TableGen/MC can be extended +// to support autogeneration of this kind of validation. +bool AArch64AsmParser::validateInstruction(MCInst &Inst, + SmallVectorImpl &Loc) { + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + // Check for indexed addressing modes w/ the base register being the + // same as a destination/source register or pair load where + // the Rt == Rt2. All of those are undefined behaviour. + switch (Inst.getOpcode()) { + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDP instruction, writeback base " + "is also a destination"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable LDP instruction, writeback base " + "is also a destination"); + // FALLTHROUGH + } + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::LDPXi: { + unsigned Rt = Inst.getOperand(0).getReg(); + unsigned Rt2 = Inst.getOperand(1).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPQpost: + case AArch64::LDPQpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPSWpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::STPDpost: + case AArch64::STPDpre: + case AArch64::STPQpost: + case AArch64::STPQpre: + case AArch64::STPSpost: + case AArch64::STPSpre: + case AArch64::STPWpost: + case AArch64::STPWpre: + case AArch64::STPXpost: + case AArch64::STPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STP instruction, writeback base " + "is also a source"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable STP instruction, writeback base " + "is also a source"); + break; + } + case AArch64::LDRBBpre: + case AArch64::LDRBpre: + case AArch64::LDRHHpre: + case AArch64::LDRHpre: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpre: + case 
AArch64::LDRSHWpre: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpre: + case AArch64::LDRWpre: + case AArch64::LDRXpre: + case AArch64::LDRBBpost: + case AArch64::LDRBpost: + case AArch64::LDRHHpost: + case AArch64::LDRHpost: + case AArch64::LDRSBWpost: + case AArch64::LDRSBXpost: + case AArch64::LDRSHWpost: + case AArch64::LDRSHXpost: + case AArch64::LDRSWpost: + case AArch64::LDRWpost: + case AArch64::LDRXpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDR instruction, writeback base " + "is also a source"); + break; + } + case AArch64::STRBBpost: + case AArch64::STRBpost: + case AArch64::STRHHpost: + case AArch64::STRHpost: + case AArch64::STRWpost: + case AArch64::STRXpost: + case AArch64::STRBBpre: + case AArch64::STRBpre: + case AArch64::STRHHpre: + case AArch64::STRHpre: + case AArch64::STRWpre: + case AArch64::STRXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STR instruction, writeback base " + "is also a source"); + break; + } + } + + // Now check immediate ranges. Separate from the above as there is overlap + // in the instructions being checked and this keeps the nested conditionals + // to a minimum. + switch (Inst.getOpcode()) { + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: { + // Annoyingly we can't do this in the isAddSubImm predicate, so there is + // some slight duplication here. + if (Inst.getOperand(2).isExpr()) { + const MCExpr *Expr = Inst.getOperand(2).getExpr(); + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + return Error(Loc[2], "invalid immediate expression"); + } + + // Only allow these with ADDXri. 
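+      // (i.e. the MachO @PAGEOFF and @TLVPPAGEOFF references)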
+ if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && + Inst.getOpcode() == AArch64::ADDXri) + return false; + + // Only allow these with ADDXri/ADDWri + if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) && + (Inst.getOpcode() == AArch64::ADDXri || + Inst.getOpcode() == AArch64::ADDWri)) + return false; + + // Don't allow expressions in the immediate field otherwise + return Error(Loc[2], "invalid immediate expression"); + } + return false; + } + default: + return false; + } +} + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { + switch (ErrCode) { + case Match_MissingFeature: + return Error(Loc, + "instruction requires a CPU feature not currently enabled"); + case Match_InvalidOperand: + return Error(Loc, "invalid operand for instruction"); + case Match_InvalidSuffix: + return Error(Loc, "invalid type suffix for instruction"); + case Match_InvalidCondCode: + return Error(Loc, "expected AArch64 condition code"); + case Match_AddSubRegExtendSmall: + return Error(Loc, + "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegExtendLarge: + return Error(Loc, + "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubSecondSource: + return Error(Loc, + "expected compatible register, symbol or integer in range [0, 4095]"); + case Match_LogicalSecondSource: + return Error(Loc, "expected compatible register or logical immediate"); + case Match_InvalidMovImm32Shift: + return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); + case Match_InvalidMovImm64Shift: + return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); + case Match_AddSubRegShift32: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); + case Match_AddSubRegShift64: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); + case Match_InvalidFPImm: + return Error(Loc, + "expected compatible register or floating-point constant"); + case Match_InvalidMemoryIndexedSImm9: + return Error(Loc, "index must be an integer in range [-256, 255]."); + case Match_InvalidMemoryIndexed4SImm7: + return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); + case Match_InvalidMemoryIndexed8SImm7: + return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); + case Match_InvalidMemoryIndexed16SImm7: + return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); + case Match_InvalidMemoryWExtend8: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0"); + case Match_InvalidMemoryWExtend16: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); + case Match_InvalidMemoryWExtend32: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); + case Match_InvalidMemoryWExtend64: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); + case Match_InvalidMemoryWExtend128: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); + case Match_InvalidMemoryXExtend8: + return Error(Loc, + "expected 
'lsl' or 'sxtx' with optional shift of #0"); + case Match_InvalidMemoryXExtend16: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); + case Match_InvalidMemoryXExtend32: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); + case Match_InvalidMemoryXExtend64: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); + case Match_InvalidMemoryXExtend128: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); + case Match_InvalidMemoryIndexed1: + return Error(Loc, "index must be an integer in range [0, 4095]."); + case Match_InvalidMemoryIndexed2: + return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); + case Match_InvalidMemoryIndexed4: + return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); + case Match_InvalidMemoryIndexed8: + return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); + case Match_InvalidMemoryIndexed16: + return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_7: + return Error(Loc, "immediate must be an integer in range [0, 7]."); + case Match_InvalidImm0_15: + return Error(Loc, "immediate must be an integer in range [0, 15]."); + case Match_InvalidImm0_31: + return Error(Loc, "immediate must be an integer in range [0, 31]."); + case Match_InvalidImm0_63: + return Error(Loc, "immediate must be an integer in range [0, 63]."); + case Match_InvalidImm0_127: + return Error(Loc, "immediate must be an integer in range [0, 127]."); + case Match_InvalidImm0_65535: + return Error(Loc, "immediate must be an integer in range [0, 65535]."); + case Match_InvalidImm1_8: + return Error(Loc, "immediate must be an integer in range [1, 8]."); + case Match_InvalidImm1_16: + return Error(Loc, "immediate must be an integer in range [1, 16]."); + case Match_InvalidImm1_32: + return Error(Loc, "immediate must be an integer in range [1, 32]."); + case Match_InvalidImm1_64: + return Error(Loc, "immediate must be an integer in range [1, 64]."); + case Match_InvalidIndex1: + return Error(Loc, "expected lane specifier '[1]'"); + case Match_InvalidIndexB: + return Error(Loc, "vector lane must be an integer in range [0, 15]."); + case Match_InvalidIndexH: + return Error(Loc, "vector lane must be an integer in range [0, 7]."); + case Match_InvalidIndexS: + return Error(Loc, "vector lane must be an integer in range [0, 3]."); + case Match_InvalidIndexD: + return Error(Loc, "vector lane must be an integer in range [0, 1]."); + case Match_InvalidLabel: + return Error(Loc, "expected label or encodable integer pc offset"); + case Match_MRS: + return Error(Loc, "expected readable system register"); + case Match_MSR: + return Error(Loc, "expected writable system register or pstate"); + case Match_MnemonicFail: + return Error(Loc, "unrecognized instruction mnemonic"); + default: + assert(0 && "unexpected error code!"); + return Error(Loc, "invalid instruction format"); + } +} + +static const char *getSubtargetFeatureName(unsigned Val); + +bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + AArch64Operand &Op = static_cast(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + + StringRef Tok = Op.getToken(); + unsigned NumOperands = Operands.size(); + + if (NumOperands == 4 && Tok == 
"lsl") { + AArch64Operand &Op2 = static_cast(*Operands[2]); + AArch64Operand &Op3 = static_cast(*Operands[3]); + if (Op2.isReg() && Op3.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast(Op3.getImm()); + if (Op3CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t NewOp3Val = 0; + uint64_t NewOp4Val = 0; + if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( + Op2.getReg())) { + NewOp3Val = (32 - Op3Val) & 0x1f; + NewOp4Val = 31 - Op3Val; + } else { + NewOp3Val = (64 - Op3Val) & 0x3f; + NewOp4Val = 63 - Op3Val; + } + + const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext()); + const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); + + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op.getStartLoc(), getContext()); + Operands.push_back(AArch64Operand::CreateImm( + NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext())); + Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(), + Op3.getEndLoc(), getContext()); + } + } + } else if (NumOperands == 5) { + // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and + // UBFIZ -> UBFM aliases. + if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { + AArch64Operand &Op1 = static_cast(*Operands[1]); + AArch64Operand &Op3 = static_cast(*Operands[3]); + AArch64Operand &Op4 = static_cast(*Operands[4]); + + if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast(Op3.getImm()); + const MCConstantExpr *Op4CE = dyn_cast(Op4.getImm()); + + if (Op3CE && Op4CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t Op4Val = Op4CE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1.getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (Op3Val >= RegWidth) + return Error(Op3.getStartLoc(), + "expected integer in range [0, 31]"); + if (Op4Val < 1 || Op4Val > RegWidth) + return Error(Op4.getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t NewOp3Val = 0; + if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( + Op1.getReg())) + NewOp3Val = (32 - Op3Val) & 0x1f; + else + NewOp3Val = (64 - Op3Val) & 0x3f; + + uint64_t NewOp4Val = Op4Val - 1; + + if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) + return Error(Op4.getStartLoc(), + "requested insert overflows register"); + + const MCExpr *NewOp3 = + MCConstantExpr::Create(NewOp3Val, getContext()); + const MCExpr *NewOp4 = + MCConstantExpr::Create(NewOp4Val, getContext()); + Operands[3] = AArch64Operand::CreateImm( + NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext()); + Operands[4] = AArch64Operand::CreateImm( + NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext()); + if (Tok == "bfi") + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "sbfiz") + Operands[0] = AArch64Operand::CreateToken( + "sbfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "ubfiz") + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op.getStartLoc(), getContext()); + else + llvm_unreachable("No valid mnemonic for alias?"); + } + } + + // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and + // UBFX -> UBFM aliases. 
+ } else if (NumOperands == 5 && + (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { + AArch64Operand &Op1 = static_cast(*Operands[1]); + AArch64Operand &Op3 = static_cast(*Operands[3]); + AArch64Operand &Op4 = static_cast(*Operands[4]); + + if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast(Op3.getImm()); + const MCConstantExpr *Op4CE = dyn_cast(Op4.getImm()); + + if (Op3CE && Op4CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t Op4Val = Op4CE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1.getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (Op3Val >= RegWidth) + return Error(Op3.getStartLoc(), + "expected integer in range [0, 31]"); + if (Op4Val < 1 || Op4Val > RegWidth) + return Error(Op4.getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t NewOp4Val = Op3Val + Op4Val - 1; + + if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) + return Error(Op4.getStartLoc(), + "requested extract overflows register"); + + const MCExpr *NewOp4 = + MCConstantExpr::Create(NewOp4Val, getContext()); + Operands[4] = AArch64Operand::CreateImm( + NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext()); + if (Tok == "bfxil") + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "sbfx") + Operands[0] = AArch64Operand::CreateToken( + "sbfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "ubfx") + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op.getStartLoc(), getContext()); + else + llvm_unreachable("No valid mnemonic for alias?"); + } + } + } + } + // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. + // InstAlias can't quite handle this since the reg classes aren't + // subclasses. + if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { + // The source register can be Wn here, but the matcher expects a + // GPR64. Twiddle it here if necessary. + AArch64Operand &Op = static_cast(*Operands[2]); + if (Op.isReg()) { + unsigned Reg = getXRegFromWReg(Op.getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + // FIXME: Likewise for sxt[bh] with a Xd dst operand + else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { + AArch64Operand &Op = static_cast(*Operands[1]); + if (Op.isReg() && + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op.getReg())) { + // The source register can be Wn here, but the matcher expects a + // GPR64. Twiddle it here if necessary. + AArch64Operand &Op = static_cast(*Operands[2]); + if (Op.isReg()) { + unsigned Reg = getXRegFromWReg(Op.getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + } + // FIXME: Likewise for uxt[bh] with a Xd dst operand + else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { + AArch64Operand &Op = static_cast(*Operands[1]); + if (Op.isReg() && + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op.getReg())) { + // The source register can be Wn here, but the matcher expects a + // GPR32. Twiddle it here if necessary. + AArch64Operand &Op = static_cast(*Operands[1]); + if (Op.isReg()) { + unsigned Reg = getWRegFromXReg(Op.getReg()); + Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + } + + // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. 
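+  // The #0.0 immediate is replaced by WZR or XZR, chosen according to the
+  // width of the floating-point destination register.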
+ if (NumOperands == 3 && Tok == "fmov") { + AArch64Operand &RegOp = static_cast(*Operands[1]); + AArch64Operand &ImmOp = static_cast(*Operands[2]); + if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { + unsigned zreg = + AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + RegOp.getReg()) + ? AArch64::WZR + : AArch64::XZR; + Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + + MCInst Inst; + // First try to match against the secondary set of tables containing the + // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). + unsigned MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); + + // If that fails, try against the alternate table containing long-form NEON: + // "fadd v0.2s, v1.2s, v2.2s" + if (MatchResult != Match_Success) + MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + + switch (MatchResult) { + case Match_Success: { + // Perform range checking and other semantic validations + SmallVector OperandLocs; + NumOperands = Operands.size(); + for (unsigned i = 1; i < NumOperands; ++i) + OperandLocs.push_back(Operands[i]->getStartLoc()); + if (validateInstruction(Inst, OperandLocs)) + return true; + + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + } + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (neon, e.g.). + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } + case Match_MnemonicFail: + return showMatchError(IDLoc, MatchResult); + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + // If the match failed on a suffix token operand, tweak the diagnostic + // accordingly. 
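+    // (report Match_InvalidSuffix instead of a generic invalid-operand error)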
+ if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() && + ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) + MatchResult = Match_InvalidSuffix; + + return showMatchError(ErrorLoc, MatchResult); + } + case Match_InvalidMemoryIndexed1: + case Match_InvalidMemoryIndexed2: + case Match_InvalidMemoryIndexed4: + case Match_InvalidMemoryIndexed8: + case Match_InvalidMemoryIndexed16: + case Match_InvalidCondCode: + case Match_AddSubRegExtendSmall: + case Match_AddSubRegExtendLarge: + case Match_AddSubSecondSource: + case Match_LogicalSecondSource: + case Match_AddSubRegShift32: + case Match_AddSubRegShift64: + case Match_InvalidMovImm32Shift: + case Match_InvalidMovImm64Shift: + case Match_InvalidFPImm: + case Match_InvalidMemoryWExtend8: + case Match_InvalidMemoryWExtend16: + case Match_InvalidMemoryWExtend32: + case Match_InvalidMemoryWExtend64: + case Match_InvalidMemoryWExtend128: + case Match_InvalidMemoryXExtend8: + case Match_InvalidMemoryXExtend16: + case Match_InvalidMemoryXExtend32: + case Match_InvalidMemoryXExtend64: + case Match_InvalidMemoryXExtend128: + case Match_InvalidMemoryIndexed4SImm7: + case Match_InvalidMemoryIndexed8SImm7: + case Match_InvalidMemoryIndexed16SImm7: + case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_7: + case Match_InvalidImm0_15: + case Match_InvalidImm0_31: + case Match_InvalidImm0_63: + case Match_InvalidImm0_127: + case Match_InvalidImm0_65535: + case Match_InvalidImm1_8: + case Match_InvalidImm1_16: + case Match_InvalidImm1_32: + case Match_InvalidImm1_64: + case Match_InvalidIndex1: + case Match_InvalidIndexB: + case Match_InvalidIndexH: + case Match_InvalidIndexS: + case Match_InvalidIndexD: + case Match_InvalidLabel: + case Match_MSR: + case Match_MRS: { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + // Any time we get here, there's nothing fancy to do. Just get the + // operand SMLoc and display the diagnostic. 
+ SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return showMatchError(ErrorLoc, MatchResult); + } + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + +/// ParseDirective parses the arm specific directives bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); + SMLoc Loc = DirectiveID.getLoc(); if (IDVal == ".hword") - return ParseDirectiveWord(2, DirectiveID.getLoc()); - else if (IDVal == ".word") - return ParseDirectiveWord(4, DirectiveID.getLoc()); - else if (IDVal == ".xword") - return ParseDirectiveWord(8, DirectiveID.getLoc()); - else if (IDVal == ".tlsdesccall") - return ParseDirectiveTLSDescCall(DirectiveID.getLoc()); - - return true; + return parseDirectiveWord(2, Loc); + if (IDVal == ".word") + return parseDirectiveWord(4, Loc); + if (IDVal == ".xword") + return parseDirectiveWord(8, Loc); + if (IDVal == ".tlsdesccall") + return parseDirectiveTLSDescCall(Loc); + + return parseDirectiveLOH(IDVal, Loc); } /// parseDirectiveWord /// ::= .word [ expression (, expression)* ] -bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { +bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; if (getParser().parseExpression(Value)) - return false; + return true; getParser().getStreamer().EmitValue(Value, Size); @@ -2375,10 +3830,8 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { break; // FIXME: Improve diagnostic. - if (getLexer().isNot(AsmToken::Comma)) { - Error(L, "unexpected token in directive"); - return false; - } + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); Parser.Lex(); } } @@ -2389,15 +3842,14 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { // parseDirectiveTLSDescCall: // ::= .tlsdesccall symbol -bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) { +bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { StringRef Name; - if (getParser().parseIdentifier(Name)) { - Error(L, "expected symbol after directive"); - return false; - } + if (getParser().parseIdentifier(Name)) + return Error(L, "expected symbol after directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); + Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext()); MCInst Inst; Inst.setOpcode(AArch64::TLSDESCCALL); @@ -2407,271 +3859,181 @@ bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) { return false; } +/// ::= .loh label1, ..., labelN +/// The number of arguments depends on the loh identifier. +bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { + if (IDVal != MCLOHDirectiveName()) + return true; + MCLOHType Kind; + if (getParser().getTok().isNot(AsmToken::Identifier)) { + if (getParser().getTok().isNot(AsmToken::Integer)) + return TokError("expected an identifier or a number in directive"); + // We successfully get a numeric value for the identifier. + // Check if it is valid. + int64_t Id = getParser().getTok().getIntVal(); + Kind = (MCLOHType)Id; + // Check that Id does not overflow MCLOHType. 
+ if (!isValidMCLOHType(Kind) || Id != Kind) + return TokError("invalid numeric identifier in directive"); + } else { + StringRef Name = getTok().getIdentifier(); + // We successfully parse an identifier. + // Check if it is a recognized one. + int Id = MCLOHNameToId(Name); + + if (Id == -1) + return TokError("invalid identifier in directive"); + Kind = (MCLOHType)Id; + } + // Consume the identifier. + Lex(); + // Get the number of arguments of this LOH. + int NbArgs = MCLOHIdToNbArgs(Kind); + + assert(NbArgs != -1 && "Invalid number of arguments"); + + SmallVector Args; + for (int Idx = 0; Idx < NbArgs; ++Idx) { + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + Args.push_back(getContext().GetOrCreateSymbol(Name)); + + if (Idx + 1 == NbArgs) + break; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); + Lex(); + } + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); -bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { - MCInst Inst; - unsigned MatchResult; - MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, - MatchingInlineAsm); + getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); + return false; +} - if (ErrorInfo != ~0U && ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); +bool +AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend) { + ELFRefKind = AArch64MCExpr::VK_INVALID; + DarwinRefKind = MCSymbolRefExpr::VK_None; + Addend = 0; + + if (const AArch64MCExpr *AE = dyn_cast(Expr)) { + ELFRefKind = AE->getKind(); + Expr = AE->getSubExpr(); + } + + const MCSymbolRefExpr *SE = dyn_cast(Expr); + if (SE) { + // It's a simple symbol reference with no addend. 
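+    // (any wrapping AArch64MCExpr modifier was already peeled off above)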
+ DarwinRefKind = SE->getKind(); + return true; + } - switch (MatchResult) { - default: break; - case Match_Success: - if (validateInstruction(Inst, Operands)) - return true; + const MCBinaryExpr *BE = dyn_cast(Expr); + if (!BE) + return false; - Out.EmitInstruction(Inst, STI); + SE = dyn_cast(BE->getLHS()); + if (!SE) return false; - case Match_MissingFeature: - Error(IDLoc, "instruction requires a CPU feature not currently enabled"); - return true; - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { - ErrorLoc = ((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - } + DarwinRefKind = SE->getKind(); - return Error(ErrorLoc, "invalid operand for instruction"); - } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + if (BE->getOpcode() != MCBinaryExpr::Add && + BE->getOpcode() != MCBinaryExpr::Sub) + return false; - case Match_AddSubRegExtendSmall: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegExtendLarge: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegShift32: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); - case Match_AddSubRegShift64: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); - case Match_AddSubSecondSource: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected compatible register, symbol or integer in range [0, 4095]"); - case Match_CVTFixedPos32: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 32]"); - case Match_CVTFixedPos64: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 64]"); - case Match_CondCode: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected AArch64 condition code"); - case Match_FPImm: - // Any situation which allows a nontrivial floating-point constant also - // allows a register. 
- return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected compatible register or floating-point constant"); - case Match_FPZero: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected floating-point constant #0.0 or invalid register type"); - case Match_Label: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected label or encodable integer pc offset"); - case Match_Lane1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected lane specifier '[1]'"); - case Match_LoadStoreExtend32_1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0"); - case Match_LoadStoreExtend32_2: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); - case Match_LoadStoreExtend32_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); - case Match_LoadStoreExtend32_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); - case Match_LoadStoreExtend32_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtw' with optional shift of #0 or #4"); - case Match_LoadStoreExtend64_1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0"); - case Match_LoadStoreExtend64_2: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); - case Match_LoadStoreExtend64_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); - case Match_LoadStoreExtend64_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); - case Match_LoadStoreExtend64_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); - case Match_LoadStoreSImm7_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer multiple of 4 in range [-256, 252]"); - case Match_LoadStoreSImm7_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer multiple of 8 in range [-512, 504]"); - case Match_LoadStoreSImm7_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer multiple of 16 in range [-1024, 1008]"); - case Match_LoadStoreSImm9: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [-256, 255]"); - case Match_LoadStoreUImm12_1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 4095]"); - case Match_LoadStoreUImm12_2: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 8190]"); - case Match_LoadStoreUImm12_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 16380]"); - case Match_LoadStoreUImm12_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 32760]"); - case Match_LoadStoreUImm12_16: - 
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 65520]"); - case Match_LogicalSecondSource: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected compatible register or logical immediate"); - case Match_MOVWUImm16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected relocated symbol or integer in range [0, 65535]"); - case Match_MRS: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected readable system register"); - case Match_MSR: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected writable system register or pstate"); - case Match_NamedImm_at: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic 'at' operand: s1e[0-3][rw] or s12e[01][rw]"); - case Match_NamedImm_dbarrier: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15] or symbolic barrier operand"); - case Match_NamedImm_dc: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic 'dc' operand"); - case Match_NamedImm_ic: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'ic' operand: 'ialluis', 'iallu' or 'ivau'"); - case Match_NamedImm_isb: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15] or 'sy'"); - case Match_NamedImm_prefetch: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected prefetch hint: p(ld|st|i)l[123](strm|keep)"); - case Match_NamedImm_tlbi: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected translation buffer invalidation operand"); - case Match_UImm16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 65535]"); - case Match_UImm3: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 7]"); - case Match_UImm4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15]"); - case Match_UImm5: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 31]"); - case Match_UImm6: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 63]"); - case Match_UImm7: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 127]"); - case Match_Width32: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [, 31]"); - case Match_Width64: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [, 63]"); - case Match_ShrImm8: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 8]"); - case Match_ShrImm16: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 16]"); - case Match_ShrImm32: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 32]"); - case Match_ShrImm64: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 64]"); - case Match_ShlImm8: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 7]"); - case Match_ShlImm16: - return 
Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15]"); - case Match_ShlImm32: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 31]"); - case Match_ShlImm64: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 63]"); - } + // See if the addend is is a constant, otherwise there's more going + // on here than we can deal with. + auto AddendExpr = dyn_cast(BE->getRHS()); + if (!AddendExpr) + return false; - llvm_unreachable("Implement any new match types added!"); - return true; + Addend = AddendExpr->getValue(); + if (BE->getOpcode() == MCBinaryExpr::Sub) + Addend = -Addend; + + // It's some symbol reference + a constant addend, but really + // shouldn't use both Darwin and ELF syntax. + return ELFRefKind == AArch64MCExpr::VK_INVALID || + DarwinRefKind == MCSymbolRefExpr::VK_None; } -void AArch64Operand::print(raw_ostream &OS) const { +/// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmParser() { + RegisterMCAsmParser X(TheAArch64leTarget); + RegisterMCAsmParser Y(TheAArch64beTarget); + + RegisterMCAsmParser Z(TheARM64leTarget); + RegisterMCAsmParser W(TheARM64beTarget); +} + +#define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME +#define GET_MATCHER_IMPLEMENTATION +#include "AArch64GenAsmMatcher.inc" + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + AArch64Operand &Op = static_cast(AsmOp); + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax. + int64_t ExpectedVal; switch (Kind) { - case k_CondCode: - OS << ""; + default: + return Match_InvalidOperand; + case MCK__35_0: + ExpectedVal = 0; break; - case k_FPImmediate: - OS << ""; + case MCK__35_1: + ExpectedVal = 1; break; - case k_ImmWithLSL: - OS << ""; + case MCK__35_12: + ExpectedVal = 12; break; - case k_Immediate: - getImm()->print(OS); + case MCK__35_16: + ExpectedVal = 16; break; - case k_Register: - OS << "'; + case MCK__35_2: + ExpectedVal = 2; break; - case k_Token: - OS << '\'' << getToken() << '\''; + case MCK__35_24: + ExpectedVal = 24; break; - case k_ShiftExtend: - OS << ""; + case MCK__35_3: + ExpectedVal = 3; break; - case k_SysReg: { - StringRef Name(SysReg.Data, SysReg.Length); - OS << "'; + case MCK__35_32: + ExpectedVal = 32; break; - } - default: - llvm_unreachable("No idea how to print this kind of operand"); + case MCK__35_4: + ExpectedVal = 4; + break; + case MCK__35_48: + ExpectedVal = 48; + break; + case MCK__35_6: + ExpectedVal = 6; + break; + case MCK__35_64: + ExpectedVal = 64; + break; + case MCK__35_8: + ExpectedVal = 8; break; } + if (!Op.isImm()) + return Match_InvalidOperand; + const MCConstantExpr *CE = dyn_cast(Op.getImm()); + if (!CE) + return Match_InvalidOperand; + if (CE->getValue() == ExpectedVal) + return Match_Success; + return Match_InvalidOperand; } - -void AArch64Operand::dump() const { - print(errs()); -} - - -/// Force static initialization. 
-extern "C" void LLVMInitializeAArch64AsmParser() { - RegisterMCAsmParser X(TheAArch64leTarget); - RegisterMCAsmParser Y(TheAArch64beTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_MATCHER_IMPLEMENTATION -#include "AArch64GenAsmMatcher.inc" diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt index e81ec70437a4..cc0a9d86a14e 100644 --- a/lib/Target/AArch64/AsmParser/CMakeLists.txt +++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt @@ -1,3 +1,6 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + add_llvm_library(LLVMAArch64AsmParser AArch64AsmParser.cpp ) + diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt index 2d8f63212378..11eb9d55f615 100644 --- a/lib/Target/AArch64/AsmParser/LLVMBuild.txt +++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; +;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile index 56c9ef52ea58..00268c76f8e8 100644 --- a/lib/Target/AArch64/AsmParser/Makefile +++ b/lib/Target/AArch64/AsmParser/Makefile @@ -9,7 +9,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64AsmParser -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' ARM target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index dfc10afcdcfe..789d549bb156 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -1,37 +1,51 @@ set(LLVM_TARGET_DEFINITIONS AArch64.td) -tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) -tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) +tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) +tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) add_public_tablegen_target(AArch64CommonTableGen) add_llvm_target(AArch64CodeGen + AArch64AddressTypePromotion.cpp + AArch64AdvSIMDScalarPass.cpp AArch64AsmPrinter.cpp - AArch64BranchFixupPass.cpp + AArch64BranchRelaxation.cpp + AArch64CleanupLocalDynamicTLSPass.cpp + AArch64CollectLOH.cpp + AArch64ConditionalCompares.cpp + AArch64DeadRegisterDefinitionsPass.cpp + AArch64ExpandPseudoInsts.cpp + AArch64FastISel.cpp AArch64FrameLowering.cpp AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp - AArch64MachineFunctionInfo.cpp + AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp + 
AArch64PromoteConstant.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp + AArch64StorePairSuppress.cpp AArch64Subtarget.cpp AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp - ) +) +add_dependencies(LLVMAArch64CodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) -add_subdirectory(TargetInfo) add_subdirectory(Utils) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 01f1497dc33f..6de27d6d51a5 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1,4 +1,4 @@ -//===- AArch64Disassembler.cpp - Disassembler for AArch64 ISA -------------===// +//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,241 +7,169 @@ // //===----------------------------------------------------------------------===// // -// This file contains the functions necessary to decode AArch64 instruction -// bitpatterns into MCInsts (with the help of TableGenerated information from -// the instruction definitions). // //===----------------------------------------------------------------------===// -#include "AArch64.h" -#include "AArch64RegisterInfo.h" +#include "AArch64Disassembler.h" +#include "AArch64ExternalSymbolizer.h" #include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -#define DEBUG_TYPE "arm-disassembler" +#define DEBUG_TYPE "aarch64-disassembler" -typedef MCDisassembler::DecodeStatus DecodeStatus; +// Pull DecodeStatus and its enum values into the global namespace. +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; -namespace { -/// AArch64 disassembler for all AArch64 platforms. -class AArch64Disassembler : public MCDisassembler { -public: - /// Initializes the disassembler. - /// - AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx) { - } - - ~AArch64Disassembler() {} - - /// See MCDisassembler. - DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const override; -}; - -} - -// Forward-declarations used in the auto-generated files. 
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus -DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus -DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); +// Forward declare these because the autogenerated code will reference them. +// Definitions are further down. static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, - unsigned OptionHiS, - uint64_t Address, - const void *Decoder); - - -static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst, - unsigned Imm6Bits, - uint64_t Address, +static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst, - unsigned Imm6Bits, - uint64_t Address, +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const 
void *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, - unsigned RmBits, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val, +static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); - -template -static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, - unsigned FullImm, - uint64_t Address, - const void *Decoder); - -template -static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst, - unsigned Bits, +static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, - unsigned ShiftAmount, +static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -template -static DecodeStatus -DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, - uint64_t Address, const void *Decoder); - -static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, - unsigned ShiftAmount, +static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn, + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus 
DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); -static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst, - unsigned Insn, +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); - -template -static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); - -static DecodeStatus -DecodeSysRegOperand(const A64SysReg::SysRegMapper &InstMapper, - llvm::MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); - - -static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val, - uint64_t Address, +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); - -static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); - -static bool Check(DecodeStatus &Out, DecodeStatus In); - -#include "AArch64GenDisassemblerTables.inc" +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus 
DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { @@ -258,487 +186,479 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { llvm_unreachable("Invalid DecodeStatus!"); } +#include "AArch64GenDisassemblerTables.inc" +#include "AArch64GenInstrInfo.inc" + +#define Success llvm::MCDisassembler::Success +#define Fail llvm::MCDisassembler::Fail +#define SoftFail llvm::MCDisassembler::SoftFail + +static MCDisassembler *createAArch64Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64Disassembler(STI, Ctx); +} + DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - const MemoryObject &Region, - uint64_t Address, - raw_ostream &os, - raw_ostream &cs) const { + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { CommentStream = &cs; uint8_t bytes[4]; + Size = 0; // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, bytes) == -1) { - Size = 0; - return MCDisassembler::Fail; - } + if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) + return Fail; + Size = 4; // Encoded as a small-endian 32-bit word in the stream. - uint32_t insn = (bytes[3] << 24) | - (bytes[2] << 16) | - (bytes[1] << 8) | - (bytes[0] << 0); + uint32_t insn = + (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); // Calling the auto-generated decoder function. 
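  // (Both the decoder tables and decodeInstruction() are generated by
  // TableGen's -gen-disassembler backend into AArch64GenDisassemblerTables.inc.
  // For example, the little-endian byte sequence 0x20 0x00 0x02 0x8b yields
  // insn == 0x8b020020, which decodes to "add x0, x1, x2".)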
- DecodeStatus result = decodeInstruction(DecoderTableA6432, MI, insn, Address, - this, STI); - if (result != MCDisassembler::Fail) { - Size = 4; - return result; - } - - MI.clear(); - Size = 0; - return MCDisassembler::Fail; + return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); } -static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { - const AArch64Disassembler *Dis = static_cast(D); - const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); - return RegInfo->getRegClass(RC).getRegister(RegNo); +static MCSymbolizer * +createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo, MCContext *Ctx, + MCRelocationInfo *RelInfo) { + return new llvm::AArch64ExternalSymbolizer( + *Ctx, + std::unique_ptr(RelInfo), + GetOpInfo, SymbolLookUp, DisInfo); } -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; +extern "C" void LLVMInitializeAArch64Disassembler() { + TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, + createAArch64ExternalSymbolizer); - uint16_t Register = getReg(Decoder, AArch64::GPR64RegClassID, RegNo); - Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, + createAArch64ExternalSymbolizer); } -static DecodeStatus -DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - uint16_t Register = getReg(Decoder, AArch64::GPR64xspRegClassID, RegNo); - Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; -} +static const unsigned FPR128DecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { if (RegNo > 31) - return MCDisassembler::Fail; + return Fail; - uint16_t Register = getReg(Decoder, AArch64::GPR32RegClassID, RegNo); + unsigned Register = FPR128DecoderTable[RegNo]; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } -static DecodeStatus -DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; 
- - uint16_t Register = getReg(Decoder, AArch64::GPR32wspRegClassID, RegNo); - Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 15) + return Fail; + return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); } -static DecodeStatus -DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - uint16_t Register = getReg(Decoder, AArch64::FPR8RegClassID, RegNo); - Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; -} +static const unsigned FPR64DecoderTable[] = { + AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4, + AArch64::D5, AArch64::D6, AArch64::D7, AArch64::D8, AArch64::D9, + AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14, + AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19, + AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24, + AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29, + AArch64::D30, AArch64::D31 +}; -static DecodeStatus -DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { if (RegNo > 31) - return MCDisassembler::Fail; + return Fail; - uint16_t Register = getReg(Decoder, AArch64::FPR16RegClassID, RegNo); + unsigned Register = FPR64DecoderTable[RegNo]; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } +static const unsigned FPR32DecoderTable[] = { + AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, + AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9, + AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14, + AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19, + AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24, + AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29, + AArch64::S30, AArch64::S31 +}; -static DecodeStatus -DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { if (RegNo > 31) - return MCDisassembler::Fail; + return Fail; - uint16_t Register = getReg(Decoder, AArch64::FPR32RegClassID, RegNo); + unsigned Register = FPR32DecoderTable[RegNo]; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } -static DecodeStatus -DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static const unsigned FPR16DecoderTable[] = { + AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, + AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9, + AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14, + AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19, + AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24, + AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29, + AArch64::H30, AArch64::H31 +}; + +static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { if (RegNo > 31) - return 
MCDisassembler::Fail; + return Fail; - uint16_t Register = getReg(Decoder, AArch64::FPR64RegClassID, RegNo); + unsigned Register = FPR16DecoderTable[RegNo]; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } -static DecodeStatus -DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { - if (RegNo > 15) - return MCDisassembler::Fail; - - return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder); -} +static const unsigned FPR8DecoderTable[] = { + AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4, + AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9, + AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14, + AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19, + AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24, + AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29, + AArch64::B30, AArch64::B31 +}; -static DecodeStatus -DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { +static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { if (RegNo > 31) - return MCDisassembler::Fail; + return Fail; - uint16_t Register = getReg(Decoder, AArch64::FPR128RegClassID, RegNo); + unsigned Register = FPR8DecoderTable[RegNo]; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } -static DecodeStatus -DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { - if (RegNo > 15) - return MCDisassembler::Fail; - - return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder); -} +static const unsigned GPR64DecoderTable[] = { + AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9, + AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14, + AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19, + AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24, + AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP, + AArch64::LR, AArch64::XZR +}; -static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { - if (RegNo > 30) - return MCDisassembler::Fail; +static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; - uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo); + unsigned Register = GPR64DecoderTable[RegNo]; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } -static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo, - unsigned RegID, - const void *Decoder) { +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { if (RegNo > 31) - return MCDisassembler::Fail; - - uint16_t Register = getReg(Decoder, RegID, RegNo); + return Fail; + unsigned Register = GPR64DecoderTable[RegNo]; + if (Register == AArch64::XZR) + Register = AArch64::SP; Inst.addOperand(MCOperand::CreateReg(Register)); - return MCDisassembler::Success; + return Success; } -static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - 
return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID, - Decoder); -} +static const unsigned GPR32DecoderTable[] = { + AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, + AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9, + AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14, + AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19, + AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24, + AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29, + AArch64::W30, AArch64::WZR +}; -static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { - return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID, - Decoder); -} + if (RegNo > 31) + return Fail; -static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder) { - return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID, - Decoder); + unsigned Register = GPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { - return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID, - Decoder); -} + if (RegNo > 31) + return Fail; -static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID, - Decoder); + unsigned Register = GPR32DecoderTable[RegNo]; + if (Register == AArch64::WZR) + Register = AArch64::WSP; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { - return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID, - Decoder); -} +static const unsigned VectorDecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; -static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, - unsigned OptionHiS, - uint64_t Address, - const void *Decoder) { - // Option{1} must be 1. OptionHiS is made up of {Option{2}, Option{1}, - // S}. Hence we want to check bit 1. - if (!(OptionHiS & 2)) - return MCDisassembler::Fail; +static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; - Inst.addOperand(MCOperand::CreateImm(OptionHiS)); - return MCDisassembler::Success; + unsigned Register = VectorDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst, - unsigned Imm6Bits, - uint64_t Address, - const void *Decoder) { - // In the 32-bit variant, bit 6 must be zero. 
I.e. the immediate must be - // between 0 and 31. - if (Imm6Bits > 31) - return MCDisassembler::Fail; +static const unsigned QQDecoderTable[] = { + AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4, + AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8, + AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12, + AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16, + AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20, + AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24, + AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28, + AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0 +}; - Inst.addOperand(MCOperand::CreateImm(Imm6Bits)); - return MCDisassembler::Success; +static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst, - unsigned Imm6Bits, - uint64_t Address, - const void *Decoder) { - // 1 <= Imm <= 32. Encoded as 64 - Imm so: 63 >= Encoded >= 32. - if (Imm6Bits < 32) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateImm(Imm6Bits)); - return MCDisassembler::Success; -} +static const unsigned QQQDecoderTable[] = { + AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4, + AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7, + AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10, + AArch64::Q9_Q10_Q11, AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13, + AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16, + AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19, + AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22, + AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25, + AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28, + AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31, + AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1 +}; -static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, - unsigned RmBits, - uint64_t Address, - const void *Decoder) { - // Any bits are valid in the instruction (they're architecturally ignored), - // but a code generator should insert 0. 
- Inst.addOperand(MCOperand::CreateImm(0)); - return MCDisassembler::Success; +static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(8 - Val)); - return MCDisassembler::Success; -} +static const unsigned QQQQDecoderTable[] = { + AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5, + AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8, + AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11, + AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14, + AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17, + AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20, + AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23, + AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26, + AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29, + AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0, + AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2 +}; -static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(16 - Val)); - return MCDisassembler::Success; +static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(32 - Val)); - return MCDisassembler::Success; -} +static const unsigned DDDecoderTable[] = { + AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4, + AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8, + AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12, + AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16, + AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20, + AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24, + AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28, + AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0 +}; -static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(64 - Val)); - return MCDisassembler::Success; +static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - if (Val > 7) - return MCDisassembler::Fail; +static const unsigned DDDDecoderTable[] = { + AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4, + AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7, + AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10, + 
AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13, + AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16, + AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19, + AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22, + AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25, + AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28, + AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31, + AArch64::D30_D31_D0, AArch64::D31_D0_D1 +}; - Inst.addOperand(MCOperand::CreateImm(Val)); - return MCDisassembler::Success; +static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - if (Val > 15) - return MCDisassembler::Fail; +static const unsigned DDDDDecoderTable[] = { + AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5, + AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8, + AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11, + AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14, + AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17, + AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20, + AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23, + AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26, + AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29, + AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0, + AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2 +}; - Inst.addOperand(MCOperand::CreateImm(Val)); - return MCDisassembler::Success; +static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; } -static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - if (Val > 31) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateImm(Val)); - return MCDisassembler::Success; +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + // scale{5} is asserted as 1 in tblgen. 
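+  // The operand added below is 64 - Imm, i.e. the fraction width used by the
+  // fixed-point conversions; e.g. an encoded Imm of 0x1f becomes 0x3f once
+  // bit 5 is set, giving an immediate of 64 - 63 == 1.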
+ Imm |= 0x20; + Inst.addOperand(MCOperand::CreateImm(64 - Imm)); + return Success; } -static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - if (Val > 63) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateImm(Val)); - return MCDisassembler::Success; +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Imm)); + return Success; } -template -static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, - unsigned FullImm, - uint64_t Address, - const void *Decoder) { - unsigned Imm16 = FullImm & 0xffff; - unsigned Shift = FullImm >> 16; +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + int64_t ImmVal = Imm; + const AArch64Disassembler *Dis = + static_cast(Decoder); + + // Sign-extend 19-bit immediate. + if (ImmVal & (1 << (19 - 1))) + ImmVal |= ~((1LL << 19) - 1); - if (RegWidth == 32 && Shift > 1) return MCDisassembler::Fail; + if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, + Inst.getOpcode() != AArch64::LDRXl, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + return Success; +} - Inst.addOperand(MCOperand::CreateImm(Imm16)); - Inst.addOperand(MCOperand::CreateImm(Shift)); - return MCDisassembler::Success; +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1)); + Inst.addOperand(MCOperand::CreateImm(Imm & 1)); + return Success; } -template -static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst, - unsigned Bits, +static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { - uint64_t Imm; - if (!A64Imms::isLogicalImmBits(RegWidth, Bits, Imm)) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateImm(Bits)); - return MCDisassembler::Success; -} + const AArch64Disassembler *Dis = + static_cast(Decoder); + const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); + Imm |= 0x8000; + Inst.addOperand(MCOperand::CreateImm(Imm)); -static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, - unsigned ShiftAmount, - uint64_t Address, - const void *Decoder) { - // Only values 0-4 are valid for this 3-bit field - if (ShiftAmount > 4) - return MCDisassembler::Fail; + bool ValidNamed; + (void)AArch64SysReg::MRSMapper(STI.getFeatureBits()) + .toString(Imm, ValidNamed); - Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); - return MCDisassembler::Success; + return ValidNamed ? Success : Fail; } -static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, - unsigned ShiftAmount, +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { - // Only values below 32 are valid for a 32-bit register - if (ShiftAmount > 31) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned ImmS = fieldFromInstruction(Insn, 10, 6); - unsigned ImmR = fieldFromInstruction(Insn, 16, 6); - unsigned SF = fieldFromInstruction(Insn, 31, 1); - - // Undef for 0b11 just in case it occurs. 
Don't want the compiler to optimise - // out assertions that it thinks should never be hit. - enum OpcTypes { SBFM = 0, BFM, UBFM, Undef } Opc; - Opc = (OpcTypes)fieldFromInstruction(Insn, 29, 2); - - if (!SF) { - // ImmR and ImmS must be between 0 and 31 for 32-bit instructions. - if (ImmR > 31 || ImmS > 31) - return MCDisassembler::Fail; - } + const AArch64Disassembler *Dis = + static_cast(Decoder); + const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - if (SF) { - DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - // BFM MCInsts use Rd as a source too. - if (Opc == BFM) DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); - } else { - DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder); - // BFM MCInsts use Rd as a source too. - if (Opc == BFM) DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Address, Decoder); - } - - // ASR and LSR have more specific patterns so they won't get here: - assert(!(ImmS == 31 && !SF && Opc != BFM) - && "shift should have used auto decode"); - assert(!(ImmS == 63 && SF && Opc != BFM) - && "shift should have used auto decode"); - - // Extension instructions similarly: - if (Opc == SBFM && ImmR == 0) { - assert((ImmS != 7 && ImmS != 15) && "extension got here"); - assert((ImmS != 31 || SF == 0) && "extension got here"); - } else if (Opc == UBFM && ImmR == 0) { - assert((SF != 0 || (ImmS != 7 && ImmS != 15)) && "extension got here"); - } + Imm |= 0x8000; + Inst.addOperand(MCOperand::CreateImm(Imm)); - if (Opc == UBFM) { - // It might be a LSL instruction, which actually takes the shift amount - // itself as an MCInst operand. - if (SF && (ImmS + 1) % 64 == ImmR) { - Inst.setOpcode(AArch64::LSLxxi); - Inst.addOperand(MCOperand::CreateImm(63 - ImmS)); - return MCDisassembler::Success; - } else if (!SF && (ImmS + 1) % 32 == ImmR) { - Inst.setOpcode(AArch64::LSLwwi); - Inst.addOperand(MCOperand::CreateImm(31 - ImmS)); - return MCDisassembler::Success; - } - } - - // Otherwise it's definitely either an extract or an insert depending on which - // of ImmR or ImmS is larger. - unsigned ExtractOp, InsertOp; - switch (Opc) { - default: llvm_unreachable("unexpected instruction trying to decode bitfield"); - case SBFM: - ExtractOp = SF ? AArch64::SBFXxxii : AArch64::SBFXwwii; - InsertOp = SF ? AArch64::SBFIZxxii : AArch64::SBFIZwwii; - break; - case BFM: - ExtractOp = SF ? AArch64::BFXILxxii : AArch64::BFXILwwii; - InsertOp = SF ? AArch64::BFIxxii : AArch64::BFIwwii; - break; - case UBFM: - ExtractOp = SF ? AArch64::UBFXxxii : AArch64::UBFXwwii; - InsertOp = SF ? AArch64::UBFIZxxii : AArch64::UBFIZwwii; - break; - } - - // Otherwise it's a boring insert or extract - Inst.addOperand(MCOperand::CreateImm(ImmR)); - Inst.addOperand(MCOperand::CreateImm(ImmS)); - - - if (ImmS < ImmR) - Inst.setOpcode(InsertOp); - else - Inst.setOpcode(ExtractOp); + bool ValidNamed; + (void)AArch64SysReg::MSRMapper(STI.getFeatureBits()) + .toString(Imm, ValidNamed); - return MCDisassembler::Success; + return ValidNamed ? 
Success : Fail; } static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, @@ -761,812 +681,879 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, // Add the lane Inst.addOperand(MCOperand::CreateImm(1)); - return MCDisassembler::Success; + return Success; } +static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::CreateImm(Add - Imm)); + return Success; +} -static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - DecodeStatus Result = MCDisassembler::Success; - unsigned Rt = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(Insn, 10, 5); - unsigned SImm7 = fieldFromInstruction(Insn, 15, 7); - unsigned L = fieldFromInstruction(Insn, 22, 1); - unsigned V = fieldFromInstruction(Insn, 26, 1); - unsigned Opc = fieldFromInstruction(Insn, 30, 2); - - // Not an official name, but it turns out that bit 23 distinguishes indexed - // from non-indexed operations. - unsigned Indexed = fieldFromInstruction(Insn, 23, 1); - - if (Indexed && L == 0) { - // The MCInst for an indexed store has an out operand and 4 ins: - // Rn_wb, Rt, Rt2, Rn, Imm - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - } - - // You shouldn't load to the same register twice in an instruction... - if (L && Rt == Rt2) - Result = MCDisassembler::SoftFail; - - // ... or do any operation that writes-back to a transfer register. But note - // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. - if (Indexed && V == 0 && Rn != 31 && (Rt == Rn || Rt2 == Rn)) - Result = MCDisassembler::SoftFail; - - // Exactly how we decode the MCInst's registers depends on the Opc and V - // fields of the instruction. These also obviously determine the size of the - // operation so we can fill in that information while we're at it. 
- if (V) { - // The instruction operates on the FP/SIMD registers - switch (Opc) { - default: return MCDisassembler::Fail; - case 0: - DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder); - DecodeFPR32RegisterClass(Inst, Rt2, Address, Decoder); - break; - case 1: - DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); - DecodeFPR64RegisterClass(Inst, Rt2, Address, Decoder); - break; - case 2: - DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - DecodeFPR128RegisterClass(Inst, Rt2, Address, Decoder); - break; - } - } else { - switch (Opc) { - default: return MCDisassembler::Fail; - case 0: - DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder); - break; - case 1: - assert(L && "unexpected \"store signed\" attempt"); - DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder); - break; - case 2: - DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder); - break; - } - } - - if (Indexed && L == 1) { - // The MCInst for an indexed load has 3 out operands and an 3 ins: - // Rt, Rt2, Rn_wb, Rt2, Rn, Imm - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - } - - - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - Inst.addOperand(MCOperand::CreateImm(SImm7)); +static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); + return Success; +} - return Result; +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 64); } -static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst, - uint32_t Val, - uint64_t Address, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(Val, 0, 5); - unsigned Rn = fieldFromInstruction(Val, 5, 5); - unsigned Rt2 = fieldFromInstruction(Val, 10, 5); - unsigned MemSize = fieldFromInstruction(Val, 30, 2); - - DecodeStatus S = MCDisassembler::Success; - if (Rt == Rt2) S = MCDisassembler::SoftFail; - - switch (MemSize) { - case 2: - if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder))) - return MCDisassembler::Fail; - break; - case 3: - if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder))) - return MCDisassembler::Fail; - break; - default: - llvm_unreachable("Invalid MemSize in DecodeLoadPairExclusiveInstruction"); - } +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); +} - if (!Check(S, DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder))) - return MCDisassembler::Fail; +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 32); +} - return S; +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); } -template -static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { - SomeNamedImmMapper Mapper; - bool ValidNamed; - Mapper.toString(Val, 
ValidNamed); - if (ValidNamed || Mapper.validImm(Val)) { - Inst.addOperand(MCOperand::CreateImm(Val)); - return MCDisassembler::Success; - } +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 16); +} - return MCDisassembler::Fail; +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); } -static DecodeStatus DecodeSysRegOperand(const A64SysReg::SysRegMapper &Mapper, - llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { - bool ValidNamed; - Mapper.toString(Val, ValidNamed); +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 8); +} - Inst.addOperand(MCOperand::CreateImm(Val)); +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 64); +} - return ValidNamed ? MCDisassembler::Success : MCDisassembler::Fail; +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 32); } -static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { - return DecodeSysRegOperand(A64SysReg::MRSMapper(), Inst, Val, Address, - Decoder); +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 16); } -static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst, - unsigned Val, - uint64_t Address, - const void *Decoder) { - return DecodeSysRegOperand(A64SysReg::MSRMapper(), Inst, Val, Address, - Decoder); +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 8); } -static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst, - unsigned Insn, - uint64_t Address, +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, const void *Decoder) { - unsigned Rt = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned Imm9 = fieldFromInstruction(Insn, 12, 9); - - unsigned Opc = fieldFromInstruction(Insn, 22, 2); - unsigned V = fieldFromInstruction(Insn, 26, 1); - unsigned Size = fieldFromInstruction(Insn, 30, 2); - - if (Opc == 0 || (V == 1 && Opc == 2)) { - // It's a store, the MCInst gets: Rn_wb, Rt, Rn, Imm - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned shiftHi = fieldFromInstruction(insn, 22, 2); + unsigned shiftLo = fieldFromInstruction(insn, 10, 6); + unsigned shift = (shiftHi << 6) | shiftLo; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrs: + case AArch64::ADDSWrs: + case AArch64::SUBWrs: + case AArch64::SUBSWrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDWrs: + case AArch64::ANDSWrs: + case AArch64::BICWrs: + case AArch64::BICSWrs: + case AArch64::ORRWrs: + case AArch64::ORNWrs: + case AArch64::EORWrs: + case AArch64::EONWrs: { + // if sf == '0' and 
imm6<5> == '1' then ReservedValue() + if (shiftLo >> 5 == 1) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; } - - if (V == 0 && (Opc == 2 || Size == 3)) { - DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder); - } else if (V == 0) { - DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder); - } else if (V == 1 && (Opc & 2)) { - DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - } else { - switch (Size) { - case 0: - DecodeFPR8RegisterClass(Inst, Rt, Address, Decoder); - break; - case 1: - DecodeFPR16RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); - break; - } + case AArch64::ADDXrs: + case AArch64::ADDSXrs: + case AArch64::SUBXrs: + case AArch64::SUBSXrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDXrs: + case AArch64::ANDSXrs: + case AArch64::BICXrs: + case AArch64::BICSXrs: + case AArch64::ORRXrs: + case AArch64::ORNXrs: + case AArch64::EORXrs: + case AArch64::EONXrs: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; } - if (Opc != 0 && (V != 1 || Opc != 2)) { - // It's a load, the MCInst gets: Rt, Rn_wb, Rn, Imm - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - } + Inst.addOperand(MCOperand::CreateImm(shift)); + return Success; +} - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned imm = fieldFromInstruction(insn, 5, 16); + unsigned shift = fieldFromInstruction(insn, 21, 2); + shift <<= 4; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::MOVZWi: + case AArch64::MOVNWi: + case AArch64::MOVKWi: + if (shift & (1U << 5)) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + break; + case AArch64::MOVZXi: + case AArch64::MOVNXi: + case AArch64::MOVKXi: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + break; + } - Inst.addOperand(MCOperand::CreateImm(Imm9)); + if (Inst.getOpcode() == AArch64::MOVKWi || + Inst.getOpcode() == AArch64::MOVKXi) + Inst.addOperand(Inst.getOperand(0)); - // N.b. The official documentation says undpredictable if Rt == Rn, but this - // takes place at the architectural rather than encoding level: - // - // "STR xzr, [sp], #4" is perfectly valid. 
- if (V == 0 && Rt == Rn && Rn != 31) - return MCDisassembler::SoftFail; - else - return MCDisassembler::Success; + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm(shift)); + return Success; } -static MCDisassembler *createAArch64Disassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new AArch64Disassembler(STI, Ctx); -} +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned offset = fieldFromInstruction(insn, 10, 12); + const AArch64Disassembler *Dis = + static_cast(Decoder); -extern "C" void LLVMInitializeAArch64Disassembler() { - TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, - createAArch64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, - createAArch64Disassembler); -} + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFMui: + // Rt is an immediate in prefetch. + Inst.addOperand(MCOperand::CreateImm(Rt)); + break; + case AArch64::STRBBui: + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + case AArch64::STRHHui: + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + case AArch64::STRWui: + case AArch64::LDRWui: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::STRXui: + case AArch64::LDRXui: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRQui: + case AArch64::STRQui: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRDui: + case AArch64::STRDui: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSui: + case AArch64::STRSui: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRHui: + case AArch64::STRHui: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRBui: + case AArch64::STRBui: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } -template -static DecodeStatus -DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount, - uint64_t Address, const void *Decoder) { - bool IsLSL = false; - if (Ext == A64SE::LSL) - IsLSL = true; - else if (Ext != A64SE::MSL) - return MCDisassembler::Fail; - - // MSL and LSLH accepts encoded shift amount 0 or 1. - if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1) - return MCDisassembler::Fail; - - // LSL accepts encoded shift amount 0, 1, 2 or 3. - if (IsLSL && ShiftAmount > 3) - return MCDisassembler::Fail; - - Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); - return MCDisassembler::Success; + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(offset)); + return Success; } -// Decode post-index vector load/store instructions. -// This is necessary as we need to decode Rm: if Rm == 0b11111, the last -// operand is an immediate equal the the length of vector list in bytes, -// or Rm is decoded to a GPR64noxzr register. 
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned Rm = fieldFromInstruction(Insn, 16, 5); - unsigned Opcode = fieldFromInstruction(Insn, 12, 4); - unsigned IsLoad = fieldFromInstruction(Insn, 22, 1); - // 0 for 64bit vector list, 1 for 128bit vector list - unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1); +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + int64_t offset = fieldFromInstruction(insn, 12, 9); - unsigned NumVecs; - switch (Opcode) { - case 0: // ld4/st4 - case 2: // ld1/st1 with 4 vectors - NumVecs = 4; break; - case 4: // ld3/st3 - case 6: // ld1/st1 with 3 vectors - NumVecs = 3; break; - case 7: // ld1/st1 with 1 vector - NumVecs = 1; break; - case 8: // ld2/st2 - case 10: // ld1/st1 with 2 vectors - NumVecs = 2; break; + // offset is a 9-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (9 - 1))) + offset |= ~((1LL << 9) - 1); + + // First operand is always the writeback to the address register, if needed. + switch (Inst.getOpcode()) { default: - llvm_unreachable("Invalid opcode for post-index load/store instructions"); + break; + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; } - // Decode vector list of 1/2/3/4 vectors for load instructions. - if (IsLoad) { - switch (NumVecs) { - case 1: - Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder) - : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder) - : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder) - : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder); - break; - case 4: - Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder) - : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder); - break; - } + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFUMi: + // Rt is an immediate in prefetch. 
+ Inst.addOperand(MCOperand::CreateImm(Rt)); + break; + case AArch64::STURBBi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::STURHHi: + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + case AArch64::STURWi: + case AArch64::LDURWi: + case AArch64::LDTRSBWi: + case AArch64::LDTRSHWi: + case AArch64::STTRWi: + case AArch64::LDTRWi: + case AArch64::STTRHi: + case AArch64::LDTRHi: + case AArch64::LDTRBi: + case AArch64::STTRBi: + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::LDURXi: + case AArch64::LDTRSBXi: + case AArch64::LDTRSHXi: + case AArch64::LDTRSWi: + case AArch64::STTRXi: + case AArch64::LDTRXi: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURDi: + case AArch64::STURDi: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSi: + case AArch64::STURSi: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURHi: + case AArch64::STURHi: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURBi: + case AArch64::STURBi: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; } - // Decode write back register, which is equal to Rn. - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - - if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte - Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8))); - else // Decode Rm - DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); - - // Decode vector list of 1/2/3/4 vectors for load instructions. - if (!IsLoad) { - switch (NumVecs) { - case 1: - Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder) - : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder) - : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder) - : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder); - break; - case 4: - Is128BitVec ? 
DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder) - : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder); - break; - } - } + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); + + bool IsLoad = fieldFromInstruction(insn, 22, 1); + bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; + bool IsFP = fieldFromInstruction(insn, 26, 1); + + // Cannot write back to a transfer register (but xzr != sp). + if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) + return SoftFail; - return MCDisassembler::Success; + return Success; } -// Decode post-index vector load/store lane instructions. -// This is necessary as we need to decode Rm: if Rm == 0b11111, the last -// operand is an immediate equal the the length of the changed bytes, -// or Rm is decoded to a GPR64noxzr register. -static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, const void *Decoder) { - bool Is64bitVec = false; - bool IsLoadDup = false; - bool IsLoad = false; - // The total number of bytes transferred. - // TransferBytes = NumVecs * OneLaneBytes - unsigned TransferBytes = 0; - unsigned NumVecs = 0; - unsigned Opc = Inst.getOpcode(); - switch (Opc) { - case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: - case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: - case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: - case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: { - switch (Opc) { - case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register: - TransferBytes = 1; break; - case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register: - TransferBytes = 2; break; - case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register: - TransferBytes = 4; break; - case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: - TransferBytes = 8; break; - } - Is64bitVec = true; - IsLoadDup = true; - NumVecs = 1; - break; - } + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + unsigned Rs = fieldFromInstruction(insn, 16, 5); - case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: - case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: - case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: - case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: { - switch (Opc) { - case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register: - TransferBytes = 1; break; - case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register: - TransferBytes = 2; break; - case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register: - TransferBytes = 4; break; - case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: - TransferBytes = 8; break; - } - IsLoadDup = true; - NumVecs = 1; + unsigned Opcode = Inst.getOpcode(); + switch (Opcode) { + default: + return Fail; + case AArch64::STLXRW: + case AArch64::STLXRB: + case AArch64::STLXRH: + case AArch64::STXRW: + case AArch64::STXRB: + case AArch64::STXRH: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARW: + case AArch64::LDARB: + case AArch64::LDARH: + case AArch64::LDAXRW: + case AArch64::LDAXRB: + case AArch64::LDAXRH: + case AArch64::LDXRW: + case AArch64::LDXRB: + case AArch64::LDXRH: + case AArch64::STLRW: + 
case AArch64::STLRB: + case AArch64::STLRH: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); break; - } - - case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: - case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: - case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: - case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: { - switch (Opc) { - case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register: - TransferBytes = 2; break; - case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register: - TransferBytes = 4; break; - case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register: - TransferBytes = 8; break; - case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: - TransferBytes = 16; break; - } - Is64bitVec = true; - IsLoadDup = true; - NumVecs = 2; + case AArch64::STLXRX: + case AArch64::STXRX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARX: + case AArch64::LDAXRX: + case AArch64::LDXRX: + case AArch64::STLRX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); break; - } - - case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: - case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: - case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: - case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: { - switch (Opc) { - case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register: - TransferBytes = 2; break; - case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register: - TransferBytes = 4; break; - case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register: - TransferBytes = 8; break; - case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: - TransferBytes = 16; break; - } - IsLoadDup = true; - NumVecs = 2; + case AArch64::STLXPW: + case AArch64::STXPW: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPW: + case AArch64::LDXPW: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); break; - } - - case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: - case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: - case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: - case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: { - switch (Opc) { - case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register: - TransferBytes = 3; break; - case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register: - TransferBytes = 6; break; - case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register: - TransferBytes = 12; break; - case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: - TransferBytes = 24; break; - } - Is64bitVec = true; - IsLoadDup = true; - NumVecs = 3; + case AArch64::STLXPX: + case AArch64::STXPX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPX: + case AArch64::LDXPX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); break; } - case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register: - case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register: - case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register: - case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: { - switch (Opc) { - case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register: - TransferBytes = 3; 
break; - case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register: - TransferBytes = 6; break; - case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register: - TransferBytes = 12; break; - case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: - TransferBytes = 24; break; - } - IsLoadDup = true; - NumVecs = 3; - break; - } + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: - case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: - case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: - case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: { - switch (Opc) { - case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: - TransferBytes = 4; break; - case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: - TransferBytes = 8; break; - case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: - TransferBytes = 16; break; - case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: - TransferBytes = 32; break; - } - Is64bitVec = true; - IsLoadDup = true; - NumVecs = 4; - break; - } + // You shouldn't load to the same register twice in an instruction... + if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW || + Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) && + Rt == Rt2) + return SoftFail; - case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: - case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register: - case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register: - case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: { - switch (Opc) { - case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: - TransferBytes = 4; break; - case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register: - TransferBytes = 8; break; - case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register: - TransferBytes = 16; break; - case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: - TransferBytes = 32; break; - } - IsLoadDup = true; - NumVecs = 4; - break; - } + return Success; +} - case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: - case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: - case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: - case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: { - switch (Opc) { - case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: - TransferBytes = 1; break; - case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: - TransferBytes = 2; break; - case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: - TransferBytes = 4; break; - case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: - TransferBytes = 8; break; - } - IsLoad = true; - NumVecs = 1; - break; - } +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + int64_t offset = fieldFromInstruction(insn, 15, 7); + bool IsLoad = fieldFromInstruction(insn, 22, 1); - case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: - case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: - case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: - case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: { - switch 
(Opc) { - case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: - TransferBytes = 2; break; - case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: - TransferBytes = 4; break; - case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: - TransferBytes = 8; break; - case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: - TransferBytes = 16; break; - } - IsLoad = true; - NumVecs = 2; - break; - } + // offset is a 7-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (7 - 1))) + offset |= ~((1LL << 7) - 1); - case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: - case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: - case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: - case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: { - switch (Opc) { - case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: - TransferBytes = 3; break; - case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: - TransferBytes = 6; break; - case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: - TransferBytes = 12; break; - case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: - TransferBytes = 24; break; - } - IsLoad = true; - NumVecs = 3; - break; - } + unsigned Opcode = Inst.getOpcode(); + bool NeedsDisjointWritebackTransfer = false; - case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: - case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: - case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: - case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: { - switch (Opc) { - case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: - TransferBytes = 4; break; - case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: - TransferBytes = 8; break; - case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: - TransferBytes = 16; break; - case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: - TransferBytes = 32; break; - } - IsLoad = true; - NumVecs = 4; + // First operand is always writeback of base register. 
+ switch (Opcode) { + default: break; - } - - case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: - case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: - case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: - case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: { - switch (Opc) { - case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: - TransferBytes = 1; break; - case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: - TransferBytes = 2; break; - case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: - TransferBytes = 4; break; - case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: - TransferBytes = 8; break; - } - NumVecs = 1; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQpre: + case AArch64::STPQpre: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDpre: + case AArch64::STPDpre: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); break; } - case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: - case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: - case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: - case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: { - switch (Opc) { - case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: - TransferBytes = 2; break; - case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: - TransferBytes = 4; break; - case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: - TransferBytes = 8; break; - case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: - TransferBytes = 16; break; - } - NumVecs = 2; + switch (Opcode) { + default: + return Fail; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPXi: + case AArch64::STNPXi: + case AArch64::LDPXi: + case AArch64::STPXi: + case AArch64::LDPSWi: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); break; - } - - case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: - case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: - case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: - case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: { - switch (Opc) { - case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: - TransferBytes = 3; break; - case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: - TransferBytes = 6; break; - case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: - TransferBytes = 12; break; - case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: - TransferBytes = 24; break; - } - NumVecs = 3; + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPWi: + case AArch64::STNPWi: + case AArch64::LDPWi: + case AArch64::STPWi: + 
DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); break; - } - - case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: - case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: - case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: - case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: { - switch (Opc) { - case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: - TransferBytes = 4; break; - case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: - TransferBytes = 8; break; - case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: - TransferBytes = 16; break; - case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: - TransferBytes = 32; break; - } - NumVecs = 4; + case AArch64::LDNPQi: + case AArch64::STNPQi: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDPQpre: + case AArch64::STPQpre: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPDi: + case AArch64::STNPDi: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDi: + case AArch64::STPDi: + case AArch64::LDPDpre: + case AArch64::STPDpre: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPSi: + case AArch64::STNPSi: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSi: + case AArch64::STPSi: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); break; } - default: - return MCDisassembler::Fail; - } // End of switch (Opc) + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); - unsigned Rt = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned Rm = fieldFromInstruction(Insn, 16, 5); - - // Decode post-index of load duplicate lane - if (IsLoadDup) { - switch (NumVecs) { - case 1: - Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder) - : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder) - : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder) - : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); - break; - case 4: - Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder) - : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); - } - - // Decode write back register, which is equal to Rn. - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - - if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes - Inst.addOperand(MCOperand::CreateImm(TransferBytes)); - else // Decode Rm - DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); - - return MCDisassembler::Success; - } + // You shouldn't load to the same register twice in an instruction... + if (IsLoad && Rt == Rt2) + return SoftFail; - // Decode post-index of load/store lane - // Loads have a vector list as output. 
- if (IsLoad) { - switch (NumVecs) { - case 1: - DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); - break; - case 4: - DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); - } - } + // ... or do any operation that writes-back to a transfer register. But note + // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. + if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) + return SoftFail; + + return Success; +} - // Decode write back register, which is equal to Rn. - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned extend = fieldFromInstruction(insn, 10, 6); - if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes - Inst.addOperand(MCOperand::CreateImm(TransferBytes)); - else // Decode Rm - DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + unsigned shift = extend & 0x7; + if (shift > 4) + return Fail; - // Decode the source vector list. - switch (NumVecs) { - case 1: - DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrx: + case AArch64::SUBWrx: + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 3: - DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + case AArch64::ADDSWrx: + case AArch64::SUBSWrx: + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 4: - DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); - } - - // Decode lane - unsigned Q = fieldFromInstruction(Insn, 30, 1); - unsigned S = fieldFromInstruction(Insn, 10, 3); - unsigned lane = 0; - // Calculate the number of lanes by number of vectors and transferred bytes. - // NumLanes = 16 bytes / bytes of each lane - unsigned NumLanes = 16 / (TransferBytes / NumVecs); - switch (NumLanes) { - case 16: // A vector has 16 lanes, each lane is 1 bytes. 
- lane = (Q << 3) | S; + case AArch64::ADDXrx: + case AArch64::SUBXrx: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 8: - lane = (Q << 2) | (S >> 1); + case AArch64::ADDSXrx: + case AArch64::SUBSXrx: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 4: - lane = (Q << 1) | (S >> 2); + case AArch64::ADDXrx64: + case AArch64::SUBXrx64: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); break; - case 2: - lane = Q; + case AArch64::SUBSXrx64: + case AArch64::ADDSXrx64: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); break; } - Inst.addOperand(MCOperand::CreateImm(lane)); - return MCDisassembler::Success; + Inst.addOperand(MCOperand::CreateImm(extend)); + return Success; } -static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned size = fieldFromInstruction(Insn, 22, 2); - unsigned Q = fieldFromInstruction(Insn, 30, 1); +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + unsigned imm; + + if (Datasize) { + if (Inst.getOpcode() == AArch64::ANDSXri) + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 13); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) + return Fail; + } else { + if (Inst.getOpcode() == AArch64::ANDSWri) + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 12); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32)) + return Fail; + } + Inst.addOperand(MCOperand::CreateImm(imm)); + return Success; +} - DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); - if(Q) - DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + if (Inst.getOpcode() == AArch64::MOVID) + DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); else - DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::CreateImm(imm)); - switch (size) { - case 0: - Inst.addOperand(MCOperand::CreateImm(8)); + switch (Inst.getOpcode()) { + default: break; - case 1: - Inst.addOperand(MCOperand::CreateImm(16)); + case AArch64::MOVIv4i16: + case AArch64::MOVIv8i16: + case AArch64::MVNIv4i16: + case AArch64::MVNIv8i16: + case AArch64::MOVIv2i32: 
+ case AArch64::MOVIv4i32: + case AArch64::MVNIv2i32: + case AArch64::MVNIv4i32: + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); break; - case 2: - Inst.addOperand(MCOperand::CreateImm(32)); + case AArch64::MOVIv2s_msl: + case AArch64::MOVIv4s_msl: + case AArch64::MVNIv2s_msl: + case AArch64::MVNIv4s_msl: + Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); break; - default : - return MCDisassembler::Fail; } - return MCDisassembler::Success; + + return Success; } +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + // Tied operands added twice. + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); + + return Success; +} + +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; + imm |= fieldFromInstruction(insn, 29, 2); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 21-bit immediate. + if (imm & (1 << (21 - 1))) + imm |= ~((1LL << 21) - 1); + + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Imm = fieldFromInstruction(insn, 10, 14); + unsigned S = fieldFromInstruction(insn, 29, 1); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + + unsigned ShifterVal = (Imm >> 12) & 3; + unsigned ImmVal = Imm & 0xFFF; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + if (ShifterVal != 0 && ShifterVal != 1) + return Fail; + + if (Datasize) { + if (Rd == 31 && !S) + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + } else { + if (Rd == 31 && !S) + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + } + + if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); + return Success; +} + +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + int64_t imm = fieldFromInstruction(insn, 0, 26); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 26-bit immediate.
+ if (imm & (1 << (26 - 1))) + imm |= ~((1LL << 26) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + uint64_t op1 = fieldFromInstruction(insn, 16, 3); + uint64_t op2 = fieldFromInstruction(insn, 5, 3); + uint64_t crm = fieldFromInstruction(insn, 8, 4); + + uint64_t pstate_field = (op1 << 3) | op2; + + Inst.addOperand(MCOperand::CreateImm(pstate_field)); + Inst.addOperand(MCOperand::CreateImm(crm)); + + bool ValidNamed; + (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + uint64_t Rt = fieldFromInstruction(insn, 0, 5); + uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; + bit |= fieldFromInstruction(insn, 19, 5); + int64_t dst = fieldFromInstruction(insn, 5, 14); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend 14-bit immediate. + if (dst & (1 << (14 - 1))) + dst |= ~((1LL << 14) - 1); + + if (fieldFromInstruction(insn, 31, 1) == 0) + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(bit)); + if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(dst)); + + return Success; +} diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h similarity index 75% rename from lib/Target/ARM64/Disassembler/ARM64Disassembler.h rename to lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 8989925f36b8..68d4867977b0 100644 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -1,4 +1,4 @@ -//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===// +//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef ARM64DISASSEMBLER_H -#define ARM64DISASSEMBLER_H +#ifndef AArch64DISASSEMBLER_H +#define AArch64DISASSEMBLER_H #include "llvm/MC/MCDisassembler.h" @@ -21,12 +21,12 @@ class MCInst; class MemoryObject; class raw_ostream; -class ARM64Disassembler : public MCDisassembler { +class AArch64Disassembler : public MCDisassembler { public: - ARM64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - ~ARM64Disassembler() {} + ~AArch64Disassembler() {} /// getInstruction - See MCDisassembler.
MCDisassembler::DecodeStatus diff --git a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp similarity index 86% rename from lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp rename to lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 2f8e516d185d..24663684a3fd 100644 --- a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -1,4 +1,4 @@ -//===- ARM64ExternalSymbolizer.cpp - Symbolizer for ARM64 -------*- C++ -*-===// +//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,10 +7,10 @@ // //===----------------------------------------------------------------------===// -#include "ARM64ExternalSymbolizer.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -19,7 +19,7 @@ using namespace llvm; -#define DEBUG_TYPE "arm64-disassembler" +#define DEBUG_TYPE "aarch64-disassembler" static MCSymbolRefExpr::VariantKind getVariant(uint64_t LLVMDisassembler_VariantKind) { @@ -58,14 +58,9 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) { /// a symbol look up is done to see it is returns a specific reference type /// to add to the comment stream. This function returns Success if it adds /// an operand to the MCInst and Fail otherwise. -bool ARM64ExternalSymbolizer::tryAddingSymbolicOperand( - MCInst &MI, - raw_ostream &CommentStream, - int64_t Value, - uint64_t Address, - bool IsBranch, - uint64_t Offset, - uint64_t InstSize) { +bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( + MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t InstSize) { // FIXME: This method shares a lot of code with // MCExternalSymbolizer::tryAddingSymbolicOperand. 
It may be possible // refactor the MCExternalSymbolizer interface to allow more of this @@ -94,7 +89,7 @@ bool ARM64ExternalSymbolizer::tryAddingSymbolicOperand( else if (ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_Message) CommentStream << "Objc message: " << ReferenceName; - } else if (MI.getOpcode() == ARM64::ADRP) { + } else if (MI.getOpcode() == AArch64::ADRP) { ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; // otool expects the fully encoded ADRP instruction to be passed in as // the value here, so reconstruct it: @@ -107,19 +102,19 @@ bool ARM64ExternalSymbolizer::tryAddingSymbolicOperand( &ReferenceName); CommentStream << format("0x%llx", 0xfffffffffffff000LL & (Address + Value)); - } else if (MI.getOpcode() == ARM64::ADDXri || - MI.getOpcode() == ARM64::LDRXui || - MI.getOpcode() == ARM64::LDRXl || - MI.getOpcode() == ARM64::ADR) { - if (MI.getOpcode() == ARM64::ADDXri) + } else if (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRXl || + MI.getOpcode() == AArch64::ADR) { + if (MI.getOpcode() == AArch64::ADDXri) ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; - else if (MI.getOpcode() == ARM64::LDRXui) + else if (MI.getOpcode() == AArch64::LDRXui) ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; - if (MI.getOpcode() == ARM64::LDRXl) { + if (MI.getOpcode() == AArch64::LDRXl) { ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, &ReferenceName); - } else if (MI.getOpcode() == ARM64::ADR) { + } else if (MI.getOpcode() == AArch64::ADR) { ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, &ReferenceName); @@ -128,7 +123,7 @@ bool ARM64ExternalSymbolizer::tryAddingSymbolicOperand( // otool expects the fully encoded ADD/LDR instruction to be passed in // as the value here, so reconstruct it: unsigned EncodedInst = - MI.getOpcode() == ARM64::ADDXri ? 0x91000000: 0xF9400000; + MI.getOpcode() == AArch64::ADDXri ? 0x91000000: 0xF9400000; EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] EncodedInst |= MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn diff --git a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h similarity index 50% rename from lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h rename to lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h index 45f07a5e2587..171d31c48cd7 100644 --- a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -1,4 +1,4 @@ -//===- ARM64ExternalSymbolizer.h - Symbolizer for ARM64 ---------*- C++ -*-===// +//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,25 +7,26 @@ // //===----------------------------------------------------------------------===// // -// Symbolize ARM64 assembly code during disassembly using callbacks. +// Symbolize AArch64 assembly code during disassembly using callbacks. 
// //===----------------------------------------------------------------------===// -#ifndef ARM64EXTERNALSYMBOLIZER_H -#define ARM64EXTERNALSYMBOLIZER_H +#ifndef AArch64EXTERNALSYMBOLIZER_H +#define AArch64EXTERNALSYMBOLIZER_H #include "llvm/MC/MCExternalSymbolizer.h" namespace llvm { -class ARM64ExternalSymbolizer : public MCExternalSymbolizer { +class AArch64ExternalSymbolizer : public MCExternalSymbolizer { public: - ARM64ExternalSymbolizer(MCContext &Ctx, - std::unique_ptr<MCRelocationInfo> RelInfo, - LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo) - : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, - DisInfo) {} + AArch64ExternalSymbolizer(MCContext &Ctx, + std::unique_ptr<MCRelocationInfo> RelInfo, + LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo) + : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, + DisInfo) {} bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, bool IsBranch, diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt index 21baf250af86..d64c05b0adbc 100644 --- a/lib/Target/AArch64/Disassembler/CMakeLists.txt +++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt @@ -1,3 +1,8 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + add_llvm_library(LLVMAArch64Disassembler AArch64Disassembler.cpp + AArch64ExternalSymbolizer.cpp ) + +add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen) diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt index 05c4ed1646b9..a4224f4a2f53 100644 --- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt +++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ----------*- Conf -*--===; +;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile index 5c861207f836..741bb817a633 100644 --- a/lib/Target/AArch64/Disassembler/Makefile +++ b/lib/Target/AArch64/Disassembler/Makefile @@ -10,7 +10,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64Disassembler -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' arm target directory to grab private headers CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d9571238a033..8a21f0650cdc 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -12,15 +12,15 @@ //===----------------------------------------------------------------------===// #include "AArch64InstPrinter.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/MC/MCExpr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -28,522 +28,1289 @@ using namespace llvm; #define GET_INSTRUCTION_NAME #define PRINT_ALIAS_INSTR #include "AArch64GenAsmWriter.inc" - -static int64_t unpackSignedImm(int BitWidth, uint64_t Value) { - assert(!(Value & ~((1ULL << BitWidth)-1)) && "immediate not n-bit"); - if (Value & (1ULL << (BitWidth - 1))) - return static_cast(Value) - (1LL << BitWidth); - else - return Value; -} +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter1.inc" AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) : - MCInstPrinter(MAI, MII, MRI) { + const MCSubtargetInfo &STI) + : MCInstPrinter(MAI, MII, MRI) { // Initialize the set of available features. setAvailableFeatures(STI.getFeatureBits()); } +AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : AArch64InstPrinter(MAI, MII, MRI, STI) {} + void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // This is for .cfi directives. OS << getRegisterName(RegNo); } -void -AArch64InstPrinter::printOffsetSImm9Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O) { - const MCOperand &MOImm = MI->getOperand(OpNum); - int32_t Imm = unpackSignedImm(9, MOImm.getImm()); - - O << '#' << Imm; -} - -void -AArch64InstPrinter::printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, unsigned MemSize, - unsigned RmSize) { - unsigned ExtImm = MI->getOperand(OpNum).getImm(); - unsigned OptionHi = ExtImm >> 1; - unsigned S = ExtImm & 1; - bool IsLSL = OptionHi == 1 && RmSize == 64; - - const char *Ext; - switch (OptionHi) { - case 1: - Ext = (RmSize == 32) ? "uxtw" : "lsl"; - break; - case 3: - Ext = (RmSize == 32) ? "sxtw" : "sxtx"; - break; - default: - llvm_unreachable("Incorrect Option on load/store (reg offset)"); - } - O << Ext; - - if (S) { - unsigned ShiftAmt = Log2_32(MemSize); - O << " #" << ShiftAmt; - } else if (IsLSL) { - O << " #0"; - } -} - -void -AArch64InstPrinter::printAddSubImmLSL0Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O) { - const MCOperand &Imm12Op = MI->getOperand(OpNum); +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + // Check for special encodings and print the canonical alias instead. 
- if (Imm12Op.isImm()) { - int64_t Imm12 = Imm12Op.getImm(); - assert(Imm12 >= 0 && "Invalid immediate for add/sub imm"); - O << "#" << Imm12; - } else { - assert(Imm12Op.isExpr() && "Unexpected shift operand type"); - O << "#" << *Imm12Op.getExpr(); - } -} + unsigned Opcode = MI->getOpcode(); -void -AArch64InstPrinter::printAddSubImmLSL12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { + if (Opcode == AArch64::SYSxt) + if (printSysAlias(MI, O)) { + printAnnotation(O, Annot); + return; + } - printAddSubImmLSL0Operand(MI, OpNum, O); + // SBFM/UBFM should print to a nicer aliased form if possible. + if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || + Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { + const MCOperand &Op0 = MI->getOperand(0); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Op2 = MI->getOperand(2); + const MCOperand &Op3 = MI->getOperand(3); + + bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); + bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); + if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + + switch (Op3.getImm()) { + default: + break; + case 7: + if (IsSigned) + AsmMnemonic = "sxtb"; + else if (!Is64Bit) + AsmMnemonic = "uxtb"; + break; + case 15: + if (IsSigned) + AsmMnemonic = "sxth"; + else if (!Is64Bit) + AsmMnemonic = "uxth"; + break; + case 31: + // *xtw is only valid for signed 64-bit operations. + if (Is64Bit && IsSigned) + AsmMnemonic = "sxtw"; + break; + } + + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); + printAnnotation(O, Annot); + return; + } + } - O << ", lsl #12"; -} + // All immediate shifts are aliases, implemented using the Bitfield + // instruction. In all cases the immediate shift amount shift must be in + // the range 0 to (reg.size -1). + if (Op2.isImm() && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + int shift = 0; + int64_t immr = Op2.getImm(); + int64_t imms = Op3.getImm(); + if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { + AsmMnemonic = "lsl"; + shift = 31 - imms; + } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && + ((imms + 1 == immr))) { + AsmMnemonic = "lsl"; + shift = 63 - imms; + } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { + AsmMnemonic = "asr"; + shift = immr; + } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { + AsmMnemonic = "asr"; + shift = immr; + } + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; + printAnnotation(O, Annot); + return; + } + } -void -AArch64InstPrinter::printBareImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - O << MO.getImm(); -} + // SBFIZ/UBFIZ aliases + if (Op2.getImm() > Op3.getImm()) { + O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << (Is64Bit ? 
64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; + printAnnotation(O, Annot); + return; + } -template <unsigned RegWidth> void -AArch64InstPrinter::printBFILSBOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ImmROp = MI->getOperand(OpNum); - unsigned LSB = ImmROp.getImm() == 0 ? 0 : RegWidth - ImmROp.getImm(); + // Otherwise SBFX/UBFX is the preferred form + O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; + printAnnotation(O, Annot); + return; + } - O << '#' << LSB; -} + if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { + const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 + const MCOperand &Op2 = MI->getOperand(2); + int ImmR = MI->getOperand(3).getImm(); + int ImmS = MI->getOperand(4).getImm(); + + // BFI alias + if (ImmS < ImmR) { + int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " + << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } -void AArch64InstPrinter::printBFIWidthOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ImmSOp = MI->getOperand(OpNum); - unsigned Width = ImmSOp.getImm() + 1; + int LSB = ImmR; + int Width = ImmS - ImmR + 1; + // Otherwise BFXIL the preferred form + O << "\tbfxil\t" + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } - O << '#' << Width; -} + // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift + // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be + // printed.
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || + Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isExpr()) { + if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) + O << "\tmovz\t"; + else + O << "\tmovn\t"; -void -AArch64InstPrinter::printBFXWidthOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ImmSOp = MI->getOperand(OpNum); - const MCOperand &ImmROp = MI->getOperand(OpNum - 1); + O << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(1).getExpr(); + return; + } - unsigned ImmR = ImmROp.getImm(); - unsigned ImmS = ImmSOp.getImm(); + if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && + MI->getOperand(2).isExpr()) { + O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(2).getExpr(); + return; + } - assert(ImmS >= ImmR && "Invalid ImmR, ImmS combination for bitfield extract"); + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); - O << '#' << (ImmS - ImmR + 1); + printAnnotation(O, Annot); } -void -AArch64InstPrinter::printCRxOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &CRx = MI->getOperand(OpNum); +static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, + bool &IsTbx) { + switch (Opcode) { + case AArch64::TBXv8i8One: + case AArch64::TBXv8i8Two: + case AArch64::TBXv8i8Three: + case AArch64::TBXv8i8Four: + IsTbx = true; + Layout = ".8b"; + return true; + case AArch64::TBLv8i8One: + case AArch64::TBLv8i8Two: + case AArch64::TBLv8i8Three: + case AArch64::TBLv8i8Four: + IsTbx = false; + Layout = ".8b"; + return true; + case AArch64::TBXv16i8One: + case AArch64::TBXv16i8Two: + case AArch64::TBXv16i8Three: + case AArch64::TBXv16i8Four: + IsTbx = true; + Layout = ".16b"; + return true; + case AArch64::TBLv16i8One: + case AArch64::TBLv16i8Two: + case AArch64::TBLv16i8Three: + case AArch64::TBLv16i8Four: + IsTbx = false; + Layout = ".16b"; + return true; + default: + return false; + } +} - O << 'c' << CRx.getImm(); +struct LdStNInstrDesc { + unsigned Opcode; + const char *Mnemonic; + const char *Layout; + int ListOperand; + bool HasLane; + int NaturalOffset; +}; + +static LdStNInstrDesc LdStNInstInfo[] = { + { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, + { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, + { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, + { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, + { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, + { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, + { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, + { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, + { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, + { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, + { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, + { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, + { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, + { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, + { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, + { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, + { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, + { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, + { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, + { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, + { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, + { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, + { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, + { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, + { AArch64::LD1Onev16b, 
"ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, + { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, + { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, + { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, + { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, + { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, + { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, + { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, + { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, + { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, + { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, + { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, + { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, + { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, + { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, + { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, + { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, + { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, + { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, + { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, + { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, + { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, + { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, + { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, + { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, + { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, + { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, + { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, + { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, + { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, + { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 
32 }, + { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, + { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, + { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, + { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, + { AArch64::LD2i64, "ld2", ".d", 1, true, 0 }, + { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, + { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, + { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, + { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, + { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, + { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, + { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, + { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, + { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, + { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, + { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, + { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, + { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, + { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, + { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, + { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, + { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, + { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, + { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, + { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, + { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, + { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, + { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, + { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, + { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, + { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, + { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, + { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, + { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, + { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, + { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, + { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, + { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, + { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, + { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, + { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, + { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, + { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, + { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, + { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, + { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, + { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, + { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, + { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, + { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, + { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, + { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, + { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, + { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, + { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, + { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, + { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, + { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, + { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, + { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, + { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, + { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, + { 
AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, + { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, + { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, + { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, + { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, + { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, + { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, + { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, + { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, + { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, + { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, + { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, + { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, + { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, + { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, + { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, + { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, + { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, + { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, + { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, + { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, + { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, + { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, + { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, + { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, + { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, + { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, + { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, + { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, + { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, + { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, + { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, + { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, + { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, + { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, + { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, + { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, + { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, + { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, + { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, + { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, + { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, + { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, + { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, + { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, + { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, + { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, + { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, + { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, + { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, + { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, + { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, + { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, + { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, + { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, + { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, + { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, + { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, + { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Onev2s, "st1", ".2s", 0, 
false, 0 }, + { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, + { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, + { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, + { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, + { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, + { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, + { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, + { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, + { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, + { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, + { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, + { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, + { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, + { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, + { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, + { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, + { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, + { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, + { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, + { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, + { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, + { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, + { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, + { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, + { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, + { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, + { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, + { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, + { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, + { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, + { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, + { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, + { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, + { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, + { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, + { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, + { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 }, + { AArch64::ST2i16_POST, 
"st2", ".h", 1, true, 4 }, + { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, + { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, + { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, + { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, + { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, + { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, + { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, + { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, + { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, + { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, + { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, + { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, + { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, + { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, + { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, + { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, + { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, + { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, + { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, + { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, + { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, + { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, + { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, + { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, + { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, + { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, + { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, + { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, + { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, + { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, + { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, + { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, + { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, + { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, + { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, + { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, + { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, + { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, + { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, + { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, + { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, + { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, + { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, + { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, + { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, + { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, + { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, + { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, + { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, + { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, + { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, + { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, + { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, + { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, + { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, + { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, + { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, + { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, + { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, + { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, +}; + +static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { + unsigned Idx; + for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) + if (LdStNInstInfo[Idx].Opcode == Opcode) + return 
&LdStNInstInfo[Idx]; + + return nullptr; } +void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + unsigned Opcode = MI->getOpcode(); + StringRef Layout, Mnemonic; -void -AArch64InstPrinter::printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ScaleOp = MI->getOperand(OpNum); + bool IsTbx; + if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { + O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' + << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; - O << '#' << (64 - ScaleOp.getImm()); -} + unsigned ListOpNum = IsTbx ? 2 : 1; + printVectorList(MI, ListOpNum, O, ""); + O << ", " + << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); + printAnnotation(O, Annot); + return; + } -void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &o) { - const MCOperand &MOImm8 = MI->getOperand(OpNum); + if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; + + // Now onto the operands: first a vector list with possible lane + // specifier. E.g. { v0 }[2] + int OpNum = LdStDesc->ListOperand; + printVectorList(MI, OpNum++, O, ""); + + if (LdStDesc->HasLane) + O << '[' << MI->getOperand(OpNum++).getImm() << ']'; + + // Next the address: [xN] + unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + O << ", [" << getRegisterName(AddrReg) << ']'; + + // Finally, there might be a post-indexed offset. + if (LdStDesc->NaturalOffset != 0) { + unsigned Reg = MI->getOperand(OpNum++).getReg(); + if (Reg != AArch64::XZR) + O << ", " << getRegisterName(Reg); + else { + assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); + O << ", #" << LdStDesc->NaturalOffset; + } + } + + printAnnotation(O, Annot); + return; + } - assert(MOImm8.isImm() - && "Immediate operand required for floating-point immediate inst"); + AArch64InstPrinter::printInst(MI, O, Annot); +} - uint32_t Imm8 = MOImm8.getImm(); - uint32_t Fraction = Imm8 & 0xf; - uint32_t Exponent = (Imm8 >> 4) & 0x7; - uint32_t Negative = (Imm8 >> 7) & 0x1; +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); +#endif + + const char *Asm = nullptr; + const MCOperand &Op1 = MI->getOperand(0); + const MCOperand &Cn = MI->getOperand(1); + const MCOperand &Cm = MI->getOperand(2); + const MCOperand &Op2 = MI->getOperand(3); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + if (CnVal == 7) { + switch (CmVal) { + default: + break; + + // IC aliases + case 1: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tialluis"; + break; + case 5: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tiallu"; + else if (Op1Val == 3 && Op2Val == 1) + Asm = "ic\tivau"; + break; + + // DC aliases + case 4: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tzva"; + break; + case 6: + if (Op1Val == 0 && Op2Val == 1) + Asm = "dc\tivac"; + if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tisw"; + break; + case 10: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcsw"; + break; + case 11: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvau"; + break; + case 14: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcivac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcisw"; + 
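// --- Illustrative sketch (reviewer note, not part of the patch) ---
// How the LdStNInstrDesc fields above turn into text: a post-indexed ld1/st1
// prints the mnemonic+layout, the register list (with an optional lane), the
// base address, and either the offset register or the "natural" immediate when
// that register is XZR. MiniDesc/formatLdSt are hypothetical stand-ins; only
// the field meanings mirror the table.
#include <cstdio>
#include <string>

struct MiniDesc {
  const char *Mnemonic;   // "ld1", "st3", ...
  const char *Layout;     // ".16b", ".d", ...
  bool HasLane;           // lane-indexed form, e.g. ld1 { v0.b }[3]
  int NaturalOffset;      // immediate used when the offset register is xzr
};

static std::string formatLdSt(const MiniDesc &D, const char *List, int Lane,
                              const char *BaseReg, bool OffsetRegIsXZR,
                              const char *OffsetReg) {
  std::string S = std::string("\t") + D.Mnemonic + D.Layout + "\t" + List;
  if (D.HasLane)
    S += "[" + std::to_string(Lane) + "]";
  S += std::string(", [") + BaseReg + "]";
  if (D.NaturalOffset != 0) {
    if (!OffsetRegIsXZR)
      S += std::string(", ") + OffsetReg;       // explicit offset register
    else
      S += ", #" + std::to_string(D.NaturalOffset); // post-inc by element size
  }
  return S;
}

int main() {
  MiniDesc Ld1Onev16bPost = {"ld1", ".16b", false, 16};
  // -> "\tld1.16b\t{ v0 }, [x0], #16"
  std::printf("%s\n",
              formatLdSt(Ld1Onev16bPost, "{ v0 }", 0, "x0", true, "x2").c_str());
}
// --- end sketch ---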
break; + + // AT aliases + case 8: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1r"; break; + case 1: Asm = "at\ts1e1w"; break; + case 2: Asm = "at\ts1e0r"; break; + case 3: Asm = "at\ts1e0w"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e2r"; break; + case 1: Asm = "at\ts1e2w"; break; + case 4: Asm = "at\ts12e1r"; break; + case 5: Asm = "at\ts12e1w"; break; + case 6: Asm = "at\ts12e0r"; break; + case 7: Asm = "at\ts12e0w"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e3r"; break; + case 1: Asm = "at\ts1e3w"; break; + } + break; + } + break; + } + } else if (CnVal == 8) { + // TLBI aliases + switch (CmVal) { + default: + break; + case 3: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1is"; break; + case 1: Asm = "tlbi\tvae1is"; break; + case 2: Asm = "tlbi\taside1is"; break; + case 3: Asm = "tlbi\tvaae1is"; break; + case 5: Asm = "tlbi\tvale1is"; break; + case 7: Asm = "tlbi\tvaale1is"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2is"; break; + case 1: Asm = "tlbi\tvae2is"; break; + case 4: Asm = "tlbi\talle1is"; break; + case 5: Asm = "tlbi\tvale2is"; break; + case 6: Asm = "tlbi\tvmalls12e1is"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3is"; break; + case 1: Asm = "tlbi\tvae3is"; break; + case 5: Asm = "tlbi\tvale3is"; break; + } + break; + } + break; + case 0: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1is"; break; + case 5: Asm = "tlbi\tipas2le1is"; break; + } + break; + } + break; + case 4: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1"; break; + case 5: Asm = "tlbi\tipas2le1"; break; + } + break; + } + break; + case 7: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1"; break; + case 1: Asm = "tlbi\tvae1"; break; + case 2: Asm = "tlbi\taside1"; break; + case 3: Asm = "tlbi\tvaae1"; break; + case 5: Asm = "tlbi\tvale1"; break; + case 7: Asm = "tlbi\tvaale1"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2"; break; + case 1: Asm = "tlbi\tvae2"; break; + case 4: Asm = "tlbi\talle1"; break; + case 5: Asm = "tlbi\tvale2"; break; + case 6: Asm = "tlbi\tvmalls12e1"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3"; break; + case 1: Asm = "tlbi\tvae3"; break; + case 5: Asm = "tlbi\tvale3"; break; + } + break; + } + break; + } + } - float Val = 1.0f + Fraction / 16.0f; + if (Asm) { + unsigned Reg = MI->getOperand(4).getReg(); - // That is: - // 000 -> 2^1, 001 -> 2^2, 010 -> 2^3, 011 -> 2^4, - // 100 -> 2^-3, 101 -> 2^-2, 110 -> 2^-1, 111 -> 2^0 - if (Exponent & 0x4) { - Val /= 1 << (7 - Exponent); - } else { - Val *= 1 << (Exponent + 1); + O << '\t' << Asm; + if (StringRef(Asm).lower().find("all") == StringRef::npos) + O << ", " << getRegisterName(Reg); } - Val = Negative ? 
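// --- Illustrative sketch (reviewer note, not part of the patch) ---
// The SYS-alias logic above is effectively a lookup keyed on (op1, Cn, Cm, op2);
// this trimmed-down version covers a few of the entries actually present in the
// switch to show the shape. As in the printer, aliases whose name contains
// "all" take no register operand. sysAlias is a hypothetical name.
#include <cstdio>

static const char *sysAlias(unsigned Op1, unsigned Cn, unsigned Cm,
                            unsigned Op2) {
  if (Cn == 7) {
    if (Cm == 1 && Op1 == 0 && Op2 == 0) return "ic\tialluis"; // no register
    if (Cm == 5 && Op1 == 3 && Op2 == 1) return "ic\tivau";    // takes Xt
    if (Cm == 4 && Op1 == 3 && Op2 == 1) return "dc\tzva";     // takes Xt
  }
  if (Cn == 8 && Cm == 7 && Op1 == 0 && Op2 == 0)
    return "tlbi\tvmalle1";                                    // no register
  return nullptr; // fall back to the generic "sys #op1, cN, cM, #op2" form
}

int main() {
  // sys #3, c7, c5, #1, x0   prints as   ic ivau, x0
  std::printf("%s, x0\n", sysAlias(3, 7, 5, 1));
}
// --- end sketch ---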
-Val : Val; + return Asm != nullptr; +} - o << '#' << format("%.8f", Val); +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); + } } -void AArch64InstPrinter::printFPZeroOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &o) { - o << "#0.0"; +void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << format("#%#llx", Op.getImm()); } -void -AArch64InstPrinter::printCondCodeOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); +void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, + unsigned Imm, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Reg == AArch64::XZR) + O << "#" << Imm; + else + O << getRegisterName(Reg); + } else + llvm_unreachable("unknown operand kind in printPostIncOperand64"); +} - O << A64CondCodeToString(static_cast(MO.getImm())); +void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isReg() && "Non-register vreg operand!"); + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg, AArch64::vreg); } -void -AArch64InstPrinter::printInverseCondCodeOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - A64CC::CondCodes CC = - static_cast(MI->getOperand(OpNum).getImm()); - O << A64CondCodeToString(A64InvertCondCode(CC)); +void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); + O << "c" << Op.getImm(); } -template void -AArch64InstPrinter::printLabelOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { +void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); - - if (!MO.isImm()) { - printOperand(MI, OpNum, O); - return; + if (MO.isImm()) { + unsigned Val = (MO.getImm() & 0xfff); + assert(Val == MO.getImm() && "Add/sub immediate out of range!"); + unsigned Shift = + AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); + O << '#' << Val; + if (Shift != 0) + printShifter(MI, OpNum + 1, O); + + if (CommentStream) + *CommentStream << '=' << (Val << Shift) << '\n'; + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); + printShifter(MI, OpNum + 1, O); } +} - // The immediate of LDR (lit) instructions is a signed 19-bit immediate, which - // is multiplied by 4 (because all A64 instructions are 32-bits wide). 
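// --- Illustrative sketch (reviewer note, not part of the patch) ---
// printAddSubImm above keeps the 12-bit immediate and an optional "lsl #12",
// and writes the fully shifted value to the comment stream. formatAddSubImm is
// a hypothetical stand-in combining printAddSubImm + printShifter.
#include <cstdio>

static void formatAddSubImm(unsigned Imm12, unsigned Shift) {
  std::printf("#%u", Imm12);
  if (Shift != 0)
    std::printf(", lsl #%u", Shift);
  std::printf("\t// =%u\n", Imm12 << Shift);   // the CommentStream annotation
}

int main() {
  formatAddSubImm(1, 0);   // "#1            // =1"
  formatAddSubImm(1, 12);  // "#1, lsl #12   // =4096"
}
// --- end sketch ---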
- uint64_t UImm = MO.getImm(); - uint64_t Sign = UImm & (1LL << (field_width - 1)); - int64_t SImm = scale * ((UImm & ~Sign) - Sign); - - O << "#" << SImm; +void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32)); } -template void -AArch64InstPrinter::printLogicalImmOperand(const MCInst *MI, unsigned OpNum, +void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - uint64_t Val; - A64Imms::isLogicalImmBits(RegWidth, MO.getImm(), Val); + uint64_t Val = MI->getOperand(OpNum).getImm(); O << "#0x"; - O.write_hex(Val); + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64)); } -void -AArch64InstPrinter::printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, int MemSize) { - const MCOperand &MOImm = MI->getOperand(OpNum); +void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + // LSL #0 should not be printed. + if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL && + AArch64_AM::getShiftValue(Val) == 0) + return; + O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val)) + << " #" << AArch64_AM::getShiftValue(Val); +} - if (MOImm.isImm()) { - uint32_t Imm = MOImm.getImm() * MemSize; +void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printShifter(MI, OpNum + 1, O); +} - O << "#" << Imm; - } else { - O << "#" << *MOImm.getExpr(); +void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printArithExtend(MI, OpNum + 1, O); +} + +void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val); + unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val); + + // If the destination or first source register operand is [W]SP, print + // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at + // all. + if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && + ExtType == AArch64_AM::UXTX) || + ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && + ExtType == AArch64_AM::UXTW) ) { + if (ShiftVal != 0) + O << ", lsl #" << ShiftVal; + return; + } } + O << ", " << AArch64_AM::getShiftExtendName(ExtType); + if (ShiftVal != 0) + O << " #" << ShiftVal; } -void -AArch64InstPrinter::printShiftOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, - A64SE::ShiftExtSpecifiers Shift) { - const MCOperand &MO = MI->getOperand(OpNum); +void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O, char SrcRegKind, + unsigned Width) { + unsigned SignExtend = MI->getOperand(OpNum).getImm(); + unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); - // LSL #0 is not printed - if (Shift == A64SE::LSL && MO.isImm() && MO.getImm() == 0) - return; + // sxtw, sxtx, uxtw or lsl (== uxtx) + bool IsLSL = !SignExtend && SrcRegKind == 'x'; + if (IsLSL) + O << "lsl"; + else + O << (SignExtend ? 
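// --- Illustrative sketch (reviewer note, not part of the patch) ---
// The shift-printing policy of printShifter above, reduced to plain strings:
// "lsl #0" is never emitted, everything else becomes ", <op> #<amount>".
// shiftSuffix is a hypothetical helper used only for this demonstration.
#include <cstdio>
#include <string>

static std::string shiftSuffix(const std::string &Op, unsigned Amount) {
  if (Op == "lsl" && Amount == 0)
    return "";                                   // LSL #0 is implicit
  return ", " + Op + " #" + std::to_string(Amount);
}

int main() {
  // add x0, x1, x2           (lsl #0 suppressed)
  std::printf("add x0, x1, x2%s\n", shiftSuffix("lsl", 0).c_str());
  // add x0, x1, x2, lsl #3
  std::printf("add x0, x1, x2%s\n", shiftSuffix("lsl", 3).c_str());
}
// --- end sketch ---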
's' : 'u') << "xt" << SrcRegKind; - switch (Shift) { - case A64SE::LSL: O << "lsl"; break; - case A64SE::LSR: O << "lsr"; break; - case A64SE::ASR: O << "asr"; break; - case A64SE::ROR: O << "ror"; break; - default: llvm_unreachable("Invalid shift specifier in logical instruction"); - } + if (DoShift || IsLSL) + O << " #" << Log2_32(Width / 8); +} - O << " #" << MO.getImm(); +void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(CC); } -void -AArch64InstPrinter::printMoveWideImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &UImm16MO = MI->getOperand(OpNum); - const MCOperand &ShiftMO = MI->getOperand(OpNum + 1); +void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); +} - if (UImm16MO.isImm()) { - O << '#' << UImm16MO.getImm(); +void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; +} - if (ShiftMO.getImm() != 0) - O << ", lsl #" << (ShiftMO.getImm() * 16); +template +void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '#' << Scale * MI->getOperand(OpNum).getImm(); +} - return; +void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "#" << (MO.getImm() * Scale); + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); } - - O << "#" << *UImm16MO.getExpr(); } -void AArch64InstPrinter::printNamedImmOperand(const NamedImmMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - bool ValidName; - const MCOperand &MO = MI->getOperand(OpNum); - StringRef Name = Mapper.toString(MO.getImm(), ValidName); +void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO1 = MI->getOperand(OpNum + 1); + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MO1.isImm()) { + O << ", #" << (MO1.getImm() * Scale); + } else { + assert(MO1.isExpr() && "Unexpected operand type!"); + O << ", " << *MO1.getExpr(); + } + O << ']'; +} - if (ValidName) +void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned prfop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid); + if (Valid) O << Name; else - O << '#' << MO.getImm(); + O << '#' << prfop; } -void -AArch64InstPrinter::printSysRegOperand(const A64SysReg::SysRegMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O) { +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); + float FPImm = + MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); - bool ValidName; - std::string Name = Mapper.toString(MO.getImm(), ValidName); - if (ValidName) { - O << Name; - return; - } + // 8 decimal places are enough to perfectly represent permitted floats. 
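// --- Illustrative sketch (reviewer note, not part of the patch) ---
// The 8-bit FP immediate the new printFPImmOperand expands via
// AArch64_AM::getFPImmFloat: sign in bit 7, a 3-bit exponent in bits 6:4, a
// 4-bit fraction in bits 3:0. The arithmetic below mirrors the removed
// printFPImmOperand helper earlier in this hunk; decodeFPImm8 is a
// hypothetical name.
#include <cstdio>

static float decodeFPImm8(unsigned Imm8) {
  unsigned Fraction = Imm8 & 0xf;
  unsigned Exponent = (Imm8 >> 4) & 0x7;
  unsigned Negative = (Imm8 >> 7) & 0x1;

  float Val = 1.0f + Fraction / 16.0f;
  // 000 -> 2^1, 001 -> 2^2, ..., 011 -> 2^4, 100 -> 2^-3, ..., 111 -> 2^0
  if (Exponent & 0x4)
    Val /= 1 << (7 - Exponent);
  else
    Val *= 1 << (Exponent + 1);
  return Negative ? -Val : Val;
}

int main() {
  std::printf("#%.8f\n", decodeFPImm8(0x70)); // #1.00000000  (fmov ..., #1.0)
  std::printf("#%.8f\n", decodeFPImm8(0x00)); // #2.00000000
}
// --- end sketch ---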
+ O << format("#%.8f", FPImm); } +static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { + while (Stride--) { + switch (Reg) { + default: + llvm_unreachable("Vector register expected!"); + case AArch64::Q0: Reg = AArch64::Q1; break; + case AArch64::Q1: Reg = AArch64::Q2; break; + case AArch64::Q2: Reg = AArch64::Q3; break; + case AArch64::Q3: Reg = AArch64::Q4; break; + case AArch64::Q4: Reg = AArch64::Q5; break; + case AArch64::Q5: Reg = AArch64::Q6; break; + case AArch64::Q6: Reg = AArch64::Q7; break; + case AArch64::Q7: Reg = AArch64::Q8; break; + case AArch64::Q8: Reg = AArch64::Q9; break; + case AArch64::Q9: Reg = AArch64::Q10; break; + case AArch64::Q10: Reg = AArch64::Q11; break; + case AArch64::Q11: Reg = AArch64::Q12; break; + case AArch64::Q12: Reg = AArch64::Q13; break; + case AArch64::Q13: Reg = AArch64::Q14; break; + case AArch64::Q14: Reg = AArch64::Q15; break; + case AArch64::Q15: Reg = AArch64::Q16; break; + case AArch64::Q16: Reg = AArch64::Q17; break; + case AArch64::Q17: Reg = AArch64::Q18; break; + case AArch64::Q18: Reg = AArch64::Q19; break; + case AArch64::Q19: Reg = AArch64::Q20; break; + case AArch64::Q20: Reg = AArch64::Q21; break; + case AArch64::Q21: Reg = AArch64::Q22; break; + case AArch64::Q22: Reg = AArch64::Q23; break; + case AArch64::Q23: Reg = AArch64::Q24; break; + case AArch64::Q24: Reg = AArch64::Q25; break; + case AArch64::Q25: Reg = AArch64::Q26; break; + case AArch64::Q26: Reg = AArch64::Q27; break; + case AArch64::Q27: Reg = AArch64::Q28; break; + case AArch64::Q28: Reg = AArch64::Q29; break; + case AArch64::Q29: Reg = AArch64::Q30; break; + case AArch64::Q30: Reg = AArch64::Q31; break; + // Vector lists can wrap around. + case AArch64::Q31: + Reg = AArch64::Q0; + break; + } + } + return Reg; +} -void AArch64InstPrinter::printRegExtendOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O, - A64SE::ShiftExtSpecifiers Ext) { - // FIXME: In principle TableGen should be able to detect this itself far more - // easily. We will only accumulate more of these hacks. - unsigned Reg0 = MI->getOperand(0).getReg(); - unsigned Reg1 = MI->getOperand(1).getReg(); - - if (isStackReg(Reg0) || isStackReg(Reg1)) { - A64SE::ShiftExtSpecifiers LSLEquiv; +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + StringRef LayoutSuffix) { + unsigned Reg = MI->getOperand(OpNum).getReg(); - if (Reg0 == AArch64::XSP || Reg1 == AArch64::XSP) - LSLEquiv = A64SE::UXTX; - else - LSLEquiv = A64SE::UXTW; + O << "{ "; - if (Ext == LSLEquiv) { - O << "lsl #" << MI->getOperand(OpNum).getImm(); - return; - } + // Work out how many registers there are in the list (if there is an actual + // list). + unsigned NumRegs = 1; + if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQRegClassID).contains(Reg)) + NumRegs = 2; + else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg)) + NumRegs = 3; + else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg)) + NumRegs = 4; + + // Now forget about the list and find out what the first register is. + if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) + Reg = FirstReg; + else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) + Reg = FirstReg; + + // If it's a D-reg, we need to promote it to the equivalent Q-reg before + // printing (otherwise getRegisterName fails). 
+ if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) { + const MCRegisterClass &FPR128RC = + MRI.getRegClass(AArch64::FPR128RegClassID); + Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC); } - switch (Ext) { - case A64SE::UXTB: O << "uxtb"; break; - case A64SE::UXTH: O << "uxth"; break; - case A64SE::UXTW: O << "uxtw"; break; - case A64SE::UXTX: O << "uxtx"; break; - case A64SE::SXTB: O << "sxtb"; break; - case A64SE::SXTH: O << "sxth"; break; - case A64SE::SXTW: O << "sxtw"; break; - case A64SE::SXTX: O << "sxtx"; break; - default: llvm_unreachable("Unexpected shift type for printing"); + for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { + O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix; + if (i + 1 != NumRegs) + O << ", "; } - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.getImm() != 0) - O << " #" << MO.getImm(); + O << " }"; } -template void -AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOImm = MI->getOperand(OpNum); - int32_t Imm = unpackSignedImm(7, MOImm.getImm()); +void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + printVectorList(MI, OpNum, O, ""); +} - O << "#" << (Imm * MemScale); +template +void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + std::string Suffix("."); + if (NumLanes) + Suffix += itostr(NumLanes) + LaneKind; + else + Suffix += LaneKind; + + printVectorList(MI, OpNum, O, Suffix); } -void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo, +void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNo).getReg(); - std::string Name = getRegisterName(Reg); - Name[0] = 'v'; - O << Name; + O << "[" << MI->getOperand(OpNum).getImm() << "]"; } -void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); +void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + + // If the label has already been resolved to an immediate offset (say, when + // we're running the disassembler), just print the immediate. + if (Op.isImm()) { + O << "#" << (Op.getImm() << 2); + return; + } + + // If the branch target is simply an address then print it in hex. + const MCConstantExpr *BranchTarget = + dyn_cast(MI->getOperand(OpNum).getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - // If a symbolic branch target was added as a constant expression then print - // that address in hex. - const MCConstantExpr *BranchTarget = dyn_cast(Op.getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } - else { - // Otherwise, just print the expression. - O << *Op.getExpr(); - } + // Otherwise, just print the expression. 
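// --- Illustrative sketch (reviewer note, not part of the patch) ---
// printVectorList above emits NumRegs consecutive V registers starting from
// the list's first register, wrapping v31 -> v0 exactly as
// getNextVectorRegister allows. printList is a hypothetical helper that shows
// only the textual result.
#include <cstdio>

static void printList(unsigned FirstVReg, unsigned NumRegs,
                      const char *LayoutSuffix) {
  std::printf("{ ");
  for (unsigned i = 0; i < NumRegs; ++i) {
    std::printf("v%u%s", (FirstVReg + i) % 32, LayoutSuffix);
    if (i + 1 != NumRegs)
      std::printf(", ");
  }
  std::printf(" }\n");
}

int main() {
  printList(0, 2, ".16b");  // { v0.16b, v1.16b }
  printList(30, 3, ".4s");  // { v30.4s, v31.4s, v0.4s }  (wraps around)
}
// --- end sketch ---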
+ O << *MI->getOperand(OpNum).getExpr(); } } +void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); -void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - if (MI->getOpcode() == AArch64::TLSDESCCALL) { - // This is a special assembler directive which applies an - // R_AARCH64_TLSDESC_CALL to the following (BLR) instruction. It has a fixed - // form outside the normal TableGenerated scheme. - O << "\t.tlsdesccall " << *MI->getOperand(0).getExpr(); - } else if (!printAliasInstr(MI, O)) - printInstruction(MI, O); + // If the label has already been resolved to an immediate offset (say, when + // we're running the disassembler), just print the immediate. + if (Op.isImm()) { + O << "#" << (Op.getImm() << 12); + return; + } - printAnnotation(O, Annot); + // Otherwise, just print the expression. + O << *MI->getOperand(OpNum).getExpr(); } -template -void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - - assert(MO.isImm() && - "Immediate operand required for Neon vector immediate inst."); - - bool IsLSL = false; - if (Ext == A64SE::LSL) - IsLSL = true; - else if (Ext != A64SE::MSL) - llvm_unreachable("Invalid shift specifier in movi instruction"); - - int64_t Imm = MO.getImm(); - - // MSL and LSLH accepts encoded shift amount 0 or 1. - if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1) - llvm_unreachable("Invalid shift amount in movi instruction"); - - // LSH accepts encoded shift amount 0, 1, 2 or 3. - if (IsLSL && (Imm < 0 || Imm > 3)) - llvm_unreachable("Invalid shift amount in movi instruction"); - - // Print shift amount as multiple of 8 with MSL encoded shift amount - // 0 and 1 printed as 8 and 16. 
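// --- Illustrative sketch (reviewer note, not part of the patch) ---
// When a branch or ADRP target has already been resolved to an immediate
// (e.g. while disassembling), the raw field is scaled before printing: by 4
// for ordinary PC-relative labels and by 4096 for ADRP pages, mirroring the
// "<< 2" and "<< 12" in printAlignedLabel/printAdrpLabel above.
#include <cstdio>
#include <cstdint>

int main() {
  int64_t BranchField = 3;   // encoded in units of 4-byte instructions
  int64_t AdrpField = 2;     // encoded in units of 4 KiB pages
  std::printf("b    #%lld\n", (long long)(BranchField << 2));    // b    #12
  std::printf("adrp x0, #%lld\n", (long long)(AdrpField << 12)); // #8192
}
// --- end sketch ---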
- if (!IsLSL) - Imm++; - Imm *= 8; - - // LSL #0 is not printed - if (IsLSL) { - if (Imm == 0) - return; - O << ", lsl"; - } else - O << ", msl"; - - O << " #" << Imm; -} +void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + unsigned Opcode = MI->getOpcode(); -void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &o) { - o << "#0x0"; + bool Valid; + StringRef Name; + if (Opcode == AArch64::ISB) + Name = AArch64ISB::ISBMapper().toString(Val, Valid); + else + Name = AArch64DB::DBarrierMapper().toString(Val, Valid); + if (Valid) + O << Name; + else + O << "#" << Val; } -void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOUImm = MI->getOperand(OpNum); - - assert(MOUImm.isImm() && - "Immediate operand required for Neon vector immediate inst."); +void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); - unsigned Imm = MOUImm.getImm(); + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); - O << "#0x"; - O.write_hex(Imm); + if (Valid) + O << StringRef(Name).upper(); } -void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOUImm = MI->getOperand(OpNum); +void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); - assert(MOUImm.isImm() - && "Immediate operand required for Neon vector immediate inst."); + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); - unsigned Imm = MOUImm.getImm(); - O << Imm; + if (Valid) + O << StringRef(Name).upper(); } -void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOUImm8 = MI->getOperand(OpNum); - - assert(MOUImm8.isImm() && - "Immediate operand required for Neon vector immediate bytemask inst."); - - uint32_t UImm8 = MOUImm8.getImm(); - uint64_t Mask = 0; - - // Replicates 0x00 or 0xff byte in a 64-bit vector - for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { - if ((UImm8 >> ByteNum) & 1) - Mask |= (uint64_t)0xff << (8 * ByteNum); - } +void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); - O << "#0x"; - O.write_hex(Mask); + bool Valid; + StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid); + if (Valid) + O << StringRef(Name.str()).upper(); + else + O << "#" << Val; } -// If Count > 1, there are two valid kinds of vector list: -// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} -// (2) {Vn.layout - Vm.layout} -// We choose the first kind as output. -template -void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors"); - - unsigned Reg = MI->getOperand(OpNum).getReg(); - std::string LayoutStr = A64VectorLayoutToString(Layout); - O << "{ "; - if (Count > 1) { // Print sub registers separately - bool IsVec64 = (Layout < A64Layout::VL_16B); - unsigned SubRegIdx = IsVec64 ? 
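// --- Illustrative sketch (reviewer note, not part of the patch) ---
// The 64-bit "bytemask" modified immediate (AdvSIMD type 10) replicates each
// bit of an 8-bit value into a whole byte. This is what the removed
// printNeonUImm64MaskOperand loop above computed and what
// decodeAdvSIMDModImmType10 produces in the new printSIMDType10Operand.
// expandByteMask is a hypothetical name.
#include <cstdio>
#include <cstdint>

static uint64_t expandByteMask(uint8_t Imm8) {
  uint64_t Mask = 0;
  for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum)
    if ((Imm8 >> ByteNum) & 1)
      Mask |= (uint64_t)0xff << (8 * ByteNum);
  return Mask;
}

int main() {
  // movi v0.2d, #0x00ff00ff00ff00ff   <- imm8 = 0b01010101
  std::printf("#%#018llx\n", (unsigned long long)expandByteMask(0x55));
}
// --- end sketch ---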
AArch64::dsub_0 : AArch64::qsub_0; - for (unsigned I = 0; I < Count; I++) { - std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++)); - Name[0] = 'v'; - O << Name << LayoutStr; - if (I != Count - 1) - O << ", "; - } - } else { // Print the register directly when NumVecs is 1. - std::string Name = getRegisterName(Reg); - Name[0] = 'v'; - O << Name << LayoutStr; - } - O << " }"; +void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned RawVal = MI->getOperand(OpNo).getImm(); + uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); + O << format("#%#016llx", Val); } diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 7432755dd89b..fe7666e5cadb 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64INSTPRINTER_H -#define LLVM_AARCH64INSTPRINTER_H +#ifndef AArch64INSTPRINTER_H +#define AArch64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -28,158 +28,112 @@ class AArch64InstPrinter : public MCInstPrinter { AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - // Autogenerated by tblgen - void printInstruction(const MCInst *MI, raw_ostream &O); - bool printAliasInstr(const MCInst *MI, raw_ostream &O); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); - - void printRegName(raw_ostream &O, unsigned RegNum) const override; - - template - void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printAddrRegExtendOperand(MI, OpNum, O, MemSize, RmSize); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + + // Autogenerated by tblgen. 
+ virtual void printInstruction(const MCInst *MI, raw_ostream &O); + virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + virtual StringRef getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); } + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); - - void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, unsigned MemSize, - unsigned RmSize); - - void printAddSubImmLSL0Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O); - void printAddSubImmLSL12Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O); - - void printBareImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printBFILSBOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBFIWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBFXWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - - void printCondCodeOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printInverseCondCodeOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printCRxOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o); - - void printFPZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o); - - template - void printOffsetUImm12Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &o) { - printOffsetUImm12Operand(MI, OpNum, o, MemScale); +protected: + bool printSysAlias(const MCInst *MI, raw_ostream &O); + // Operand printers + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, + raw_ostream &O); + template + void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printPostIncOperand(MI, OpNo, Amount, O); } - void printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &o, int MemScale); - - template - void printLabelOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printLogicalImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printNamedImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printNamedImmOperand(SomeNamedImmMapper(), MI, OpNum, O); + void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O, + char SrcRegKind, unsigned Width); + template + void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printMemExtend(MI, OpNum, O, SrcRegKind, Width); } - void 
printNamedImmOperand(const NamedImmMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printSysRegOperand(const A64SysReg::SysRegMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printMRSOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printSysRegOperand(A64SysReg::MRSMapper(), MI, OpNum, O); - } + void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, + raw_ostream &O); + void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, + raw_ostream &O); - void printMSROperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printSysRegOperand(A64SysReg::MSRMapper(), MI, OpNum, O); + template + void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printUImm12Offset(MI, OpNum, Scale, O); } - void printShiftOperand(const char *name, const MCInst *MI, - unsigned OpIdx, raw_ostream &O); - - void printLSLOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printLSROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand("lsr", MI, OpNum, O); - } - void printASROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand("asr", MI, OpNum, O); - } - void printROROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand("ror", MI, OpNum, O); + template + void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printAMIndexedWB(MI, OpNum, BitWidth / 8, O); } - template - void printShiftOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand(MI, OpNum, O, Shift); - } + void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShiftOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, A64SE::ShiftExtSpecifiers Sh); + template + void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMoveWideImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - template void - printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, + StringRef LayoutSuffix); - void printOffsetSImm9Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); + /// Print a list of vector registers where the type suffix is implicit + /// (i.e. attached to the instruction rather than the registers). 
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O); - void printPRFMOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + template + void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - template - void printRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printRegExtendOperand(MI, OpNum, O, EXT); - } + void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); +}; - void printRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, A64SE::ShiftExtSpecifiers Ext); +class AArch64AppleInstPrinter : public AArch64InstPrinter { +public: + AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - bool isStackReg(unsigned RegNo) { - return RegNo == AArch64::XSP || RegNo == AArch64::WSP; + void printInstruction(const MCInst *MI, raw_ostream &O) override; + bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + StringRef getRegName(unsigned RegNo) const override { + return getRegisterName(RegNo); } - - template - void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); }; } diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt index 3db56e4733f5..363f50258d77 100644 --- a/lib/Target/AArch64/InstPrinter/CMakeLists.txt +++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt @@ -1,3 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) + add_llvm_library(LLVMAArch64AsmPrinter AArch64InstPrinter.cpp ) + +add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen) diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt index 4836c7c45d44..a13e842cdd3b 100644 --- a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt +++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; +;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile index 1c36a8dea798..b17e8d080119 100644 --- a/lib/Target/AArch64/InstPrinter/Makefile +++ b/lib/Target/AArch64/InstPrinter/Makefile @@ -9,7 +9,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64AsmPrinter -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' arm target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt index 1b838282bd41..642c18394a67 100644 --- a/lib/Target/AArch64/LLVMBuild.txt +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/LLVMBuild.txt -----------------------*- Conf -*--===; +;===- ./lib/Target/AArch64/LLVMBuild.txt -------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = AArch64CodeGen parent = AArch64 -required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target add_to_library_groups = AArch64 diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h similarity index 89% rename from lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h rename to lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 53bd3545a594..8b1e44e26e93 100644 --- a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -1,4 +1,4 @@ -//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===// +//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,12 @@ // //===----------------------------------------------------------------------===// // -// This file contains the ARM64 addressing mode implementation stuff. +// This file contains the AArch64 addressing mode implementation stuff. 
// //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H -#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H +#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H +#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -22,8 +22,8 @@ namespace llvm { -/// ARM64_AM - ARM64 Addressing Mode Stuff -namespace ARM64_AM { +/// AArch64_AM - AArch64 Addressing Mode Stuff +namespace AArch64_AM { //===----------------------------------------------------------------------===// // Shifts @@ -49,35 +49,35 @@ enum ShiftExtendType { }; /// getShiftName - Get the string encoding for the shift type. -static inline const char *getShiftExtendName(ARM64_AM::ShiftExtendType ST) { +static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) { switch (ST) { default: assert(false && "unhandled shift type!"); - case ARM64_AM::LSL: return "lsl"; - case ARM64_AM::LSR: return "lsr"; - case ARM64_AM::ASR: return "asr"; - case ARM64_AM::ROR: return "ror"; - case ARM64_AM::MSL: return "msl"; - case ARM64_AM::UXTB: return "uxtb"; - case ARM64_AM::UXTH: return "uxth"; - case ARM64_AM::UXTW: return "uxtw"; - case ARM64_AM::UXTX: return "uxtx"; - case ARM64_AM::SXTB: return "sxtb"; - case ARM64_AM::SXTH: return "sxth"; - case ARM64_AM::SXTW: return "sxtw"; - case ARM64_AM::SXTX: return "sxtx"; + case AArch64_AM::LSL: return "lsl"; + case AArch64_AM::LSR: return "lsr"; + case AArch64_AM::ASR: return "asr"; + case AArch64_AM::ROR: return "ror"; + case AArch64_AM::MSL: return "msl"; + case AArch64_AM::UXTB: return "uxtb"; + case AArch64_AM::UXTH: return "uxth"; + case AArch64_AM::UXTW: return "uxtw"; + case AArch64_AM::UXTX: return "uxtx"; + case AArch64_AM::SXTB: return "sxtb"; + case AArch64_AM::SXTH: return "sxth"; + case AArch64_AM::SXTW: return "sxtw"; + case AArch64_AM::SXTX: return "sxtx"; } return nullptr; } /// getShiftType - Extract the shift type. 
-static inline ARM64_AM::ShiftExtendType getShiftType(unsigned Imm) { +static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) { switch ((Imm >> 6) & 0x7) { - default: return ARM64_AM::InvalidShiftExtend; - case 0: return ARM64_AM::LSL; - case 1: return ARM64_AM::LSR; - case 2: return ARM64_AM::ASR; - case 3: return ARM64_AM::ROR; - case 4: return ARM64_AM::MSL; + default: return AArch64_AM::InvalidShiftExtend; + case 0: return AArch64_AM::LSL; + case 1: return AArch64_AM::LSR; + case 2: return AArch64_AM::ASR; + case 3: return AArch64_AM::ROR; + case 4: return AArch64_AM::MSL; } } @@ -95,17 +95,17 @@ static inline unsigned getShiftValue(unsigned Imm) { /// 100 ==> msl /// {8-6} = shifter /// {5-0} = imm -static inline unsigned getShifterImm(ARM64_AM::ShiftExtendType ST, +static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm) { assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!"); unsigned STEnc = 0; switch (ST) { default: llvm_unreachable("Invalid shift requested"); - case ARM64_AM::LSL: STEnc = 0; break; - case ARM64_AM::LSR: STEnc = 1; break; - case ARM64_AM::ASR: STEnc = 2; break; - case ARM64_AM::ROR: STEnc = 3; break; - case ARM64_AM::MSL: STEnc = 4; break; + case AArch64_AM::LSL: STEnc = 0; break; + case AArch64_AM::LSR: STEnc = 1; break; + case AArch64_AM::ASR: STEnc = 2; break; + case AArch64_AM::ROR: STEnc = 3; break; + case AArch64_AM::MSL: STEnc = 4; break; } return (STEnc << 6) | (Imm & 0x3f); } @@ -120,22 +120,22 @@ static inline unsigned getArithShiftValue(unsigned Imm) { } /// getExtendType - Extract the extend type for operands of arithmetic ops. -static inline ARM64_AM::ShiftExtendType getExtendType(unsigned Imm) { +static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) { assert((Imm & 0x7) == Imm && "invalid immediate!"); switch (Imm) { default: llvm_unreachable("Compiler bug!"); - case 0: return ARM64_AM::UXTB; - case 1: return ARM64_AM::UXTH; - case 2: return ARM64_AM::UXTW; - case 3: return ARM64_AM::UXTX; - case 4: return ARM64_AM::SXTB; - case 5: return ARM64_AM::SXTH; - case 6: return ARM64_AM::SXTW; - case 7: return ARM64_AM::SXTX; + case 0: return AArch64_AM::UXTB; + case 1: return AArch64_AM::UXTH; + case 2: return AArch64_AM::UXTW; + case 3: return AArch64_AM::UXTX; + case 4: return AArch64_AM::SXTB; + case 5: return AArch64_AM::SXTH; + case 6: return AArch64_AM::SXTW; + case 7: return AArch64_AM::SXTX; } } -static inline ARM64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { +static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { return getExtendType((Imm >> 3) & 0x7); } @@ -148,17 +148,17 @@ static inline ARM64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { /// 101 ==> sxth /// 110 ==> sxtw /// 111 ==> sxtx -inline unsigned getExtendEncoding(ARM64_AM::ShiftExtendType ET) { +inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) { switch (ET) { default: llvm_unreachable("Invalid extend type requested"); - case ARM64_AM::UXTB: return 0; break; - case ARM64_AM::UXTH: return 1; break; - case ARM64_AM::UXTW: return 2; break; - case ARM64_AM::UXTX: return 3; break; - case ARM64_AM::SXTB: return 4; break; - case ARM64_AM::SXTH: return 5; break; - case ARM64_AM::SXTW: return 6; break; - case ARM64_AM::SXTX: return 7; break; + case AArch64_AM::UXTB: return 0; break; + case AArch64_AM::UXTH: return 1; break; + case AArch64_AM::UXTW: return 2; break; + case AArch64_AM::UXTX: return 3; break; + case AArch64_AM::SXTB: return 4; break; + case AArch64_AM::SXTH: 
return 5; break; + case AArch64_AM::SXTW: return 6; break; + case AArch64_AM::SXTX: return 7; break; } } @@ -167,7 +167,7 @@ inline unsigned getExtendEncoding(ARM64_AM::ShiftExtendType ET) { /// imm: 3-bit extend amount /// {5-3} = shifter /// {2-0} = imm3 -static inline unsigned getArithExtendImm(ARM64_AM::ShiftExtendType ET, +static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm) { assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!"); return (getExtendEncoding(ET) << 3) | (Imm & 0x7); @@ -181,7 +181,7 @@ static inline bool getMemDoShift(unsigned Imm) { /// getExtendType - Extract the extend type for the offset operand of /// loads/stores. -static inline ARM64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { +static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { return getExtendType((Imm >> 1) & 0x7); } @@ -197,7 +197,7 @@ static inline ARM64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { /// 111 ==> sxtx /// {3-1} = shifter /// {0} = doshift -static inline unsigned getMemExtendImm(ARM64_AM::ShiftExtendType ET, +static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET, bool DoShift) { return (getExtendEncoding(ET) << 1) | unsigned(DoShift); } @@ -731,7 +731,7 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { return (EncVal << 32) | EncVal; } -} // end namespace ARM64_AM +} // end namespace AArch64_AM } // end namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index e0931e420785..a917616f142d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -6,168 +6,57 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the AArch64 implementation of the MCAsmBackend class, -// which is principally concerned with relaxation of the various fixup kinds. 
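Before moving on to the assembler backend below, it is worth spelling out the packed-immediate layouts that the renamed AArch64AddressingModes.h hunk above documents: a shifted immediate keeps the shifter kind in bits 8:6 and the amount in bits 5:0, and an arithmetic extend keeps the extend kind in bits 5:3 with a 3-bit amount in bits 2:0. A self-contained round-trip of those layouts (plain C++, independent of the LLVM headers; the enum values mirror the encodings shown above):

    #include <cassert>

    // Shift kinds use the encodings documented in getShifterImm: lsl=0, lsr=1,
    // asr=2, ror=3, msl=4.
    enum ShiftKind { LSL = 0, LSR = 1, ASR = 2, ROR = 3, MSL = 4 };

    // Pack a shifted immediate: {8-6} = shifter, {5-0} = amount.
    unsigned packShiftedImm(ShiftKind ST, unsigned Amount) {
      assert(Amount <= 0x3f && "shift amount must fit in 6 bits");
      return (unsigned(ST) << 6) | (Amount & 0x3f);
    }

    ShiftKind unpackShiftKind(unsigned Imm) { return ShiftKind((Imm >> 6) & 0x7); }
    unsigned unpackShiftAmount(unsigned Imm) { return Imm & 0x3f; }

    // Pack an arithmetic extend: {5-3} = extend kind (uxtb..sxtx), {2-0} = imm3.
    unsigned packArithExtend(unsigned ExtendEnc, unsigned Imm3) {
      assert(ExtendEnc <= 7 && Imm3 <= 7);
      return (ExtendEnc << 3) | Imm3;
    }

    int main() {
      unsigned Enc = packShiftedImm(LSR, 12);
      assert(unpackShiftKind(Enc) == LSR && unpackShiftAmount(Enc) == 12);

      unsigned Ext = packArithExtend(/*uxtw*/ 2, /*#2*/ 2);
      assert(((Ext >> 3) & 0x7) == 2 && (Ext & 0x7) == 2);
      return 0;
    }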
-// -//===----------------------------------------------------------------------===// +#include "AArch64.h" +#include "AArch64RegisterInfo.h" #include "MCTargetDesc/AArch64FixupKinds.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/MachO.h" using namespace llvm; namespace { -class AArch64AsmBackend : public MCAsmBackend { - const MCSubtargetInfo* STI; -public: - AArch64AsmBackend(const Target &T, const StringRef TT) - : MCAsmBackend(), - STI(AArch64_MC::createAArch64MCSubtargetInfo(TT, "", "")) - {} - - - ~AArch64AsmBackend() { - delete STI; - } - - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - virtual void processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; -}; -} // end anonymous namespace - -void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, - uint64_t &Value, bool &IsResolved) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! 
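The removed comment above (kept verbatim in the new ELF backend later in this file) is the whole reason page-relative ADRP fixups are pushed out to the linker: the value the assembler would encode depends on which 4 KiB page the ADRP instruction itself lands in. Working through the comment's own example in a few lines of standalone C++ (addresses as in the comment; not the patch's code):

    #include <cassert>
    #include <cstdint>

    // ADRP materializes Page(target) - Page(pc), where Page(x) = x & ~0xfff.
    uint64_t page(uint64_t Addr) { return Addr & ~UINT64_C(0xfff); }

    int64_t adrpDelta(uint64_t PC, uint64_t Target) {
      return int64_t(page(Target)) - int64_t(page(PC));
    }

    int main() {
      // "ADRP x0, there" at 0xffc, with "there:" immediately after at 0x1000:
      // the label sits one page up, so the offset is 0x1000.
      assert(adrpDelta(0xffc, 0x1000) == 0x1000);
      // Move the same pair a few bytes later and both land in one page: 0x0.
      assert(adrpDelta(0x1000, 0x1004) == 0);
      return 0;
    }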
- if ((uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_page || - (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_got_page || - (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_gottprel_page || - (uint32_t)Fixup.getKind() == AArch64::fixup_a64_tlsdesc_adr_page) - IsResolved = false; -} - - -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value); -namespace { +class AArch64AsmBackend : public MCAsmBackend { + static const unsigned PCRelFlagVal = + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; -class ELFAArch64AsmBackend : public AArch64AsmBackend { - uint8_t OSABI; - bool IsLittle; // Big or little endian public: - ELFAArch64AsmBackend(const Target &T, const StringRef TT, - uint8_t _OSABI, bool isLittle) - : AArch64AsmBackend(T, TT), OSABI(_OSABI), IsLittle(isLittle) { } - - bool fixupNeedsRelaxation(const MCFixup &Fixup, - uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override; + AArch64AsmBackend(const Target &T) : MCAsmBackend() {} - unsigned int getNumFixupKinds() const override { + unsigned getNumFixupKinds() const override { return AArch64::NumTargetFixupKinds; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { -// This table *must* be in the order that the fixup_* kinds are defined in -// AArch64FixupKinds.h. -// -// Name Offset (bits) Size (bits) Flags -{ "fixup_a64_ld_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_adr_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_adr_prel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_add_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst8_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst16_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst32_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst64_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst128_lo12", 0, 32, 0 }, -{ "fixup_a64_tstbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_condbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_uncondbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_call", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_movw_uabs_g0", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g0_nc", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g1", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g1_nc", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g2", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g2_nc", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g3", 0, 32, 0 }, -{ "fixup_a64_movw_sabs_g0", 0, 32, 0 }, -{ "fixup_a64_movw_sabs_g1", 0, 32, 0 }, -{ "fixup_a64_movw_sabs_g2", 0, 32, 0 }, -{ "fixup_a64_adr_prel_got_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_ld64_got_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g2", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g1", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g1_nc", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g0", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g0_nc", 0, 32, 0 }, -{ "fixup_a64_add_dtprel_hi12", 0, 32, 0 }, -{ "fixup_a64_add_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_add_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst8_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst8_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst16_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst16_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst32_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst32_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst64_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst64_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_movw_gottprel_g1", 0, 32, 0 }, -{ "fixup_a64_movw_gottprel_g0_nc", 0, 32, 0 }, -{ 
"fixup_a64_adr_gottprel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_ld64_gottprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ld_gottprel_prel19", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_movw_tprel_g2", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g1", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g1_nc", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g0", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g0_nc", 0, 32, 0 }, -{ "fixup_a64_add_tprel_hi12", 0, 32, 0 }, -{ "fixup_a64_add_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_add_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst8_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst8_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst16_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst16_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst32_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst32_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst64_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst64_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_tlsdesc_adr_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_tlsdesc_ld64_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_tlsdesc_add_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_tlsdesc_call", 0, 0, 0 } + // This table *must* be in the order that the fixup_* kinds are defined in + // AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_add_imm12", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, + { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_movw", 5, 16, 0 }, + { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } }; + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -177,417 +66,501 @@ class ELFAArch64AsmBackend : public AArch64AsmBackend { } void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override { - unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8; - Value = adjustFixupValue(Fixup.getKind(), Value); - if (!Value) return; // Doesn't change encoding. - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the bits - // from the fixup value. 
- for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); - } - } + uint64_t Value, bool IsPCRel) const override; - bool mayNeedRelaxation(const MCInst&) const override { - return false; - } + bool mayNeedRelaxation(const MCInst &Inst) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - void relaxInstruction(const MCInst&, llvm::MCInst&) const override { - llvm_unreachable("Cannot relax instructions"); - } + void HandleAssemblerFlag(MCAssemblerFlag Flag) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createAArch64ELFObjectWriter(OS, OSABI, IsLittle); - } + unsigned getPointerSize() const { return 8; } }; } // end anonymous namespace -bool -ELFAArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, - uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { - // Correct for now. With all instructions 32-bit only very low-level - // considerations could make you select something which may fail. - return false; -} +/// \brief The number of bytes the fixup may change. +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case AArch64::fixup_aarch64_tlsdesc_call: + return 0; -bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - // Can't emit NOP with size not multiple of 32-bits - if (Count % 4 != 0) - return false; + case FK_Data_1: + return 1; - uint64_t NumNops = Count / 4; - for (uint64_t i = 0; i != NumNops; ++i) - OW->Write32(0xd503201f); + case FK_Data_2: + case AArch64::fixup_aarch64_movw: + return 2; + + case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + return 3; + + case AArch64::fixup_aarch64_pcrel_adr_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + case FK_Data_4: + return 4; - return true; + case FK_Data_8: + return 8; + } } -static unsigned ADRImmBits(unsigned Value) { +static unsigned AdrImmBits(unsigned Value) { unsigned lo2 = Value & 0x3; - unsigned hi19 = (Value & 0x1fffff) >> 2; - + unsigned hi19 = (Value & 0x1ffffc) >> 2; return (hi19 << 5) | (lo2 << 29); } static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { + int64_t SignedValue = static_cast(Value); switch (Kind) { default: - llvm_unreachable("Unknown fixup kind!"); - case FK_Data_2: - assert((int64_t)Value >= -32768 && - (int64_t)Value <= 65536 && - "Out of range ABS16 fixup"); + assert(false && "Unknown fixup kind!"); + case AArch64::fixup_aarch64_pcrel_adr_imm21: + if (SignedValue > 2097151 || SignedValue < -2097152) + report_fatal_error("fixup value out of range"); + return AdrImmBits(Value & 0x1fffffULL); + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return AdrImmBits((Value & 0x1fffff000ULL) >> 12); + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case 
AArch64::fixup_aarch64_pcrel_branch19: + // Signed 21-bit immediate + if (SignedValue > 2097151 || SignedValue < -2097152) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded. + return (Value >> 2) & 0x7ffff; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + // Unsigned 12-bit immediate + if (Value >= 0x1000) + report_fatal_error("invalid imm12 fixup value"); return Value; - case FK_Data_4: - assert((int64_t)Value >= -(1LL << 31) && - (int64_t)Value <= (1LL << 32) - 1 && - "Out of range ABS32 fixup"); + case AArch64::fixup_aarch64_ldst_imm12_scale2: + // Unsigned 12-bit immediate which gets multiplied by 2 + if (Value & 1 || Value >= 0x2000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 1; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + // Unsigned 12-bit immediate which gets multiplied by 4 + if (Value & 3 || Value >= 0x4000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 2; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + // Unsigned 12-bit immediate which gets multiplied by 8 + if (Value & 7 || Value >= 0x8000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 3; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + // Unsigned 12-bit immediate which gets multiplied by 16 + if (Value & 15 || Value >= 0x10000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 4; + case AArch64::fixup_aarch64_movw: + report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); return Value; + case AArch64::fixup_aarch64_pcrel_branch14: + // Signed 16-bit immediate + if (SignedValue > 32767 || SignedValue < -32768) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Signed 28-bit immediate + if (SignedValue > 134217727 || SignedValue < -134217728) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3ffffff; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: case FK_Data_8: return Value; + } +} - case AArch64::fixup_a64_ld_gottprel_prel19: - // R_AARCH64_LD_GOTTPREL_PREL19: Set a load-literal immediate to bits 1F - // FFFC of G(TPREL(S+A)) - P; check -2^20 <= X < 2^20. - case AArch64::fixup_a64_ld_prel: - // R_AARCH64_LD_PREL_LO19: Sets a load-literal (immediate) value to bits - // 1F FFFC of S+A-P, checking that -2^20 <= S+A-P < 2^20. - assert((int64_t)Value >= -(1LL << 20) && - (int64_t)Value < (1LL << 20) && "Out of range LDR (lit) fixup"); - return (Value & 0x1ffffc) << 3; - - case AArch64::fixup_a64_adr_prel: - // R_AARCH64_ADR_PREL_LO21: Sets an ADR immediate value to bits 1F FFFF of - // the result of S+A-P, checking that -2^20 <= S+A-P < 2^20. - assert((int64_t)Value >= -(1LL << 20) && - (int64_t)Value < (1LL << 20) && "Out of range ADR fixup"); - return ADRImmBits(Value & 0x1fffff); - - case AArch64::fixup_a64_adr_prel_page: - // R_AARCH64_ADR_PREL_PG_HI21: Sets an ADRP immediate value to bits 1 FFFF - // F000 of the result of the operation, checking that -2^32 <= result < - // 2^32. 
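The new scaled load/store cases of adjustFixupValue above all follow one pattern: the byte offset must be a multiple of the access size, and after dividing by that size it must still fit the unsigned 12-bit field, otherwise the fixup is rejected. A compact standalone sketch of that check-and-scale step (the real code reports a fatal error; a thrown exception stands in for it here):

    #include <cassert>
    #include <cstdint>
    #include <stdexcept>

    // Convert a byte offset into the imm12 field of a scaled LDR/STR,
    // mirroring the fixup_aarch64_ldst_imm12_scale{1,2,4,8,16} cases.
    uint64_t adjustLdStImm12(uint64_t ByteOffset, unsigned Scale) {
      assert((Scale & (Scale - 1)) == 0 && "scale must be a power of two");
      if (ByteOffset % Scale != 0)
        throw std::runtime_error("invalid imm12 fixup value: misaligned");
      uint64_t Imm = ByteOffset / Scale;
      if (Imm >= 0x1000)                // must fit an unsigned 12-bit field
        throw std::runtime_error("invalid imm12 fixup value: out of range");
      return Imm;
    }

    int main() {
      assert(adjustLdStImm12(32, 8) == 4);      // ldr x0, [x1, #32] -> imm12 = 4
      assert(adjustLdStImm12(4095, 1) == 4095); // largest byte-scaled offset
      try {
        adjustLdStImm12(33, 8);                 // not 8-byte aligned
        assert(false && "should have thrown");
      } catch (const std::runtime_error &) {
      }
      return 0;
    }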
- assert((int64_t)Value >= -(1LL << 32) && - (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup"); - return ADRImmBits((Value & 0x1fffff000ULL) >> 12); - - case AArch64::fixup_a64_add_dtprel_hi12: - // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits - // FF F000 of DTPREL(S+A), check 0 <= X < 2^24. - case AArch64::fixup_a64_add_tprel_hi12: - // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits - // FF F000 of TPREL(S+A), check 0 <= X < 2^24. - assert((int64_t)Value >= 0 && - (int64_t)Value < (1LL << 24) && "Out of range ADD fixup"); - return (Value & 0xfff000) >> 2; - - case AArch64::fixup_a64_add_dtprel_lo12: - // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits - // FFF of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_add_tprel_lo12: - // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits - // FFF of TPREL(S+A), check 0 <= X < 2^12. - assert((int64_t)Value >= 0 && - (int64_t)Value < (1LL << 12) && "Out of range ADD fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_add_dtprel_lo12_nc: - // R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: Set an ADD immediate field to bits - // FFF of DTPREL(S+A) with no overflow check. - case AArch64::fixup_a64_add_tprel_lo12_nc: - // R_AARCH64_TLSLD_ADD_TPREL_LO12_NC: Set an ADD immediate field to bits - // FFF of TPREL(S+A) with no overflow check. - case AArch64::fixup_a64_tlsdesc_add_lo12_nc: - // R_AARCH64_TLSDESC_ADD_LO12_NC: Set an ADD immediate field to bits - // FFF of G(TLSDESC(S+A)), with no overflow check. - case AArch64::fixup_a64_add_lo12: - // R_AARCH64_ADD_ABS_LO12_NC: Sets an ADD immediate value to bits FFF of - // S+A, with no overflow check. - return (Value & 0xfff) << 10; - - case AArch64::fixup_a64_ldst8_dtprel_lo12: - // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst8_tprel_lo12: - // R_AARCH64_TLSLE_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst8_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF - // of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst8_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst8_lo12: - // R_AARCH64_LDST8_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFF - // of S+A, with no overflow check. - return (Value & 0xfff) << 10; - - case AArch64::fixup_a64_ldst16_dtprel_lo12: - // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst16_tprel_lo12: - // R_AARCH64_TLSLE_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst16_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE - // of DTPREL(S+A), with no overflow check. 
- case AArch64::fixup_a64_ldst16_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst16_lo12: - // R_AARCH64_LDST16_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFE - // of S+A, with no overflow check. - return (Value & 0xffe) << 9; - - case AArch64::fixup_a64_ldst32_dtprel_lo12: - // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst32_tprel_lo12: - // R_AARCH64_TLSLE_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst32_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC - // of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst32_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst32_lo12: - // R_AARCH64_LDST32_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFC - // of S+A, with no overflow check. - return (Value & 0xffc) << 8; - - case AArch64::fixup_a64_ldst64_dtprel_lo12: - // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8 - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst64_tprel_lo12: - // R_AARCH64_TLSLE_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8 - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst64_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8 - // of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst64_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8 - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst64_lo12: - // R_AARCH64_LDST64_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF8 - // of S+A, with no overflow check. - return (Value & 0xff8) << 7; - - case AArch64::fixup_a64_ldst128_lo12: - // R_AARCH64_LDST128_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF0 - // of S+A, with no overflow check. - return (Value & 0xff0) << 6; - - case AArch64::fixup_a64_movw_uabs_g0: - // R_AARCH64_MOVW_UABS_G0: Sets a MOVZ immediate field to bits FFFF of S+A - // with a check that S+A < 2^16 - assert(Value <= 0xffff && "Out of range move wide fixup"); - return (Value & 0xffff) << 5; - - case AArch64::fixup_a64_movw_dtprel_g0_nc: - // R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: Sets a MOVK immediate field to bits - // FFFF of DTPREL(S+A) with no overflow check. - case AArch64::fixup_a64_movw_gottprel_g0_nc: - // R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: Sets a MOVK immediate field to bits - // FFFF of G(TPREL(S+A)) - GOT with no overflow check. - case AArch64::fixup_a64_movw_tprel_g0_nc: - // R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: Sets a MOVK immediate field to bits - // FFFF of TPREL(S+A) with no overflow check. - case AArch64::fixup_a64_movw_uabs_g0_nc: - // R_AARCH64_MOVW_UABS_G0_NC: Sets a MOVK immediate field to bits FFFF of - // S+A with no overflow check. 
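The removed MOVW fixups above all target the same field: the 16-bit immediate of a MOVZ/MOVK/MOVN occupies bits 20:5 of the instruction, so each fixup takes one 16-bit group of the value (the _g1/_g2/_g3 variants, shown next in this hunk, shift the value down by 16, 32 or 48 bits first) and moves it up by five bits. A few lines of standalone C++ showing that group extraction (not the patch's code):

    #include <cassert>
    #include <cstdint>

    // Extract 16-bit group G (0..3) of a 64-bit value and place it in the
    // MOVZ/MOVK immediate field, which occupies bits 20:5 of the instruction.
    uint32_t movwImmField(uint64_t Value, unsigned G) {
      assert(G < 4 && "a 64-bit value only has four 16-bit groups");
      return uint32_t((Value >> (16 * G)) & 0xffff) << 5;
    }

    int main() {
      uint64_t Addr = 0x0000123456789abcULL;
      assert(movwImmField(Addr, 0) == (0x9abcu << 5)); // group 0: bits 15:0
      assert(movwImmField(Addr, 1) == (0x5678u << 5)); // group 1: bits 31:16
      assert(movwImmField(Addr, 2) == (0x1234u << 5)); // group 2: bits 47:32
      return 0;
    }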
- return (Value & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g1: - // R_AARCH64_MOVW_UABS_G1: Sets a MOVZ immediate field to bits FFFF0000 of - // S+A with a check that S+A < 2^32 - assert(Value <= 0xffffffffull && "Out of range move wide fixup"); - return ((Value >> 16) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_dtprel_g1_nc: - // R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: Set a MOVK immediate field - // to bits FFFF0000 of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_movw_tprel_g1_nc: - // R_AARCH64_TLSLD_MOVW_TPREL_G1_NC: Set a MOVK immediate field - // to bits FFFF0000 of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_movw_uabs_g1_nc: - // R_AARCH64_MOVW_UABS_G1_NC: Sets a MOVK immediate field to bits - // FFFF0000 of S+A with no overflow check. - return ((Value >> 16) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g2: - // R_AARCH64_MOVW_UABS_G2: Sets a MOVZ immediate field to bits FFFF 0000 - // 0000 of S+A with a check that S+A < 2^48 - assert(Value <= 0xffffffffffffull && "Out of range move wide fixup"); - return ((Value >> 32) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g2_nc: - // R_AARCH64_MOVW_UABS_G2: Sets a MOVK immediate field to bits FFFF 0000 - // 0000 of S+A with no overflow check. - return ((Value >> 32) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g3: - // R_AARCH64_MOVW_UABS_G3: Sets a MOVZ immediate field to bits FFFF 0000 - // 0000 0000 of S+A (no overflow check needed) - return ((Value >> 48) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_dtprel_g0: - // R_AARCH64_TLSLD_MOVW_DTPREL_G0: Set a MOV[NZ] immediate field - // to bits FFFF of DTPREL(S+A). - case AArch64::fixup_a64_movw_tprel_g0: - // R_AARCH64_TLSLE_MOVW_TPREL_G0: Set a MOV[NZ] immediate field to - // bits FFFF of TPREL(S+A). - case AArch64::fixup_a64_movw_sabs_g0: { - // R_AARCH64_MOVW_SABS_G0: Sets MOV[NZ] immediate field using bits FFFF of - // S+A (see notes below); check -2^16 <= S+A < 2^16. (notes say that we - // should convert between MOVN and MOVZ to achieve our goals). - int64_t Signed = Value; - assert(Signed >= -(1LL << 16) && Signed < (1LL << 16) - && "Out of range move wide fixup"); - if (Signed >= 0) { - Value = (Value & 0xffff) << 5; - // Bit 30 converts the MOVN encoding into a MOVZ - Value |= 1 << 30; - } else { - // MCCodeEmitter should have encoded a MOVN, which is fine. - Value = (~Value & 0xffff) << 5; - } - return Value; +void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + if (!Value) + return; // Doesn't change encoding. + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + // Apply any target-specific value adjustments. + Value = adjustFixupValue(Fixup.getKind(), Value); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + return false; +} + +bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME: This isn't correct for AArch64. 
Just moving the "generic" logic + // into the targets for now. + // + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + MCInst &Res) const { + assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented"); +} + +bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + if ((Count & 3) != 0) { + for (uint64_t i = 0, e = (Count & 3); i != e; ++i) + OW->Write8(0); } - case AArch64::fixup_a64_movw_dtprel_g1: - // R_AARCH64_TLSLD_MOVW_DTPREL_G1: Set a MOV[NZ] immediate field - // to bits FFFF0000 of DTPREL(S+A). - case AArch64::fixup_a64_movw_gottprel_g1: - // R_AARCH64_TLSIE_MOVW_GOTTPREL_G1: Set a MOV[NZ] immediate field - // to bits FFFF0000 of G(TPREL(S+A)) - GOT. - case AArch64::fixup_a64_movw_tprel_g1: - // R_AARCH64_TLSLE_MOVW_TPREL_G1: Set a MOV[NZ] immediate field to - // bits FFFF0000 of TPREL(S+A). - case AArch64::fixup_a64_movw_sabs_g1: { - // R_AARCH64_MOVW_SABS_G1: Sets MOV[NZ] immediate field using bits FFFF 0000 - // of S+A (see notes below); check -2^32 <= S+A < 2^32. (notes say that we - // should convert between MOVN and MOVZ to achieve our goals). - int64_t Signed = Value; - assert(Signed >= -(1LL << 32) && Signed < (1LL << 32) - && "Out of range move wide fixup"); - if (Signed >= 0) { - Value = ((Value >> 16) & 0xffff) << 5; - // Bit 30 converts the MOVN encoding into a MOVZ - Value |= 1 << 30; - } else { - Value = ((~Value >> 16) & 0xffff) << 5; - } - return Value; + // We are properly aligned, so write NOPs as requested. + Count /= 4; + for (uint64_t i = 0; i != Count; ++i) + OW->Write32(0xd503201f); + return true; +} + +namespace { + +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + /// \brief A "frameless" leaf function, where no non-volatile registers are + /// saved. The return remains in LR throughout the function. + UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + + /// \brief No compact unwind encoding available. Instead the low 23-bits of + /// the compact unwind encoding is the offset of the DWARF FDE in the + /// __eh_frame section. This mode is never used in object files. It is only + /// generated by the linker in final linked images, which have only DWARF info + /// for a function. + UNWIND_AArch64_MODE_DWARF = 0x03000000, + + /// \brief This is a standard arm64 prologue where FP/LR are immediately + /// pushed on the stack, then SP is copied to FP. If there are any + /// non-volatile register saved, they are copied into the stack fame in pairs + /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the + /// five X pairs and four D pairs can be saved, but the memory layout must be + /// in register number order. + UNWIND_AArch64_MODE_FRAME = 0x04000000, + + /// \brief Frame register pair encodings. 
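The new writeNopData above also replaces the old hard failure on counts that are not a multiple of four: the unaligned remainder is assumed to be data emitted into the text section and is padded with zero bytes, and the rest is filled with the A64 NOP word 0xd503201f. A buffer-level sketch of the same policy (standalone; little-endian byte order assumed, as on the targets this backend writes for):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Fill Count bytes of padding: zero bytes for any non-instruction-sized
    // remainder, then 32-bit NOPs (0xd503201f) for the aligned part.
    std::vector<uint8_t> makeNopPadding(uint64_t Count) {
      std::vector<uint8_t> Out;
      for (uint64_t i = 0, e = Count & 3; i != e; ++i)
        Out.push_back(0);                 // unaligned tail: plain zeros
      for (uint64_t i = 0, e = Count / 4; i != e; ++i) {
        // One NOP, emitted little-endian.
        Out.push_back(0x1f);
        Out.push_back(0x20);
        Out.push_back(0x03);
        Out.push_back(0xd5);
      }
      return Out;
    }

    int main() {
      assert(makeNopPadding(8).size() == 8);  // two NOPs
      assert(makeNopPadding(6).size() == 6);  // 2 zero bytes + one NOP
      assert(makeNopPadding(6)[0] == 0 && makeNopPadding(6)[2] == 0x1f);
      return 0;
    }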
+ UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 +}; + +} // end CU namespace + +// FIXME: This should be in a separate file. +class DarwinAArch64AsmBackend : public AArch64AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Encode compact unwind stack adjustment for frameless functions. + /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// The stack size always needs to be 16 byte aligned. + uint32_t encodeStackAdjustment(uint32_t StackSize) const { + return (StackSize / 16) << 12; + } + +public: + DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) + : AArch64AsmBackend(T), MRI(MRI) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, + MachO::CPU_SUBTYPE_ARM64_ALL); } - case AArch64::fixup_a64_movw_dtprel_g2: - // R_AARCH64_TLSLD_MOVW_DTPREL_G2: Set a MOV[NZ] immediate field - // to bits FFFF 0000 0000 of DTPREL(S+A). - case AArch64::fixup_a64_movw_tprel_g2: - // R_AARCH64_TLSLE_MOVW_TPREL_G2: Set a MOV[NZ] immediate field to - // bits FFFF 0000 0000 of TPREL(S+A). - case AArch64::fixup_a64_movw_sabs_g2: { - // R_AARCH64_MOVW_SABS_G2: Sets MOV[NZ] immediate field using bits FFFF 0000 - // 0000 of S+A (see notes below); check -2^48 <= S+A < 2^48. (notes say that - // we should convert between MOVN and MOVZ to achieve our goals). - int64_t Signed = Value; - assert(Signed >= -(1LL << 48) && Signed < (1LL << 48) - && "Out of range move wide fixup"); - if (Signed >= 0) { - Value = ((Value >> 32) & 0xffff) << 5; - // Bit 30 converts the MOVN encoding into a MOVZ - Value |= 1 << 30; - } else { - Value = ((~Value >> 32) & 0xffff) << 5; + bool doesSectionRequireSymbols(const MCSection &Section) const override { + // Any section for which the linker breaks things into atoms needs to + // preserve symbols, including assembler local symbols, to identify + // those atoms. These sections are: + // Sections of type: + // + // S_CSTRING_LITERALS (e.g. __cstring) + // S_LITERAL_POINTERS (e.g. objc selector pointers) + // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS + // + // Sections named: + // + // __TEXT,__eh_frame + // __TEXT,__ustring + // __DATA,__cfstring + // __DATA,__objc_classrefs + // __DATA,__objc_catlist + // + // FIXME: It would be better if the compiler used actual linker local + // symbols for each of these sections rather than preserving what + // are ostensibly assembler local symbols. 
+ const MCSectionMachO &SMO = static_cast(Section); + return (SMO.getType() == MachO::S_CSTRING_LITERALS || + SMO.getType() == MachO::S_4BYTE_LITERALS || + SMO.getType() == MachO::S_8BYTE_LITERALS || + SMO.getType() == MachO::S_16BYTE_LITERALS || + SMO.getType() == MachO::S_LITERAL_POINTERS || + (SMO.getSegmentName() == "__TEXT" && + (SMO.getSectionName() == "__eh_frame" || + SMO.getSectionName() == "__ustring")) || + (SMO.getSegmentName() == "__DATA" && + (SMO.getSectionName() == "__cfstring" || + SMO.getSectionName() == "__objc_classrefs" || + SMO.getSectionName() == "__objc_catlist"))); + } + + /// \brief Generate the compact unwind encoding from the CFI directives. + uint32_t generateCompactUnwindEncoding( + ArrayRef Instrs) const override { + if (Instrs.empty()) + return CU::UNWIND_AArch64_MODE_FRAMELESS; + + bool HasFP = false; + unsigned StackSize = 0; + + uint32_t CompactUnwindEncoding = 0; + for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Cannot handle this directive: bail out. + return CU::UNWIND_AArch64_MODE_DWARF; + case MCCFIInstruction::OpDefCfa: { + // Defines a frame pointer. + assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == + AArch64::FP && + "Invalid frame pointer!"); + assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); + + const MCCFIInstruction &LRPush = Instrs[++i]; + assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && + "Link register not pushed!"); + const MCCFIInstruction &FPPush = Instrs[++i]; + assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && + "Frame pointer not pushed!"); + + unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + + LRReg = getXRegFromWReg(LRReg); + FPReg = getXRegFromWReg(FPReg); + + assert(LRReg == AArch64::LR && FPReg == AArch64::FP && + "Pushing invalid registers for frame!"); + + // Indicate that the function has a frame. + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + HasFP = true; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + assert(StackSize == 0 && "We already have the CFA offset!"); + StackSize = std::abs(Inst.getOffset()); + break; + } + case MCCFIInstruction::OpOffset: { + // Registers are saved in pairs. We expect there to be two consecutive + // `.cfi_offset' instructions with the appropriate registers specified. + unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (i + 1 == e) + return CU::UNWIND_AArch64_MODE_DWARF; + + const MCCFIInstruction &Inst2 = Instrs[++i]; + if (Inst2.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_AArch64_MODE_DWARF; + unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + + // N.B. The encodings must be in register number order, and the X + // registers before the D registers. 
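The pair handling above leans on the flag values declared in the CU enum earlier in this file, and the masks it checks just below (0xF1E, 0xF1C, and so on) are what enforce the rule stated in the comment: a pair can only be recorded while no pair with a higher flag value is present, which forces register-number order with all X pairs before any D pair. A reduced standalone sketch of that ordering check, reusing the same flag values (not the patch's code):

    #include <cassert>
    #include <cstdint>

    // Pair flags from the CU enum above: X pairs in the low bits, D pairs above.
    enum : uint32_t {
      X19_X20 = 0x001, X21_X22 = 0x002, X23_X24 = 0x004, X25_X26 = 0x008,
      X27_X28 = 0x010, D8_D9 = 0x100, D10_D11 = 0x200, D12_D13 = 0x400,
      D14_D15 = 0x800
    };

    // Record a saved pair only if every pair already recorded has a smaller
    // flag value; this reproduces what the 0xF1E/0xF1C/... masks check.
    bool tryAddPair(uint32_t &Encoding, uint32_t PairFlag) {
      uint32_t HigherPairs = ~((PairFlag << 1) - 1) & 0xF1F; // flags above this one
      if (Encoding & HigherPairs)
        return false;            // out of order: caller falls back to DWARF
      Encoding |= PairFlag;
      return true;
    }

    int main() {
      uint32_t Enc = 0;
      assert(tryAddPair(Enc, X19_X20));  // ascending order: accepted
      assert(tryAddPair(Enc, X21_X22));  // still ascending: accepted
      assert(tryAddPair(Enc, D8_D9));    // X pairs before D pairs: accepted
      assert(!tryAddPair(Enc, X23_X24)); // X pair after a D pair: rejected
      assert(Enc == (X19_X20 | X21_X22 | D8_D9));
      return 0;
    }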
+ + // X19/X20 pair = 0x00000001, + // X21/X22 pair = 0x00000002, + // X23/X24 pair = 0x00000004, + // X25/X26 pair = 0x00000008, + // X27/X28 pair = 0x00000010 + Reg1 = getXRegFromWReg(Reg1); + Reg2 = getXRegFromWReg(Reg2); + + if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && + (CompactUnwindEncoding & 0xF1E) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && + (CompactUnwindEncoding & 0xF1C) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && + (CompactUnwindEncoding & 0xF18) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && + (CompactUnwindEncoding & 0xF10) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && + (CompactUnwindEncoding & 0xF00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + else { + Reg1 = getDRegFromBReg(Reg1); + Reg2 = getDRegFromBReg(Reg2); + + // D8/D9 pair = 0x00000100, + // D10/D11 pair = 0x00000200, + // D12/D13 pair = 0x00000400, + // D14/D15 pair = 0x00000800 + if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && + (CompactUnwindEncoding & 0xE00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && + (CompactUnwindEncoding & 0xC00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && + (CompactUnwindEncoding & 0x800) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + else + // A pair was pushed which we cannot handle. + return CU::UNWIND_AArch64_MODE_DWARF; + } + + break; + } + } } - return Value; + + if (!HasFP) { + // With compact unwind info we can only represent stack adjustments of up + // to 65520 bytes. + if (StackSize > 65520) + return CU::UNWIND_AArch64_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= encodeStackAdjustment(StackSize); + } + + return CompactUnwindEncoding; } +}; - case AArch64::fixup_a64_tstbr: - // R_AARCH64_TSTBR14: Sets the immediate field of a TBZ/TBNZ instruction to - // bits FFFC of S+A-P, checking -2^15 <= S+A-P < 2^15. - assert((int64_t)Value >= -(1LL << 15) && - (int64_t)Value < (1LL << 15) && "Out of range TBZ/TBNZ fixup"); - return (Value & 0xfffc) << (5 - 2); - - case AArch64::fixup_a64_condbr: - // R_AARCH64_CONDBR19: Sets the immediate field of a conditional branch - // instruction to bits 1FFFFC of S+A-P, checking -2^20 <= S+A-P < 2^20. - assert((int64_t)Value >= -(1LL << 20) && - (int64_t)Value < (1LL << 20) && "Out of range B.cond fixup"); - return (Value & 0x1ffffc) << (5 - 2); - - case AArch64::fixup_a64_uncondbr: - // R_AARCH64_JUMP26 same as below (except to a linker, possibly). - case AArch64::fixup_a64_call: - // R_AARCH64_CALL26: Sets a CALL immediate field to bits FFFFFFC of S+A-P, - // checking that -2^27 <= S+A-P < 2^27. 
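For the no-frame-pointer path above, the only thing compact unwind can describe is a fixed stack adjustment: encodeStackAdjustment stores StackSize / 16 in a 12-bit field at bit 12, which is exactly where the 65520-byte ceiling comes from (4095 * 16), and anything larger falls back to the DWARF mode. A short standalone sketch of that final encoding step, using the mode values from the CU enum above (not the patch's code):

    #include <cassert>
    #include <cstdint>

    // Mode values from the CU enum above.
    const uint32_t MODE_FRAMELESS = 0x02000000;
    const uint32_t MODE_DWARF     = 0x03000000;

    // Frameless functions encode StackSize / 16 in a 12-bit field at bit 12,
    // so the largest representable adjustment is 4095 * 16 = 65520 bytes.
    uint32_t encodeFrameless(uint32_t StackSize) {
      assert(StackSize % 16 == 0 && "AArch64 stack adjustments are 16-byte aligned");
      if (StackSize > 65520)
        return MODE_DWARF;                 // too large: punt to DWARF unwind info
      return MODE_FRAMELESS | ((StackSize / 16) << 12);
    }

    int main() {
      assert(encodeFrameless(0)     == 0x02000000);
      assert(encodeFrameless(64)    == (0x02000000u | (4u << 12)));
      assert(encodeFrameless(65520) == (0x02000000u | (4095u << 12)));
      assert(encodeFrameless(65536) == 0x03000000);
      return 0;
    }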
- assert((int64_t)Value >= -(1LL << 27) && - (int64_t)Value < (1LL << 27) && "Out of range branch fixup"); - return (Value & 0xffffffc) >> 2; - - case AArch64::fixup_a64_adr_gottprel_page: - // R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: Set an ADRP immediate field to bits - // 1FFFFF000 of Page(G(TPREL(S+A))) - Page(P); check -2^32 <= X < 2^32. - case AArch64::fixup_a64_tlsdesc_adr_page: - // R_AARCH64_TLSDESC_ADR_PAGE: Set an ADRP immediate field to bits 1FFFFF000 - // of Page(G(TLSDESC(S+A))) - Page(P); check -2^32 <= X < 2^32. - case AArch64::fixup_a64_adr_prel_got_page: - // R_AARCH64_ADR_GOT_PAGE: Sets the immediate value of an ADRP to bits - // 1FFFFF000 of the operation, checking that -2^32 < Page(G(S))-Page(GOT) < - // 2^32. - assert((int64_t)Value >= -(1LL << 32) && - (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup"); - return ADRImmBits((Value & 0x1fffff000ULL) >> 12); - - case AArch64::fixup_a64_ld64_gottprel_lo12_nc: - // R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: Set an LD offset field to bits FF8 - // of X, with no overflow check. Check that X & 7 == 0. - case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc: - // R_AARCH64_TLSDESC_LD64_LO12_NC: Set an LD offset field to bits FF8 of - // G(TLSDESC(S+A)), with no overflow check. Check that X & 7 == 0. - case AArch64::fixup_a64_ld64_got_lo12_nc: - // R_AARCH64_LD64_GOT_LO12_NC: Sets the LD/ST immediate field to bits FF8 of - // G(S) with no overflow check. Check X & 7 == 0 - assert(((int64_t)Value & 7) == 0 && "Misaligned fixup"); - return (Value & 0xff8) << 7; - - case AArch64::fixup_a64_tlsdesc_call: - // R_AARCH64_TLSDESC_CALL: For relaxation only. - return 0; +} // end anonymous namespace + +namespace { + +class ELFAArch64AsmBackend : public AArch64AsmBackend { +public: + uint8_t OSABI; + bool IsLittleEndian; + + ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) + : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); } + + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; +}; + +void ELFAArch64AsmBackend::processFixupValue( + const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, + const MCFragment *DF, const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! 
+ if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) + IsResolved = false; +} + +void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + // store fixups in .eh_frame section in big endian order + if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { + const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); + const MCSectionELF *SecELF = static_cast(Sec); + if (SecELF->getSectionName() == ".eh_frame") + Value = ByteSwap_32(unsigned(Value)); + } + AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); +} } -MCAsmBackend * -llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { +MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { Triple TheTriple(TT); - return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ true); + + if (TheTriple.isOSDarwin()) + return new DarwinAArch64AsmBackend(T, MRI); + + assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); } -MCAsmBackend * -llvm::createAArch64beAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { +MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { Triple TheTriple(TT); - return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ false); + + assert(TheTriple.isOSBinFormatELF() && + "Big endian is only supported for ELF targets!"); + return new ELFAArch64AsmBackend(T, TheTriple.getOS(), + /*IsLittleEndian=*/false); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index a5fe9141e655..e05191eaf3e0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCValue.h" @@ -35,257 +36,222 @@ class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { }; } -AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian) - : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, - /*HasRelocationAddend*/ true) -{} +AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, + bool IsLittleEndian) + : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, + /*HasRelocationAddend*/ true) {} -AArch64ELFObjectWriter::~AArch64ELFObjectWriter() -{} +AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - unsigned Type; + const MCFixup &Fixup, + bool IsPCRel) const { + AArch64MCExpr::VariantKind RefKind = + static_cast(Target.getRefKind()); + AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); + bool IsNC = AArch64MCExpr::isNotChecked(RefKind); + + assert((!Target.getSymA() || + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + assert((!Target.getSymB() || + Target.getSymB()->getKind() == 
MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { - default: - llvm_unreachable("Unimplemented fixup -> relocation"); - case FK_Data_8: - return ELF::R_AARCH64_PREL64; - case FK_Data_4: - return ELF::R_AARCH64_PREL32; case FK_Data_2: return ELF::R_AARCH64_PREL16; - case AArch64::fixup_a64_ld_prel: - Type = ELF::R_AARCH64_LD_PREL_LO19; - break; - case AArch64::fixup_a64_adr_prel: - Type = ELF::R_AARCH64_ADR_PREL_LO21; - break; - case AArch64::fixup_a64_adr_prel_page: - Type = ELF::R_AARCH64_ADR_PREL_PG_HI21; - break; - case AArch64::fixup_a64_adr_prel_got_page: - Type = ELF::R_AARCH64_ADR_GOT_PAGE; - break; - case AArch64::fixup_a64_tstbr: - Type = ELF::R_AARCH64_TSTBR14; - break; - case AArch64::fixup_a64_condbr: - Type = ELF::R_AARCH64_CONDBR19; - break; - case AArch64::fixup_a64_uncondbr: - Type = ELF::R_AARCH64_JUMP26; - break; - case AArch64::fixup_a64_call: - Type = ELF::R_AARCH64_CALL26; - break; - case AArch64::fixup_a64_adr_gottprel_page: - Type = ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; - break; - case AArch64::fixup_a64_ld_gottprel_prel19: - Type = ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; - break; - case AArch64::fixup_a64_tlsdesc_adr_page: - Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE; - break; + case FK_Data_4: + return ELF::R_AARCH64_PREL32; + case FK_Data_8: + return ELF::R_AARCH64_PREL64; + case AArch64::fixup_aarch64_pcrel_adr_imm21: + assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation"); + return ELF::R_AARCH64_ADR_PREL_LO21; + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC) + return ELF::R_AARCH64_ADR_PREL_PG_HI21; + if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) + return ELF::R_AARCH64_ADR_GOT_PAGE; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) + return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) + return ELF::R_AARCH64_TLSDESC_ADR_PAGE; + llvm_unreachable("invalid symbol kind for ADRP relocation"); + case AArch64::fixup_aarch64_pcrel_branch26: + return ELF::R_AARCH64_JUMP26; + case AArch64::fixup_aarch64_pcrel_call26: + return ELF::R_AARCH64_CALL26; + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + if (SymLoc == AArch64MCExpr::VK_GOTTPREL) + return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; + return ELF::R_AARCH64_LD_PREL_LO19; + case AArch64::fixup_aarch64_pcrel_branch14: + return ELF::R_AARCH64_TSTBR14; + case AArch64::fixup_aarch64_pcrel_branch19: + return ELF::R_AARCH64_CONDBR19; + default: + llvm_unreachable("Unsupported pc-relative fixup kind"); } } else { switch ((unsigned)Fixup.getKind()) { - default: - llvm_unreachable("Unimplemented fixup -> relocation"); - case FK_Data_8: - return ELF::R_AARCH64_ABS64; - case FK_Data_4: - return ELF::R_AARCH64_ABS32; case FK_Data_2: return ELF::R_AARCH64_ABS16; - case AArch64::fixup_a64_add_lo12: - Type = ELF::R_AARCH64_ADD_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ld64_got_lo12_nc: - Type = ELF::R_AARCH64_LD64_GOT_LO12_NC; - break; - case AArch64::fixup_a64_ldst8_lo12: - Type = ELF::R_AARCH64_LDST8_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst16_lo12: - Type = ELF::R_AARCH64_LDST16_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst32_lo12: - Type = ELF::R_AARCH64_LDST32_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst64_lo12: - Type = ELF::R_AARCH64_LDST64_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst128_lo12: - Type = ELF::R_AARCH64_LDST128_ABS_LO12_NC; - break; - case AArch64::fixup_a64_movw_uabs_g0: 
- Type = ELF::R_AARCH64_MOVW_UABS_G0; - break; - case AArch64::fixup_a64_movw_uabs_g0_nc: - Type = ELF::R_AARCH64_MOVW_UABS_G0_NC; - break; - case AArch64::fixup_a64_movw_uabs_g1: - Type = ELF::R_AARCH64_MOVW_UABS_G1; - break; - case AArch64::fixup_a64_movw_uabs_g1_nc: - Type = ELF::R_AARCH64_MOVW_UABS_G1_NC; - break; - case AArch64::fixup_a64_movw_uabs_g2: - Type = ELF::R_AARCH64_MOVW_UABS_G2; - break; - case AArch64::fixup_a64_movw_uabs_g2_nc: - Type = ELF::R_AARCH64_MOVW_UABS_G2_NC; - break; - case AArch64::fixup_a64_movw_uabs_g3: - Type = ELF::R_AARCH64_MOVW_UABS_G3; - break; - case AArch64::fixup_a64_movw_sabs_g0: - Type = ELF::R_AARCH64_MOVW_SABS_G0; - break; - case AArch64::fixup_a64_movw_sabs_g1: - Type = ELF::R_AARCH64_MOVW_SABS_G1; - break; - case AArch64::fixup_a64_movw_sabs_g2: - Type = ELF::R_AARCH64_MOVW_SABS_G2; - break; + case FK_Data_4: + return ELF::R_AARCH64_ABS32; + case FK_Data_8: + return ELF::R_AARCH64_ABS64; + case AArch64::fixup_aarch64_add_imm12: + if (RefKind == AArch64MCExpr::VK_DTPREL_HI12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; + if (RefKind == AArch64MCExpr::VK_TPREL_HI12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) + return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_ADD_ABS_LO12_NC; - // TLS Local-dynamic block - case AArch64::fixup_a64_movw_dtprel_g2: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; - break; - case AArch64::fixup_a64_movw_dtprel_g1: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; - break; - case AArch64::fixup_a64_movw_dtprel_g1_nc: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; - break; - case AArch64::fixup_a64_movw_dtprel_g0: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; - break; - case AArch64::fixup_a64_movw_dtprel_g0_nc: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; - break; - case AArch64::fixup_a64_add_dtprel_hi12: - Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; - break; - case AArch64::fixup_a64_add_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; - break; - case AArch64::fixup_a64_add_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst8_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst8_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst16_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst16_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst32_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst32_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst64_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst64_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; - break; + report_fatal_error("invalid fixup for add (uimm12) instruction"); + return 0; + case 
AArch64::fixup_aarch64_ldst_imm12_scale1: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST8_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - // TLS initial-exec block - case AArch64::fixup_a64_movw_gottprel_g1: - Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; - break; - case AArch64::fixup_a64_movw_gottprel_g0_nc: - Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - break; - case AArch64::fixup_a64_ld64_gottprel_lo12_nc: - Type = ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - break; + report_fatal_error("invalid fixup for 8-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale2: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST16_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - // TLS local-exec block - case AArch64::fixup_a64_movw_tprel_g2: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; - break; - case AArch64::fixup_a64_movw_tprel_g1: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; - break; - case AArch64::fixup_a64_movw_tprel_g1_nc: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; - break; - case AArch64::fixup_a64_movw_tprel_g0: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; - break; - case AArch64::fixup_a64_movw_tprel_g0_nc: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; - break; - case AArch64::fixup_a64_add_tprel_hi12: - Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; - break; - case AArch64::fixup_a64_add_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; - break; - case AArch64::fixup_a64_add_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst8_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst8_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst16_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst16_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst32_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst32_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst64_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst64_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; - break; + report_fatal_error("invalid fixup for 16-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST32_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return 
ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - // TLS general-dynamic block - case AArch64::fixup_a64_tlsdesc_adr_page: - Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE; - break; - case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc: - Type = ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - break; - case AArch64::fixup_a64_tlsdesc_add_lo12_nc: - Type = ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; - break; - case AArch64::fixup_a64_tlsdesc_call: - Type = ELF::R_AARCH64_TLSDESC_CALL; - break; + report_fatal_error("invalid fixup for 32-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST64_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) + return ELF::R_AARCH64_LD64_GOT_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) + return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) + return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; + + report_fatal_error("invalid fixup for 64-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST128_ABS_LO12_NC; + + report_fatal_error("invalid fixup for 128-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_movw: + if (RefKind == AArch64MCExpr::VK_ABS_G3) + return ELF::R_AARCH64_MOVW_UABS_G3; + if (RefKind == AArch64MCExpr::VK_ABS_G2) + return ELF::R_AARCH64_MOVW_UABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_S) + return ELF::R_AARCH64_MOVW_SABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_NC) + return ELF::R_AARCH64_MOVW_UABS_G2_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G1) + return ELF::R_AARCH64_MOVW_UABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_S) + return ELF::R_AARCH64_MOVW_SABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_NC) + return ELF::R_AARCH64_MOVW_UABS_G1_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G0) + return ELF::R_AARCH64_MOVW_UABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_S) + return ELF::R_AARCH64_MOVW_SABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) + return ELF::R_AARCH64_MOVW_UABS_G0_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G2) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G2) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; + if (RefKind == AArch64MCExpr::VK_TPREL_G1) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; + if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; + if (RefKind == 
AArch64MCExpr::VK_TPREL_G0) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; + if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; + report_fatal_error("invalid fixup for movz/movk instruction"); + return 0; + case AArch64::fixup_aarch64_tlsdesc_call: + return ELF::R_AARCH64_TLSDESC_CALL; + default: + llvm_unreachable("Unknown ELF relocation type"); } } - return Type; + llvm_unreachable("Unimplemented fixup -> relocation"); } MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, - bool IsLittleEndian) { - MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI, IsLittleEndian); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); + uint8_t OSABI, + bool IsLittleEndian) { + MCELFObjectTargetWriter *MOTW = + new AArch64ELFObjectWriter(OSABI, IsLittleEndian); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index df2cb3837580..a79406d9d1fe 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -56,7 +56,7 @@ namespace { class AArch64ELFStreamer : public MCELFStreamer { public: AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter) + MCCodeEmitter *Emitter) : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), LastEMS(EMS_None) {} @@ -76,7 +76,7 @@ class AArch64ELFStreamer : public MCELFStreamer { /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst& Inst, + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); @@ -96,7 +96,7 @@ class AArch64ELFStreamer : public MCELFStreamer { void EmitValueImpl(const MCExpr *Value, unsigned Size, const SMLoc &Loc) override { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size, Loc); + MCELFStreamer::EmitValueImpl(Value, Size); } private: @@ -107,13 +107,15 @@ class AArch64ELFStreamer : public MCELFStreamer { }; void EmitDataMappingSymbol() { - if (LastEMS == EMS_Data) return; + if (LastEMS == EMS_Data) + return; EmitMappingSymbol("$d"); LastEMS = EMS_Data; } void EmitA64MappingSymbol() { - if (LastEMS == EMS_A64) return; + if (LastEMS == EMS_A64) + return; EmitMappingSymbol("$x"); LastEMS = EMS_A64; } @@ -122,15 +124,14 @@ class AArch64ELFStreamer : public MCELFStreamer { MCSymbol *Start = getContext().CreateTempSymbol(); EmitLabel(Start); - MCSymbol *Symbol = - getContext().GetOrCreateSymbol(Name + "." + - Twine(MappingSymbolCounter++)); + MCSymbol *Symbol = getContext().GetOrCreateSymbol( + Name + "." 
+ Twine(MappingSymbolCounter++)); MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - AssignSection(Symbol, getCurrentSection().first); + Symbol->setSection(*getCurrentSection().first); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); @@ -146,16 +147,14 @@ class AArch64ELFStreamer : public MCELFStreamer { } namespace llvm { - MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { - AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); - if (RelaxAll) - S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); - return S; - } +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) { + AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + if (NoExecStack) + S->getAssembler().setNoExecStack(true); + return S; +} } - - diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h index 5a89ca50cee8..bc6973bd5f8b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -18,10 +18,9 @@ namespace llvm { - MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack); } #endif // AArch64_ELF_STREAMER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index eeb122d38494..bf405fbac77b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -1,4 +1,4 @@ -//=- AArch64/AArch64FixupKinds.h - AArch64 Specific Fixup Entries -*- C++ -*-=// +//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,108 +6,71 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the LLVM fixups applied to MCInsts in the AArch64 -// backend. 
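The streamer hunk above only emits a new "$x" or "$d" mapping symbol when the kind of material being emitted changes, tracked in LastEMS. A minimal stand-alone sketch of that bookkeeping (plain C++ with printf standing in for real MCSymbol creation; the names are illustrative, not the MC API):

#include <cstdio>

enum ElfMappingSymbol { EMS_None, EMS_A64, EMS_Data };

struct MappingTracker {
  ElfMappingSymbol LastEMS = EMS_None;
  int Counter = 0;

  void emitMappingSymbol(const char *Name) {
    std::printf("%s.%d:\n", Name, Counter++);   // e.g. "$x.0:", "$d.1:"
  }
  void beforeInstruction() {                    // models EmitA64MappingSymbol()
    if (LastEMS == EMS_A64) return;
    emitMappingSymbol("$x");
    LastEMS = EMS_A64;
  }
  void beforeData() {                           // models EmitDataMappingSymbol()
    if (LastEMS == EMS_Data) return;
    emitMappingSymbol("$d");
    LastEMS = EMS_Data;
  }
};

int main() {
  MappingTracker T;
  T.beforeInstruction();  // $x.0
  T.beforeInstruction();  // still code, no new symbol
  T.beforeData();         // $d.1
  T.beforeInstruction();  // $x.2
}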
-// -//===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64_AARCH64FIXUPKINDS_H -#define LLVM_AARCH64_AARCH64FIXUPKINDS_H +#ifndef LLVM_AArch64FIXUPKINDS_H +#define LLVM_AArch64FIXUPKINDS_H #include "llvm/MC/MCFixup.h" namespace llvm { - namespace AArch64 { - enum Fixups { - fixup_a64_ld_prel = FirstTargetFixupKind, - fixup_a64_adr_prel, - fixup_a64_adr_prel_page, - - fixup_a64_add_lo12, - - fixup_a64_ldst8_lo12, - fixup_a64_ldst16_lo12, - fixup_a64_ldst32_lo12, - fixup_a64_ldst64_lo12, - fixup_a64_ldst128_lo12, - - fixup_a64_tstbr, - fixup_a64_condbr, - fixup_a64_uncondbr, - fixup_a64_call, - - fixup_a64_movw_uabs_g0, - fixup_a64_movw_uabs_g0_nc, - fixup_a64_movw_uabs_g1, - fixup_a64_movw_uabs_g1_nc, - fixup_a64_movw_uabs_g2, - fixup_a64_movw_uabs_g2_nc, - fixup_a64_movw_uabs_g3, - - fixup_a64_movw_sabs_g0, - fixup_a64_movw_sabs_g1, - fixup_a64_movw_sabs_g2, - - fixup_a64_adr_prel_got_page, - fixup_a64_ld64_got_lo12_nc, - - // Produce offsets relative to the module's dynamic TLS area. - fixup_a64_movw_dtprel_g2, - fixup_a64_movw_dtprel_g1, - fixup_a64_movw_dtprel_g1_nc, - fixup_a64_movw_dtprel_g0, - fixup_a64_movw_dtprel_g0_nc, - fixup_a64_add_dtprel_hi12, - fixup_a64_add_dtprel_lo12, - fixup_a64_add_dtprel_lo12_nc, - fixup_a64_ldst8_dtprel_lo12, - fixup_a64_ldst8_dtprel_lo12_nc, - fixup_a64_ldst16_dtprel_lo12, - fixup_a64_ldst16_dtprel_lo12_nc, - fixup_a64_ldst32_dtprel_lo12, - fixup_a64_ldst32_dtprel_lo12_nc, - fixup_a64_ldst64_dtprel_lo12, - fixup_a64_ldst64_dtprel_lo12_nc, - - // Produce the GOT entry containing a variable's address in TLS's - // initial-exec mode. - fixup_a64_movw_gottprel_g1, - fixup_a64_movw_gottprel_g0_nc, - fixup_a64_adr_gottprel_page, - fixup_a64_ld64_gottprel_lo12_nc, - fixup_a64_ld_gottprel_prel19, - - // Produce offsets relative to the thread pointer: TPIDR_EL0. - fixup_a64_movw_tprel_g2, - fixup_a64_movw_tprel_g1, - fixup_a64_movw_tprel_g1_nc, - fixup_a64_movw_tprel_g0, - fixup_a64_movw_tprel_g0_nc, - fixup_a64_add_tprel_hi12, - fixup_a64_add_tprel_lo12, - fixup_a64_add_tprel_lo12_nc, - fixup_a64_ldst8_tprel_lo12, - fixup_a64_ldst8_tprel_lo12_nc, - fixup_a64_ldst16_tprel_lo12, - fixup_a64_ldst16_tprel_lo12_nc, - fixup_a64_ldst32_tprel_lo12, - fixup_a64_ldst32_tprel_lo12_nc, - fixup_a64_ldst64_tprel_lo12, - fixup_a64_ldst64_tprel_lo12_nc, - - // Produce the special fixups used by the general-dynamic TLS model. - fixup_a64_tlsdesc_adr_page, - fixup_a64_tlsdesc_ld64_lo12_nc, - fixup_a64_tlsdesc_add_lo12_nc, - fixup_a64_tlsdesc_call, - - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind - }; - } -} +namespace AArch64 { + +enum Fixups { + // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into + // an ADR instruction. + fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, + + // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into + // an ADRP instruction. + fixup_aarch64_pcrel_adrp_imm21, + + // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. + // No alignment adjustment. All value bits are encoded. + fixup_aarch64_add_imm12, + + // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and + // store instructions. + fixup_aarch64_ldst_imm12_scale1, + fixup_aarch64_ldst_imm12_scale2, + fixup_aarch64_ldst_imm12_scale4, + fixup_aarch64_ldst_imm12_scale8, + fixup_aarch64_ldst_imm12_scale16, + + // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative + // immediate. 
Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by + // pc-relative loads and generates relocations directly when necessary. + fixup_aarch64_ldr_pcrel_imm19, + + // FIXME: comment + fixup_aarch64_movw, + + // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch14, + + // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by + // b.cc and generates relocations directly when necessary. + fixup_aarch64_pcrel_branch19, + + // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch26, + + // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative + // immediate. Distinguished from branch26 only on ELF. + fixup_aarch64_pcrel_call26, + + // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF + // R_AARCH64_TLSDESC_CALL relocation. + fixup_aarch64_tlsdesc_call, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; + +} // end namespace AArch64 +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index b090a55eb99a..1763b40e2d48 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -13,26 +13,82 @@ #include "AArch64MCAsmInfo.h" #include "llvm/ADT/Triple.h" - +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; -AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::aarch64_be) +enum AsmWriterVariantTy { + Default = -1, + Generic = 0, + Apple = 1 +}; + +static cl::opt AsmWriterVariant( + "aarch64-neon-syntax", cl::init(Default), + cl::desc("Choose style of NEON code to emit from AArch64 backend:"), + cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), + clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), + clEnumValEnd)); + +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; + + PrivateGlobalPrefix = "L"; + SeparatorString = "%%"; + CommentString = ";"; + PointerSize = CalleeSaveStackSlotSize = 8; + + AlignmentIsInBytes = false; + UsesELFSectionDirectiveForBSS = true; + SupportsDebugInformation = true; + UseDataRegionDirectives = true; + + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( + const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. 
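Taken together, the AArch64ELFObjectWriter and AArch64FixupKinds.h hunks above replace the old one-fixup-per-relocation scheme with a small set of kinds keyed by instruction format; the symbol modifier on the operand expression and its "no overflow check" flag then pick the ELF relocation. A stand-alone sketch of that two-level lookup for the 64-bit load/store kind (local enums stand in for the AArch64MCExpr kinds and for the values in llvm/Support/ELF.h; the DTPREL/TPREL cases are omitted for brevity):

#include <cassert>
#include <stdexcept>

enum SymLoc { VK_ABS, VK_GOT, VK_DTPREL, VK_TPREL, VK_GOTTPREL, VK_TLSDESC };
enum Reloc {
  R_AARCH64_LDST64_ABS_LO12_NC,
  R_AARCH64_LD64_GOT_LO12_NC,
  R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC,
  R_AARCH64_TLSDESC_LD64_LO12_NC
};

// One fixup kind now covers every :lo12:-style operand of a 64-bit load/store;
// the modifier and the no-check flag select the relocation, mirroring the
// relocation-selection switch in the AArch64ELFObjectWriter hunk above.
Reloc relocForLdst64(SymLoc Loc, bool IsNC) {
  if (Loc == VK_ABS && IsNC)      return R_AARCH64_LDST64_ABS_LO12_NC;
  if (Loc == VK_GOT && IsNC)      return R_AARCH64_LD64_GOT_LO12_NC;
  if (Loc == VK_GOTTPREL && IsNC) return R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
  if (Loc == VK_TLSDESC && IsNC)  return R_AARCH64_TLSDESC_LD64_LO12_NC;
  throw std::runtime_error("invalid fixup for 64-bit load/store instruction");
}

int main() {
  assert(relocForLdst64(VK_ABS, true) == R_AARCH64_LDST64_ABS_LO12_NC);
  assert(relocForLdst64(VK_GOT, true) == R_AARCH64_LD64_GOT_LO12_NC);
}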
+ MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); + MCSymbol *PCSym = Context.CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); + return MCBinaryExpr::CreateSub(Res, PC, Context); +} + +AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { + Triple T(TT); + if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be) IsLittleEndian = false; + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + PointerSize = 8; // ".comm align is in bytes but .align is pow-2." AlignmentIsInBytes = false; CommentString = "//"; + PrivateGlobalPrefix = ".L"; Code32Directive = ".code\t32"; Data16bitsDirective = "\t.hword\t"; Data32bitsDirective = "\t.word\t"; Data64bitsDirective = "\t.xword\t"; + UseDataRegionDirectives = false; + + WeakRefDirective = "\t.weak\t"; + HasLEB128 = true; SupportsDebugInformation = true; @@ -40,7 +96,6 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) { ExceptionsType = ExceptionHandling::DwarfCFI; UseIntegratedAssembler = true; -} -// Pin the vtable to this file. -void AArch64ELFMCAsmInfo::anchor() {} + HasIdentDirective = true; +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 78fd5d5b4fe0..42a031d7c2ca 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -1,4 +1,4 @@ -//==-- AArch64MCAsmInfo.h - AArch64 asm properties -------------*- C++ -*--===// +//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,24 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64TARGETASMINFO_H -#define LLVM_AARCH64TARGETASMINFO_H +#ifndef AArch64TARGETASMINFO_H +#define AArch64TARGETASMINFO_H -#include "llvm/MC/MCAsmInfoELF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" namespace llvm { +class Target; +class StringRef; +class MCStreamer; +struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit AArch64MCAsmInfoDarwin(); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; -struct AArch64ELFMCAsmInfo : public MCAsmInfoELF { - explicit AArch64ELFMCAsmInfo(StringRef TT); -private: - void anchor() override; +struct AArch64MCAsmInfoELF : public MCAsmInfo { + explicit AArch64MCAsmInfoELF(StringRef TT); }; } // namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 7ff46d71df91..464a18cdbc04 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -1,4 +1,4 @@ -//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code =// +//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=// // // The LLVM Compiler Infrastructure // @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include 
"llvm/MC/MCContext.h" @@ -21,526 +21,562 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "mccodeemitter" +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + namespace { + class AArch64MCCodeEmitter : public MCCodeEmitter { - AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION; MCContext &Ctx; + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT public: - AArch64MCCodeEmitter(MCContext &ctx) : Ctx(ctx) {} + AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : Ctx(ctx) {} ~AArch64MCCodeEmitter() {} - unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate + /// attached to a load, store or prfm instruction. If operand requires a + /// relocation, record it and return zero in that part of the encoding. + template + uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label + /// target. + uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx, + /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and + /// the 2-bit shift field. + uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - template - unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return getOffsetUImm12OpValue(MI, OpIdx, Fixups, STI, MemSize); - } + /// getCondBranchTargetOpValue - Return the encoded value for a conditional + /// branch target. + uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI, - int MemSize) const; + /// getLoadLiteralOpValue - Return the encoded value for a load-literal + /// pc-relative address. 
+ uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store + /// instruction: bit 0 is whether a shift is present, bit 1 is whether the + /// operation is a sign extend (as opposed to a zero extend). + uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm8(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm16(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm32(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm64(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- + /// branch target. + uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getBranchTargetOpValue - Return the encoded value for an unconditional + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - // Labels are handled mostly the same way: a symbol is needed, and - // just gets some fixup attached. - template - unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getMoveWideImmOpValue - Return the encoded value for the immediate operand + /// of a MOVZ or MOVK instruction. + uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getVecShifterOpValue - Return the encoded value for the vector shifter. + uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + /// getMoveVecShifterOpValue - Return the encoded value for the vector move + /// shifter (MSL). + uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getFixedPointScaleOpValue - Return the encoded value for the + // FP-to-fixed-point scale factor. 
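The register-extend operand declared just above packs two flags into a two-bit field; the implementation later in this file returns (SignExtend << 1) | DoShift. A tiny self-contained illustration of that packing (the extend mnemonics in the comments are examples, not taken from the patch):

#include <cassert>
#include <cstdint>

// Models getMemExtendOpValue: bit 1 is "sign extend", bit 0 is "shift present".
uint32_t encodeMemExtend(bool SignExtend, bool DoShift) {
  return (static_cast<uint32_t>(SignExtend) << 1) |
         static_cast<uint32_t>(DoShift);
}

int main() {
  assert(encodeMemExtend(false, false) == 0); // e.g. uxtw, no shift
  assert(encodeMemExtend(false, true)  == 1); // e.g. uxtw #s
  assert(encodeMemExtend(true,  false) == 2); // e.g. sxtw, no shift
  assert(encodeMemExtend(true,  true)  == 3); // e.g. sxtw #s
}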
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + /// getSIMDShift64OpValue - Return the encoded value for the + // shift-by-immediate AdvSIMD instructions. + uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getAddressWithFixup(const MCOperand &MO, - unsigned FixupKind, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - + uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, + uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; - void EmitByte(unsigned char C, raw_ostream &OS) const { - OS << (char)C; - } + void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } - void EmitInstruction(uint32_t Val, raw_ostream &OS) const { + void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { // Output the constant in little endian byte order. 
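EmitConstant, whose body follows, writes the Size low-order bytes of Val least-significant first, which is how a 32-bit A64 instruction word lands in a little-endian object file. An equivalent stand-alone loop:

#include <cstdint>
#include <cstdio>

// Same shape as EmitConstant: emit 'Size' bytes of 'Val', low byte first.
void emitLittleEndian(uint64_t Val, unsigned Size) {
  for (unsigned i = 0; i != Size; ++i) {
    std::printf("%02x ", static_cast<unsigned>(Val & 0xff));
    Val >>= 8;
  }
  std::printf("\n");
}

int main() {
  // The A64 NOP encodes as 0xd503201f and is stored as: 1f 20 03 d5
  emitLittleEndian(0xd503201f, 4);
}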
- for (unsigned i = 0; i != 4; ++i) { - EmitByte(Val & 0xff, OS); + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, OS); Val >>= 8; } } - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; - template unsigned - fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const; + template unsigned + fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace -unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO, - unsigned FixupKind, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (!MO.isExpr()) { - // This can occur for manually decoded or constructed MCInsts, but neither - // the assembly-parser nor instruction selection will currently produce an - // MCInst that's not a symbol reference. - assert(MO.isImm() && "Unexpected address requested"); - return MO.getImm(); - } +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, STI, Ctx); +} - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = MCFixupKind(FixupKind); - Fixups.push_back(MCFixup::Create(0, Expr, Kind)); +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. 
+unsigned +AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); + else { + assert(MO.isImm() && "did not expect relocated expression"); + return static_cast(MO.getImm()); + } + assert(0 && "Unable to encode MCOperand!"); return 0; } -unsigned AArch64MCCodeEmitter:: -getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI, - int MemSize) const { - const MCOperand &ImmOp = MI.getOperand(OpIdx); - if (ImmOp.isImm()) - return ImmOp.getImm(); - - assert(ImmOp.isExpr() && "Unexpected operand type"); - const AArch64MCExpr *Expr = cast(ImmOp.getExpr()); - unsigned FixupKind; - - - switch (Expr->getKind()) { - default: llvm_unreachable("Unexpected operand modifier"); - case AArch64MCExpr::VK_AARCH64_LO12: { - static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, - AArch64::fixup_a64_ldst16_lo12, - AArch64::fixup_a64_ldst32_lo12, - AArch64::fixup_a64_ldst64_lo12, - AArch64::fixup_a64_ldst128_lo12 }; - assert(MemSize <= 16 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_GOT_LO12: - assert(MemSize == 8 && "Invalid fixup for operation"); - FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc; - break; - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: { - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_dtprel_lo12, - AArch64::fixup_a64_ldst16_dtprel_lo12, - AArch64::fixup_a64_ldst32_dtprel_lo12, - AArch64::fixup_a64_ldst64_dtprel_lo12 - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: { - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_dtprel_lo12_nc, - AArch64::fixup_a64_ldst16_dtprel_lo12_nc, - AArch64::fixup_a64_ldst32_dtprel_lo12_nc, - AArch64::fixup_a64_ldst64_dtprel_lo12_nc - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12: - assert(MemSize == 8 && "Invalid fixup for operation"); - FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc; - break; - case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{ - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_tprel_lo12, - AArch64::fixup_a64_ldst16_tprel_lo12, - AArch64::fixup_a64_ldst32_tprel_lo12, - AArch64::fixup_a64_ldst64_tprel_lo12 - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: { - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_tprel_lo12_nc, - AArch64::fixup_a64_ldst16_tprel_lo12_nc, - AArch64::fixup_a64_ldst32_tprel_lo12_nc, - AArch64::fixup_a64_ldst64_tprel_lo12_nc - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12: - assert(MemSize == 8 && "Invalid fixup for operation"); - FixupKind = AArch64::fixup_a64_tlsdesc_ld64_lo12_nc; - break; +template uint32_t +AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + uint32_t ImmVal = 0; + + if (MO.isImm()) + ImmVal = 
static_cast(MO.getImm()); + else { + assert(MO.isExpr() && "unable to encode load/store imm operand"); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + ++MCNumFixups; } - return getAddressWithFixup(ImmOp, FixupKind, Fixups, STI); + return ImmVal; } -unsigned -AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label +/// target. +uint32_t +AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. if (MO.isImm()) - return static_cast(MO.getImm()); + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + const MCExpr *Expr = MO.getExpr(); - assert(MO.isExpr()); - - unsigned FixupKind = 0; - switch(cast(MO.getExpr())->getKind()) { - default: llvm_unreachable("Invalid expression modifier"); - case AArch64MCExpr::VK_AARCH64_LO12: - FixupKind = AArch64::fixup_a64_add_lo12; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_HI12: - FixupKind = AArch64::fixup_a64_add_dtprel_hi12; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: - FixupKind = AArch64::fixup_a64_add_dtprel_lo12; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: - FixupKind = AArch64::fixup_a64_add_dtprel_lo12_nc; break; - case AArch64MCExpr::VK_AARCH64_TPREL_HI12: - FixupKind = AArch64::fixup_a64_add_tprel_hi12; break; - case AArch64MCExpr::VK_AARCH64_TPREL_LO12: - FixupKind = AArch64::fixup_a64_add_tprel_lo12; break; - case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: - FixupKind = AArch64::fixup_a64_add_tprel_lo12_nc; break; - case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12: - FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break; - } + MCFixupKind Kind = MI.getOpcode() == AArch64::ADR + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - return getAddressWithFixup(MO, FixupKind, Fixups, STI); -} + MCNumFixups += 1; -unsigned -AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { + // All of the information is in the fixup. + return 0; +} +/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and +/// the 2-bit shift field. The shift field is stored in bits 13-14 of the +/// return value. +uint32_t +AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // Suboperands are [imm, shifter]. const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL && + "unexpected shift type for add/sub immediate"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm()); + assert((ShiftVal == 0 || ShiftVal == 12) && + "unexpected shift value for add/sub immediate"); if (MO.isImm()) - return static_cast(MO.getImm()); - - assert(MO.isExpr()); + return MO.getImm() | (ShiftVal == 0 ? 
0 : (1 << 12)); + assert(MO.isExpr() && "Unable to encode MCOperand!"); + const MCExpr *Expr = MO.getExpr(); - unsigned Modifier = AArch64MCExpr::VK_AARCH64_None; - if (const AArch64MCExpr *Expr = dyn_cast(MO.getExpr())) - Modifier = Expr->getKind(); + // Encode the 12 bits of the fixup. + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - unsigned FixupKind = 0; - switch(Modifier) { - case AArch64MCExpr::VK_AARCH64_None: - FixupKind = AArch64::fixup_a64_adr_prel_page; - break; - case AArch64MCExpr::VK_AARCH64_GOT: - FixupKind = AArch64::fixup_a64_adr_prel_got_page; - break; - case AArch64MCExpr::VK_AARCH64_GOTTPREL: - FixupKind = AArch64::fixup_a64_adr_gottprel_page; - break; - case AArch64MCExpr::VK_AARCH64_TLSDESC: - FixupKind = AArch64::fixup_a64_tlsdesc_adr_page; - break; - default: - llvm_unreachable("Unknown symbol reference kind for ADRP instruction"); - } + ++MCNumFixups; - return getAddressWithFixup(MO, FixupKind, Fixups, STI); + return 0; } -unsigned -AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - +/// getCondBranchTargetOpValue - Return the encoded value for a conditional +/// branch target. +uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Only immediate expected for shift"); - return ((32 - MO.getImm()) & 0x1f) | (31 - MO.getImm()) << 6; -} + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); -unsigned -AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Only immediate expected for shift"); + ++MCNumFixups; - return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6; + // All of the information is in the fixup. + return 0; } -unsigned AArch64MCCodeEmitter::getShiftRightImm8( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 8 - MI.getOperand(Op).getImm(); -} +/// getLoadLiteralOpValue - Return the encoded value for a load-literal +/// pc-relative address. +uint32_t +AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); -unsigned AArch64MCCodeEmitter::getShiftRightImm16( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 16 - MI.getOperand(Op).getImm(); -} + // If the destination is an immediate, we have nothing to do. 
+ if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); -unsigned AArch64MCCodeEmitter::getShiftRightImm32( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 32 - MI.getOperand(Op).getImm(); -} + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); -unsigned AArch64MCCodeEmitter::getShiftRightImm64( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 64 - MI.getOperand(Op).getImm(); -} + ++MCNumFixups; -unsigned AArch64MCCodeEmitter::getShiftLeftImm8( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 8; + // All of the information is in the fixup. + return 0; } -unsigned AArch64MCCodeEmitter::getShiftLeftImm16( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 16; +uint32_t +AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + unsigned SignExtend = MI.getOperand(OpIdx).getImm(); + unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); + return (SignExtend << 1) | DoShift; } -unsigned AArch64MCCodeEmitter::getShiftLeftImm32( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 32; -} +uint32_t +AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); -unsigned AArch64MCCodeEmitter::getShiftLeftImm64( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 64; + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected movz/movk immediate"); + + Fixups.push_back(MCFixup::Create( + 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc())); + + ++MCNumFixups; + + return 0; } -template unsigned -AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI, - unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- +/// branch target. +uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); - if (MO.isExpr()) - return getAddressWithFixup(MO, fixupDesired, Fixups, STI); + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - assert(MO.isImm()); - return MO.getImm(); + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; } -unsigned -AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI, - unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +/// getBranchTargetOpValue - Return the encoded value for an unconditional +/// branch target. 
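Each branch form above attaches a different pc-relative fixup, and the unconditional encoder that follows additionally distinguishes BL from B so the ELF writer can emit R_AARCH64_CALL26 rather than R_AARCH64_JUMP26. A compact stand-alone model of that dispatch (the Branch categories are placeholders for the opcode checks in the real encoders):

#include <cassert>

enum class Branch { Conditional, TestBit, Unconditional, Call };
enum Fixup {
  fixup_pcrel_branch19,  // b.cc
  fixup_pcrel_branch14,  // tbz / tbnz
  fixup_pcrel_branch26,  // b
  fixup_pcrel_call26     // bl
};

Fixup fixupForBranch(Branch B) {
  switch (B) {
  case Branch::Conditional:   return fixup_pcrel_branch19;
  case Branch::TestBit:       return fixup_pcrel_branch14;
  case Branch::Unconditional: return fixup_pcrel_branch26;
  case Branch::Call:          return fixup_pcrel_call26;
  }
  return fixup_pcrel_branch26; // unreachable, keeps compilers quiet
}

int main() {
  assert(fixupForBranch(Branch::Call) == fixup_pcrel_call26);
  assert(fixupForBranch(Branch::Conditional) == fixup_pcrel_branch19);
}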
+uint32_t +AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); + // If the destination is an immediate, we have nothing to do. if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); - assert(MO.isExpr()); + MCFixupKind Kind = MI.getOpcode() == AArch64::BL + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - unsigned FixupKind; - if (isa(MO.getExpr())) { - assert(dyn_cast(MO.getExpr())->getKind() - == AArch64MCExpr::VK_AARCH64_GOTTPREL - && "Invalid symbol modifier for literal load"); - FixupKind = AArch64::fixup_a64_ld_gottprel_prel19; - } else { - FixupKind = AArch64::fixup_a64_ld_prel; - } + ++MCNumFixups; - return getAddressWithFixup(MO, FixupKind, Fixups, STI); + // All of the information is in the fixup. + return 0; } +/// getVecShifterOpValue - Return the encoded value for the vector shifter: +/// +/// 00 -> 0 +/// 01 -> 8 +/// 10 -> 16 +/// 11 -> 24 +uint32_t +AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); -unsigned -AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) { - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - } else if (MO.isImm()) { - return static_cast(MO.getImm()); + switch (MO.getImm()) { + default: + break; + case 0: + return 0; + case 8: + return 1; + case 16: + return 2; + case 24: + return 3; } - llvm_unreachable("Unable to encode MCOperand!"); + assert(false && "Invalid value for vector shift amount!"); return 0; } -unsigned -AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &UImm16MO = MI.getOperand(OpIdx); - const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1); +uint32_t +AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm()); +} - unsigned Result = static_cast(ShiftMO.getImm()) << 16; +uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm() | 32); +} - if (UImm16MO.isImm()) { - Result |= UImm16MO.getImm(); - return Result; - } +uint32_t +AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 32 - (MO.getImm() | 16); +} - const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); - AArch64::Fixups requestedFixup; - switch (A64E->getKind()) { - default: llvm_unreachable("unexpected expression modifier"); - case 
AArch64MCExpr::VK_AARCH64_ABS_G0: - requestedFixup = AArch64::fixup_a64_movw_uabs_g0; break; - case AArch64MCExpr::VK_AARCH64_ABS_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_uabs_g0_nc; break; - case AArch64MCExpr::VK_AARCH64_ABS_G1: - requestedFixup = AArch64::fixup_a64_movw_uabs_g1; break; - case AArch64MCExpr::VK_AARCH64_ABS_G1_NC: - requestedFixup = AArch64::fixup_a64_movw_uabs_g1_nc; break; - case AArch64MCExpr::VK_AARCH64_ABS_G2: - requestedFixup = AArch64::fixup_a64_movw_uabs_g2; break; - case AArch64MCExpr::VK_AARCH64_ABS_G2_NC: - requestedFixup = AArch64::fixup_a64_movw_uabs_g2_nc; break; - case AArch64MCExpr::VK_AARCH64_ABS_G3: - requestedFixup = AArch64::fixup_a64_movw_uabs_g3; break; - case AArch64MCExpr::VK_AARCH64_SABS_G0: - requestedFixup = AArch64::fixup_a64_movw_sabs_g0; break; - case AArch64MCExpr::VK_AARCH64_SABS_G1: - requestedFixup = AArch64::fixup_a64_movw_sabs_g1; break; - case AArch64MCExpr::VK_AARCH64_SABS_G2: - requestedFixup = AArch64::fixup_a64_movw_sabs_g2; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G2: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g2; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G1: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g1; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g1_nc; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G0: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g0; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g0_nc; break; - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: - requestedFixup = AArch64::fixup_a64_movw_gottprel_g1; break; - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_gottprel_g0_nc; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G2: - requestedFixup = AArch64::fixup_a64_movw_tprel_g2; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G1: - requestedFixup = AArch64::fixup_a64_movw_tprel_g1; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC: - requestedFixup = AArch64::fixup_a64_movw_tprel_g1_nc; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G0: - requestedFixup = AArch64::fixup_a64_movw_tprel_g0; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break; - } +uint32_t +AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 16 - (MO.getImm() | 8); +} - return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups, STI); +/// getFixedPointScaleOpValue - Return the encoded value for the +// FP-to-fixed-point scale factor. 
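The fixed-point scale encoder whose body follows stores the operand as 64 minus the requested number of fraction bits. A one-line stand-alone model with a couple of checks (the example mnemonic is illustrative):

#include <cassert>
#include <cstdint>

// Models getFixedPointScaleOpValue: #fbits becomes (64 - fbits).
uint32_t encodeFixedPointScale(uint32_t FBits) { return 64 - FBits; }

int main() {
  assert(encodeFixedPointScale(1)  == 63);  // e.g. fcvtzs x0, d0, #1
  assert(encodeFixedPointScale(32) == 32);
  assert(encodeFixedPointScale(64) == 0);
}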
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); } -template unsigned -AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, - unsigned EncodedValue, +uint32_t +AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { - if (!hasRs) EncodedValue |= 0x001F0000; - if (!hasRt2) EncodedValue |= 0x00007C00; + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} - return EncodedValue; +uint32_t +AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 32 - MO.getImm(); } -unsigned -AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const { +uint32_t +AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 16 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 8 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 64; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 32; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 16; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 8; +} + +/// getMoveVecShifterOpValue - Return the encoded value for the vector move +/// shifter (MSL). 
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && + "Expected an immediate value for the move shift amount!"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm()); + assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); + return ShiftVal == 8 ? 0 : 1; +} + +unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const { // If one of the signed fixup kinds is applied to a MOVZ instruction, the // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's // job to ensure that any bits possibly affected by this are 0. This means we @@ -553,23 +589,38 @@ AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); switch (A64E->getKind()) { - case AArch64MCExpr::VK_AARCH64_SABS_G0: - case AArch64MCExpr::VK_AARCH64_SABS_G1: - case AArch64MCExpr::VK_AARCH64_SABS_G2: - case AArch64MCExpr::VK_AARCH64_DTPREL_G2: - case AArch64MCExpr::VK_AARCH64_DTPREL_G1: - case AArch64MCExpr::VK_AARCH64_DTPREL_G0: - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G2: - case AArch64MCExpr::VK_AARCH64_TPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G0: + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: return EncodedValue & ~(1u << 30); default: // Nothing to do for an unsigned fixup. return EncodedValue; } - llvm_unreachable("Should have returned by now"); + + return EncodedValue & ~(1u << 30); +} + +void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MI.getOpcode() == AArch64::TLSDESCCALL) { + // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the + // following (BLR) instruction. It doesn't emit any code itself so it + // doesn't go through the normal TableGenerated channels. + MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); + Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); + return; + } + + uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); + EmitConstant(Binary, 4, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. } unsigned @@ -582,32 +633,22 @@ AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI, return EncodedValue; } -MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new AArch64MCCodeEmitter(Ctx); -} - -void AArch64MCCodeEmitter:: -EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MI.getOpcode() == AArch64::TLSDESCCALL) { - // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the - // following (BLR) instruction. It doesn't emit any code itself so it - // doesn't go through the normal TableGenerated channels. 
- MCFixupKind Fixup = MCFixupKind(AArch64::fixup_a64_tlsdesc_call); - const MCExpr *Expr; - Expr = AArch64MCExpr::CreateTLSDesc(MI.getOperand(0).getExpr(), Ctx); - Fixups.push_back(MCFixup::Create(0, Expr, Fixup)); - return; - } - - uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); +template unsigned +AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + if (!hasRs) EncodedValue |= 0x001F0000; + if (!hasRt2) EncodedValue |= 0x00007C00; - EmitInstruction(Binary, OS); + return EncodedValue; } +unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( + const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const { + // The Rm field of FCMP and friends is unused - it should be assembled + // as 0, but is ignored by the processor. + EncodedValue &= ~(0x1f << 16); + return EncodedValue; +} #include "AArch64GenMCCodeEmitter.inc" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 7aef9c57bf36..85c3ec7a55f1 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -16,71 +16,117 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" #include "llvm/Object/ELF.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -#define DEBUG_TYPE "aarch64mcexpr" +#define DEBUG_TYPE "aarch64symbolrefexpr" -const AArch64MCExpr* -AArch64MCExpr::Create(VariantKind Kind, const MCExpr *Expr, - MCContext &Ctx) { - return new (Ctx) AArch64MCExpr(Kind, Expr); +const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx) { + return new (Ctx) AArch64MCExpr(Expr, Kind); +} + +StringRef AArch64MCExpr::getVariantKindName() const { + switch (static_cast(getKind())) { + case VK_CALL: return ""; + case VK_LO12: return ":lo12:"; + case VK_ABS_G3: return ":abs_g3:"; + case VK_ABS_G2: return ":abs_g2:"; + case VK_ABS_G2_S: return ":abs_g2_s:"; + case VK_ABS_G2_NC: return ":abs_g2_nc:"; + case VK_ABS_G1: return ":abs_g1:"; + case VK_ABS_G1_S: return ":abs_g1_s:"; + case VK_ABS_G1_NC: return ":abs_g1_nc:"; + case VK_ABS_G0: return ":abs_g0:"; + case VK_ABS_G0_S: return ":abs_g0_s:"; + case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_DTPREL_G2: return ":dtprel_g2:"; + case VK_DTPREL_G1: return ":dtprel_g1:"; + case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; + case VK_DTPREL_G0: return ":dtprel_g0:"; + case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; + case VK_DTPREL_HI12: return ":dtprel_hi12:"; + case VK_DTPREL_LO12: return ":dtprel_lo12:"; + case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; + case VK_TPREL_G2: return ":tprel_g2:"; + case VK_TPREL_G1: return ":tprel_g1:"; + case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; + case VK_TPREL_G0: return ":tprel_g0:"; + case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; + case VK_TPREL_HI12: return ":tprel_hi12:"; + case VK_TPREL_LO12: return ":tprel_lo12:"; + case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; + case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; + case VK_ABS_PAGE: return ""; + case VK_GOT_PAGE: return ":got:"; + case VK_GOT_LO12: return ":got_lo12:"; + case VK_GOTTPREL_PAGE: return ":gottprel:"; + case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; + case VK_GOTTPREL_G1: return ":gottprel_g1:"; + case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; + case VK_TLSDESC: return ""; + case VK_TLSDESC_PAGE: return 
":tlsdesc:"; + default: + llvm_unreachable("Invalid ELF symbol kind"); + } } void AArch64MCExpr::PrintImpl(raw_ostream &OS) const { - switch (Kind) { - default: llvm_unreachable("Invalid kind!"); - case VK_AARCH64_GOT: OS << ":got:"; break; - case VK_AARCH64_GOT_LO12: OS << ":got_lo12:"; break; - case VK_AARCH64_LO12: OS << ":lo12:"; break; - case VK_AARCH64_ABS_G0: OS << ":abs_g0:"; break; - case VK_AARCH64_ABS_G0_NC: OS << ":abs_g0_nc:"; break; - case VK_AARCH64_ABS_G1: OS << ":abs_g1:"; break; - case VK_AARCH64_ABS_G1_NC: OS << ":abs_g1_nc:"; break; - case VK_AARCH64_ABS_G2: OS << ":abs_g2:"; break; - case VK_AARCH64_ABS_G2_NC: OS << ":abs_g2_nc:"; break; - case VK_AARCH64_ABS_G3: OS << ":abs_g3:"; break; - case VK_AARCH64_SABS_G0: OS << ":abs_g0_s:"; break; - case VK_AARCH64_SABS_G1: OS << ":abs_g1_s:"; break; - case VK_AARCH64_SABS_G2: OS << ":abs_g2_s:"; break; - case VK_AARCH64_DTPREL_G2: OS << ":dtprel_g2:"; break; - case VK_AARCH64_DTPREL_G1: OS << ":dtprel_g1:"; break; - case VK_AARCH64_DTPREL_G1_NC: OS << ":dtprel_g1_nc:"; break; - case VK_AARCH64_DTPREL_G0: OS << ":dtprel_g0:"; break; - case VK_AARCH64_DTPREL_G0_NC: OS << ":dtprel_g0_nc:"; break; - case VK_AARCH64_DTPREL_HI12: OS << ":dtprel_hi12:"; break; - case VK_AARCH64_DTPREL_LO12: OS << ":dtprel_lo12:"; break; - case VK_AARCH64_DTPREL_LO12_NC: OS << ":dtprel_lo12_nc:"; break; - case VK_AARCH64_GOTTPREL_G1: OS << ":gottprel_g1:"; break; - case VK_AARCH64_GOTTPREL_G0_NC: OS << ":gottprel_g0_nc:"; break; - case VK_AARCH64_GOTTPREL: OS << ":gottprel:"; break; - case VK_AARCH64_GOTTPREL_LO12: OS << ":gottprel_lo12:"; break; - case VK_AARCH64_TPREL_G2: OS << ":tprel_g2:"; break; - case VK_AARCH64_TPREL_G1: OS << ":tprel_g1:"; break; - case VK_AARCH64_TPREL_G1_NC: OS << ":tprel_g1_nc:"; break; - case VK_AARCH64_TPREL_G0: OS << ":tprel_g0:"; break; - case VK_AARCH64_TPREL_G0_NC: OS << ":tprel_g0_nc:"; break; - case VK_AARCH64_TPREL_HI12: OS << ":tprel_hi12:"; break; - case VK_AARCH64_TPREL_LO12: OS << ":tprel_lo12:"; break; - case VK_AARCH64_TPREL_LO12_NC: OS << ":tprel_lo12_nc:"; break; - case VK_AARCH64_TLSDESC: OS << ":tlsdesc:"; break; - case VK_AARCH64_TLSDESC_LO12: OS << ":tlsdesc_lo12:"; break; + if (getKind() != VK_NONE) + OS << getVariantKindName(); + OS << *Expr; +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +// FIXME: really do above: now that two backends are using it. 
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast(Value); + AddValueSymbolsImpl(BE->getLHS(), Asm); + AddValueSymbolsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); + break; + case MCExpr::Unary: + AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); + break; } +} - const MCExpr *Expr = getSubExpr(); - if (Expr->getKind() != MCExpr::SymbolRef) - OS << '('; - Expr->print(OS); - if (Expr->getKind() != MCExpr::SymbolRef) - OS << ')'; +void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbolsImpl(getSubExpr(), Asm); } -bool -AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - return getSubExpr()->EvaluateAsRelocatable(Res, Layout); +const MCSection *AArch64MCExpr::FindAssociatedSection() const { + llvm_unreachable("FIXME: what goes here?"); +} + +bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; } static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { @@ -114,66 +160,15 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { } void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { - switch (getKind()) { + switch (getSymbolLoc(Kind)) { default: return; - case VK_AARCH64_DTPREL_G2: - case VK_AARCH64_DTPREL_G1: - case VK_AARCH64_DTPREL_G1_NC: - case VK_AARCH64_DTPREL_G0: - case VK_AARCH64_DTPREL_G0_NC: - case VK_AARCH64_DTPREL_HI12: - case VK_AARCH64_DTPREL_LO12: - case VK_AARCH64_DTPREL_LO12_NC: - case VK_AARCH64_GOTTPREL_G1: - case VK_AARCH64_GOTTPREL_G0_NC: - case VK_AARCH64_GOTTPREL: - case VK_AARCH64_GOTTPREL_LO12: - case VK_AARCH64_TPREL_G2: - case VK_AARCH64_TPREL_G1: - case VK_AARCH64_TPREL_G1_NC: - case VK_AARCH64_TPREL_G0: - case VK_AARCH64_TPREL_G0_NC: - case VK_AARCH64_TPREL_HI12: - case VK_AARCH64_TPREL_LO12: - case VK_AARCH64_TPREL_LO12_NC: - case VK_AARCH64_TLSDESC: - case VK_AARCH64_TLSDESC_LO12: + case VK_DTPREL: + case VK_GOTTPREL: + case VK_TPREL: + case VK_TLSDESC: break; } fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); } - -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); -} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 23128fefb0b0..e869ed0a26a4 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -1,4 +1,4 @@ -//==- AArch64MCExpr.h - AArch64 specific MC expression classes --*- C++ -*-===// +//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -12,166 +12,147 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64MCEXPR_H -#define LLVM_AARCH64MCEXPR_H +#ifndef LLVM_AArch64MCEXPR_H +#define LLVM_AArch64MCEXPR_H #include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" namespace llvm { class AArch64MCExpr : public MCTargetExpr { public: enum VariantKind { - VK_AARCH64_None, - VK_AARCH64_GOT, // :got: modifier in assembly - VK_AARCH64_GOT_LO12, // :got_lo12: - VK_AARCH64_LO12, // :lo12: - - VK_AARCH64_ABS_G0, // :abs_g0: - VK_AARCH64_ABS_G0_NC, // :abs_g0_nc: - VK_AARCH64_ABS_G1, - VK_AARCH64_ABS_G1_NC, - VK_AARCH64_ABS_G2, - VK_AARCH64_ABS_G2_NC, - VK_AARCH64_ABS_G3, - - VK_AARCH64_SABS_G0, // :abs_g0_s: - VK_AARCH64_SABS_G1, - VK_AARCH64_SABS_G2, - - VK_AARCH64_DTPREL_G2, // :dtprel_g2: - VK_AARCH64_DTPREL_G1, - VK_AARCH64_DTPREL_G1_NC, - VK_AARCH64_DTPREL_G0, - VK_AARCH64_DTPREL_G0_NC, - VK_AARCH64_DTPREL_HI12, - VK_AARCH64_DTPREL_LO12, - VK_AARCH64_DTPREL_LO12_NC, - - VK_AARCH64_GOTTPREL_G1, // :gottprel: - VK_AARCH64_GOTTPREL_G0_NC, - VK_AARCH64_GOTTPREL, - VK_AARCH64_GOTTPREL_LO12, - - VK_AARCH64_TPREL_G2, // :tprel: - VK_AARCH64_TPREL_G1, - VK_AARCH64_TPREL_G1_NC, - VK_AARCH64_TPREL_G0, - VK_AARCH64_TPREL_G0_NC, - VK_AARCH64_TPREL_HI12, - VK_AARCH64_TPREL_LO12, - VK_AARCH64_TPREL_LO12_NC, - - VK_AARCH64_TLSDESC, // :tlsdesc: - VK_AARCH64_TLSDESC_LO12 + VK_NONE = 0x000, + + // Symbol locations specifying (roughly speaking) what calculation should be + // performed to construct the final address for the relocated + // symbol. E.g. direct, via the GOT, ... + VK_ABS = 0x001, + VK_SABS = 0x002, + VK_GOT = 0x003, + VK_DTPREL = 0x004, + VK_GOTTPREL = 0x005, + VK_TPREL = 0x006, + VK_TLSDESC = 0x007, + VK_SymLocBits = 0x00f, + + // Variants specifying which part of the final address calculation is + // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a + // MOVZ/MOVK. + VK_PAGE = 0x010, + VK_PAGEOFF = 0x020, + VK_HI12 = 0x030, + VK_G0 = 0x040, + VK_G1 = 0x050, + VK_G2 = 0x060, + VK_G3 = 0x070, + VK_AddressFragBits = 0x0f0, + + // Whether the final relocation is a checked one (where a linker should + // perform a range-check on the final address) or not. Note that this field + // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: + // on its own is a non-checked relocation. 
We side with ELF on being + // explicit about this! + VK_NC = 0x100, + + // Convenience definitions for referring to specific textual representations + // of relocation specifiers. Note that this means the "_NC" is sometimes + // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC + // since a user would write ":lo12:"). + VK_CALL = VK_ABS, + VK_ABS_PAGE = VK_ABS | VK_PAGE, + VK_ABS_G3 = VK_ABS | VK_G3, + VK_ABS_G2 = VK_ABS | VK_G2, + VK_ABS_G2_S = VK_SABS | VK_G2, + VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, + VK_ABS_G1 = VK_ABS | VK_G1, + VK_ABS_G1_S = VK_SABS | VK_G1, + VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, + VK_ABS_G0 = VK_ABS | VK_G0, + VK_ABS_G0_S = VK_SABS | VK_G0, + VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, + VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, + VK_GOT_PAGE = VK_GOT | VK_PAGE, + VK_DTPREL_G2 = VK_DTPREL | VK_G2, + VK_DTPREL_G1 = VK_DTPREL | VK_G1, + VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, + VK_DTPREL_G0 = VK_DTPREL | VK_G0, + VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, + VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, + VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, + VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, + VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, + VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, + VK_TPREL_G2 = VK_TPREL | VK_G2, + VK_TPREL_G1 = VK_TPREL | VK_G1, + VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, + VK_TPREL_G0 = VK_TPREL | VK_G0, + VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, + VK_TPREL_HI12 = VK_TPREL | VK_HI12, + VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, + VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, + VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, + VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, + + VK_INVALID = 0xfff }; private: - const VariantKind Kind; const MCExpr *Expr; + const VariantKind Kind; - explicit AArch64MCExpr(VariantKind _Kind, const MCExpr *_Expr) - : Kind(_Kind), Expr(_Expr) {} + explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind) + : Expr(Expr), Kind(Kind) {} public: /// @name Construction /// @{ - static const AArch64MCExpr *Create(VariantKind Kind, const MCExpr *Expr, - MCContext &Ctx); - - static const AArch64MCExpr *CreateLo12(const MCExpr *Expr, MCContext &Ctx) { - return Create(VK_AARCH64_LO12, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOT(const MCExpr *Expr, MCContext &Ctx) { - return Create(VK_AARCH64_GOT, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOTLo12(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_GOT_LO12, Expr, Ctx); - } - - static const AArch64MCExpr *CreateDTPREL_G1(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_DTPREL_G1, Expr, Ctx); - } - - static const AArch64MCExpr *CreateDTPREL_G0_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_DTPREL_G0_NC, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOTTPREL(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_GOTTPREL, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOTTPRELLo12(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_GOTTPREL_LO12, Expr, Ctx); - } - - static const AArch64MCExpr *CreateTLSDesc(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_TLSDESC, Expr, Ctx); - } + static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx); - static const AArch64MCExpr *CreateTLSDescLo12(const MCExpr *Expr, - MCContext &Ctx) { - return 
Create(VK_AARCH64_TLSDESC_LO12, Expr, Ctx); - } + /// @} + /// @name Accessors + /// @{ - static const AArch64MCExpr *CreateTPREL_G1(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_TPREL_G1, Expr, Ctx); - } + /// Get the kind of this expression. + VariantKind getKind() const { return static_cast(Kind); } - static const AArch64MCExpr *CreateTPREL_G0_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx); - } + /// Get the expression this modifier applies to. + const MCExpr *getSubExpr() const { return Expr; } - static const AArch64MCExpr *CreateABS_G3(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G3, Expr, Ctx); - } + /// @} + /// @name VariantKind information extractors. + /// @{ - static const AArch64MCExpr *CreateABS_G2_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G2_NC, Expr, Ctx); + static VariantKind getSymbolLoc(VariantKind Kind) { + return static_cast(Kind & VK_SymLocBits); } - static const AArch64MCExpr *CreateABS_G1_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G1_NC, Expr, Ctx); + static VariantKind getAddressFrag(VariantKind Kind) { + return static_cast(Kind & VK_AddressFragBits); } - static const AArch64MCExpr *CreateABS_G0_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G0_NC, Expr, Ctx); - } + static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } /// @} - /// @name Accessors - /// @{ - /// getOpcode - Get the kind of this expression. - VariantKind getKind() const { return Kind; } + /// Convert the variant kind into an ELF-appropriate modifier + /// (e.g. ":got:", ":lo12:"). + StringRef getVariantKindName() const; - /// getSubExpr - Get the child of this expression. 
- const MCExpr *getSubExpr() const { return Expr; } + void PrintImpl(raw_ostream &OS) const override; - /// @} + void AddValueSymbols(MCAssembler *) const override; + + const MCSection *FindAssociatedSection() const override; - void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; - const MCSection *FindAssociatedSection() const override { - return getSubExpr()->FindAssociatedSection(); - } void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 599949c04357..ae698c59f6ce 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -1,4 +1,4 @@ -//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions -------------===// +//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,9 +15,7 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" #include "InstPrinter/AArch64InstPrinter.h" -#include "llvm/ADT/APInt.h" #include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -27,23 +25,14 @@ using namespace llvm; -#define GET_REGINFO_MC_DESC -#include "AArch64GenRegisterInfo.inc" - #define GET_INSTRINFO_MC_DESC #include "AArch64GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC #include "AArch64GenSubtargetInfo.inc" -MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT, - StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitAArch64MCSubtargetInfo(X, TT, CPU, FS); - return X; -} - +#define GET_REGINFO_MC_DESC +#include "AArch64GenRegisterInfo.inc" static MCInstrInfo *createAArch64MCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); @@ -51,9 +40,20 @@ static MCInstrInfo *createAArch64MCInstrInfo() { return X; } +static MCSubtargetInfo * +createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + + if (CPU.empty()) + CPU = "generic"; + + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); + return X; +} + static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { MCRegisterInfo *X = new MCRegisterInfo(); - InitAArch64MCRegisterInfo(X, AArch64::X30); + InitAArch64MCRegisterInfo(X, AArch64::LR); return X; } @@ -61,8 +61,16 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { Triple TheTriple(TT); - MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(TT); - unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true); + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin()) + MAI = new AArch64MCAsmInfoDarwin(); + else { + assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + MAI = new AArch64MCAsmInfoELF(TT); + } + + // Initial state of the frame pointer is SP. 
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); MAI->addInitialFrameState(Inst); @@ -72,40 +80,35 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) { - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - RM = Reloc::Static; - } + Triple TheTriple(TT); + assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && + "Only expect Darwin and ELF targets"); if (CM == CodeModel::Default) CM = CodeModel::Small; - else if (CM == CodeModel::JITDefault) { - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. + // The default MCJIT memory managers make no guarantees about where they can + // find an executable page; JITed code needs to be able to refer to globals + // no matter how far away they are. + else if (CM == CodeModel::JITDefault) CM = CodeModel::Large; - } + else if (CM != CodeModel::Small && CM != CodeModel::Large) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + + // AArch64 Darwin is always PIC. + if (TheTriple.isOSDarwin()) + RM = Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. 
+ else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) + RM = Reloc::Static; + MCCodeGenInfo *X = new MCCodeGenInfo(); X->InitMCCodeGenInfo(RM, CM, OL); return X; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &OS, - MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll, - bool NoExecStack) { - Triple TheTriple(TT); - - return createAArch64ELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack); -} - - static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -114,108 +117,109 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) return new AArch64InstPrinter(MAI, MII, MRI, STI); + if (SyntaxVariant == 1) + return new AArch64AppleInstPrinter(MAI, MII, MRI, STI); + return nullptr; } -namespace { - -class AArch64MCInstrAnalysis : public MCInstrAnalysis { -public: - AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} - - bool isUnconditionalBranch(const MCInst &Inst) const override { - if (Inst.getOpcode() == AArch64::Bcc - && Inst.getOperand(0).getImm() == A64CC::AL) - return true; - return MCInstrAnalysis::isUnconditionalBranch(Inst); - } - - bool isConditionalBranch(const MCInst &Inst) const override { - if (Inst.getOpcode() == AArch64::Bcc - && Inst.getOperand(0).getImm() == A64CC::AL) - return false; - return MCInstrAnalysis::isConditionalBranch(Inst); - } - - bool evaluateBranch(const MCInst &Inst, uint64_t Addr, - uint64_t Size, uint64_t &Target) const override { - unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0; - // FIXME: We only handle PCRel branches for now. - if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType - != MCOI::OPERAND_PCREL) - return false; - - int64_t Imm = Inst.getOperand(LblOperand).getImm(); - Target = Addr + Imm; - return true; - } -}; +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + const MCSubtargetInfo &STI, bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); -} + if (TheTriple.isOSDarwin()) + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + /*LabelSections*/ true); -static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) { - return new AArch64MCInstrAnalysis(Info); + return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } - - +// Force static initialization. extern "C" void LLVMInitializeAArch64TargetMC() { // Register the MC asm info. - RegisterMCAsmInfoFn A(TheAArch64leTarget, createAArch64MCAsmInfo); - RegisterMCAsmInfoFn B(TheAArch64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo); // Register the MC codegen info. TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, createAArch64MCCodeGenInfo); TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, + createAArch64MCCodeGenInfo); // Register the MC instruction info. 
TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, createAArch64MCInstrInfo); TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, + createAArch64MCInstrInfo); // Register the MC register info. TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, createAArch64MCRegisterInfo); TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, + createAArch64MCRegisterInfo); // Register the MC subtarget info. - using AArch64_MC::createAArch64MCSubtargetInfo; TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, createAArch64MCSubtargetInfo); TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, + createAArch64MCSubtargetInfo); - // Register the MC instruction analyzer. - TargetRegistry::RegisterMCInstrAnalysis(TheAArch64leTarget, - createAArch64MCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheAArch64beTarget, - createAArch64MCInstrAnalysis); + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, + createAArch64beAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, + createAArch64beAsmBackend); // Register the MC Code Emitter TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, createAArch64MCCodeEmitter); TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, createAArch64MCCodeEmitter); - - // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, - createAArch64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, - createAArch64beAsmBackend); + TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, + createAArch64MCCodeEmitter); // Register the object streamer. TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, createMCStreamer); TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); // Register the MCInstPrinter. 
TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, createAArch64MCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, + createAArch64MCInstPrinter); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index bd8beaf16b07..d886ea23c13e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -11,18 +11,19 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64MCTARGETDESC_H -#define LLVM_AARCH64MCTARGETDESC_H +#ifndef AArch64MCTARGETDESC_H +#define AArch64MCTARGETDESC_H #include "llvm/Support/DataTypes.h" +#include namespace llvm { class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCObjectWriter; class MCRegisterInfo; +class MCObjectWriter; class MCSubtargetInfo; class StringRef; class Target; @@ -30,28 +31,25 @@ class raw_ostream; extern Target TheAArch64leTarget; extern Target TheAArch64beTarget; - -namespace AArch64_MC { - MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS); -} +extern Target TheARM64leTarget; +extern Target TheARM64beTarget; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); +MCAsmBackend *createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); +MCAsmBackend *createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); -MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, +MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, bool IsLittleEndian); -MCAsmBackend *createAArch64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - -MCAsmBackend *createAArch64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); +MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, + uint32_t CPUSubtype); } // End llvm namespace diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp similarity index 88% rename from lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp rename to lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 1c48159bbe95..ba9536676e12 100644 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -1,4 +1,4 @@ -//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// +//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCContext.h" @@ -23,13 +23,13 @@ using namespace llvm; namespace { 
-class ARM64MachObjectWriter : public MCMachObjectTargetWriter { - bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, +class AArch64MachObjectWriter : public MCMachObjectTargetWriter { + bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, unsigned &Log2Size, const MCAssembler &Asm); public: - ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) + AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, /*UseAggressiveSymbolFolding=*/true) {} @@ -40,7 +40,7 @@ class ARM64MachObjectWriter : public MCMachObjectTargetWriter { }; } -bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( +bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, unsigned &Log2Size, const MCAssembler &Asm) { RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); @@ -66,16 +66,16 @@ bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: Log2Size = llvm::Log2_32(4); switch (Sym->getKind()) { default: - assert(0 && "Unexpected symbol reference variant kind!"); + llvm_unreachable("Unexpected symbol reference variant kind!"); case MCSymbolRefExpr::VK_PAGEOFF: RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); return true; @@ -86,7 +86,7 @@ bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); return true; } - case ARM64::fixup_arm64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: Log2Size = llvm::Log2_32(4); // This encompasses the relocation for the whole 21-bit value. switch (Sym->getKind()) { @@ -104,15 +104,15 @@ bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( return true; } return true; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: Log2Size = llvm::Log2_32(4); RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); return true; } } -void ARM64MachObjectWriter::RecordRelocation( +void AArch64MachObjectWriter::RecordRelocation( MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { @@ -129,20 +129,20 @@ void ARM64MachObjectWriter::RecordRelocation( FixupOffset += Fixup.getOffset(); - // ARM64 pcrel relocation addends do not include the section offset. + // AArch64 pcrel relocation addends do not include the section offset. if (IsPCRel) FixedValue += FixupOffset; // ADRP fixups use relocations for the whole symbol value and only // put the addend in the instruction itself. Clear out any value the // generic code figured out from the sybmol definition. 
- if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21) + if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21) FixedValue = 0; // imm19 relocations are for conditional branches, which require // assembler local symbols. If we got here, that's not what we have, // so complain loudly. - if (Kind == ARM64::fixup_arm64_pcrel_branch19) { + if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { Asm.getContext().FatalError(Fixup.getLoc(), "conditional branch requires assembler-local" " label. '" + @@ -153,15 +153,15 @@ void ARM64MachObjectWriter::RecordRelocation( // 14-bit branch relocations should only target internal labels, and so // should never get here. - if (Kind == ARM64::fixup_arm64_pcrel_branch14) { + if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { Asm.getContext().FatalError(Fixup.getLoc(), "Invalid relocation on conditional branch!"); return; } - if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, + if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, Asm)) { - Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!"); + Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; } @@ -220,7 +220,7 @@ void ARM64MachObjectWriter::RecordRelocation( "unsupported pc-relative relocation of " "difference"); - // ARM64 always uses external relocations. If there is no symbol to use as + // AArch64 always uses external relocations. If there is no symbol to use as // a base address (a local symbol with no preceding non-local symbol), // error out. // @@ -305,9 +305,9 @@ void ARM64MachObjectWriter::RecordRelocation( Base = nullptr; } - // ARM64 uses external relocations as much as possible. For debug sections, - // and for pointer-sized relocations (.quad), we allow section relocations. - // It's code sections that run into trouble. + // AArch64 uses external relocations as much as possible. For debug + // sections, and for pointer-sized relocations (.quad), we allow section + // relocations. It's code sections that run into trouble. if (Base) { Index = Base->getIndex(); IsExtern = 1; @@ -387,9 +387,10 @@ void ARM64MachObjectWriter::RecordRelocation( Writer->addRelocation(Fragment->getParent(), MRE); } -MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS, +MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, uint32_t CPUSubtype) { - return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype), - OS, /*IsLittleEndian=*/true); + return createMachObjectWriter( + new AArch64MachObjectWriter(CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/true); } diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt index 54c4465b60d7..7d5bced17a6a 100644 --- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -6,4 +6,9 @@ add_llvm_library(LLVMAArch64Desc AArch64MCCodeEmitter.cpp AArch64MCExpr.cpp AArch64MCTargetDesc.cpp - ) + AArch64MachObjectWriter.cpp +) +add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) 
diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt index 37c8035a49f9..70cff0b704f7 100644 --- a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===; +;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile index 641bb83c4775..f356c5850413 100644 --- a/lib/Target/AArch64/Makefile +++ b/lib/Target/AArch64/Makefile @@ -12,19 +12,14 @@ LIBRARYNAME = LLVMAArch64CodeGen TARGET = AArch64 # Make sure that tblgen is run, first thing. -BUILT_SOURCES = AArch64GenAsmMatcher.inc \ - AArch64GenAsmWriter.inc \ - AArch64GenCallingConv.inc \ - AArch64GenDAGISel.inc \ - AArch64GenDisassemblerTables.inc \ - AArch64GenInstrInfo.inc \ - AArch64GenMCCodeEmitter.inc \ - AArch64GenMCPseudoLowering.inc \ - AArch64GenRegisterInfo.inc \ - AArch64GenSubtargetInfo.inc +BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ + AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ + AArch64GenDAGISel.inc \ + AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ + AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ + AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ + AArch64GenMCPseudoLowering.inc -DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils +DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils include $(LEVEL)/Makefile.common - - diff --git a/lib/Target/AArch64/README.txt b/lib/Target/AArch64/README.txt deleted file mode 100644 index 601990f17dee..000000000000 --- a/lib/Target/AArch64/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -This file will contain changes that need to be made before AArch64 can become an -officially supported target. Currently a placeholder. diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index 9281e4e1d937..3a382c165e7c 100644 --- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -1,4 +1,4 @@ -//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -------------===// +//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===// // // The LLVM Compiler Infrastructure // @@ -6,22 +6,26 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the key registration step for the architecture. 
-// -//===----------------------------------------------------------------------===// -#include "AArch64.h" -#include "llvm/IR/Module.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; -Target llvm::TheAArch64leTarget; -Target llvm::TheAArch64beTarget; +namespace llvm { +Target TheAArch64leTarget; +Target TheAArch64beTarget; +Target TheARM64leTarget; +Target TheARM64beTarget; +} // end namespace llvm extern "C" void LLVMInitializeAArch64TargetInfo() { - RegisterTarget - X(TheAArch64leTarget, "aarch64", "AArch64 (ARM 64-bit little endian target)"); - RegisterTarget - Y(TheAArch64beTarget, "aarch64_be", "AArch64 (ARM 64-bit big endian target)"); + RegisterTarget X(TheARM64leTarget, "arm64", + "AArch64 (little endian)"); + RegisterTarget Y(TheARM64beTarget, "arm64_be", + "AArch64 (big endian)"); + + RegisterTarget Z( + TheAArch64leTarget, "aarch64", "AArch64 (little endian)"); + RegisterTarget W( + TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)"); } diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt index ee734c647261..e236eed00be1 100644 --- a/lib/Target/AArch64/TargetInfo/CMakeLists.txt +++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt @@ -1,3 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + add_llvm_library(LLVMAArch64Info AArch64TargetInfo.cpp ) + +add_dependencies(LLVMAArch64Info AArch64CommonTableGen) diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt index 642917239810..93c5407bb1f1 100644 --- a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt +++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===; +;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 2a97cd632560..3c24bb30a26d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -18,7 +18,7 @@ using namespace llvm; -StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const { +StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { for (unsigned i = 0; i < NumPairs; ++i) { if (Pairs[i].Value == Value) { Valid = true; @@ -30,7 +30,7 @@ StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const { return StringRef(); } -uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const { +uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { std::string LowerCaseName = Name.lower(); for (unsigned i = 0; i < NumPairs; ++i) { if (Pairs[i].Name == LowerCaseName) { @@ -43,11 +43,11 @@ uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const { return -1; } -bool NamedImmMapper::validImm(uint32_t Value) const { +bool AArch64NamedImmMapper::validImm(uint32_t Value) const { return Value < TooBigImm; } -const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { {"s1e1r", S1E1R}, {"s1e2r", S1E2R}, {"s1e3r", S1E3R}, @@ -62,10 +62,10 @@ const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = { {"s12e0w", S12E0W}, }; -A64AT::ATMapper::ATMapper() - : NamedImmMapper(ATPairs, 0) {} +AArch64AT::ATMapper::ATMapper() + : AArch64NamedImmMapper(ATPairs, 0) {} -const 
NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = { {"oshld", OSHLD}, {"oshst", OSHST}, {"osh", OSH}, @@ -80,10 +80,10 @@ const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = { {"sy", SY} }; -A64DB::DBarrierMapper::DBarrierMapper() - : NamedImmMapper(DBarrierPairs, 16u) {} +AArch64DB::DBarrierMapper::DBarrierMapper() + : AArch64NamedImmMapper(DBarrierPairs, 16u) {} -const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { {"zva", ZVA}, {"ivac", IVAC}, {"isw", ISW}, @@ -94,26 +94,26 @@ const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = { {"cisw", CISW} }; -A64DC::DCMapper::DCMapper() - : NamedImmMapper(DCPairs, 0) {} +AArch64DC::DCMapper::DCMapper() + : AArch64NamedImmMapper(DCPairs, 0) {} -const NamedImmMapper::Mapping A64IC::ICMapper::ICPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = { {"ialluis", IALLUIS}, {"iallu", IALLU}, {"ivau", IVAU} }; -A64IC::ICMapper::ICMapper() - : NamedImmMapper(ICPairs, 0) {} +AArch64IC::ICMapper::ICMapper() + : AArch64NamedImmMapper(ICPairs, 0) {} -const NamedImmMapper::Mapping A64ISB::ISBMapper::ISBPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = { {"sy", SY}, }; -A64ISB::ISBMapper::ISBMapper() - : NamedImmMapper(ISBPairs, 16) {} +AArch64ISB::ISBMapper::ISBMapper() + : AArch64NamedImmMapper(ISBPairs, 16) {} -const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { {"pldl1keep", PLDL1KEEP}, {"pldl1strm", PLDL1STRM}, {"pldl2keep", PLDL2KEEP}, @@ -134,19 +134,19 @@ const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = { {"pstl3strm", PSTL3STRM} }; -A64PRFM::PRFMMapper::PRFMMapper() - : NamedImmMapper(PRFMPairs, 32) {} +AArch64PRFM::PRFMMapper::PRFMMapper() + : AArch64NamedImmMapper(PRFMPairs, 32) {} -const NamedImmMapper::Mapping A64PState::PStateMapper::PStatePairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = { {"spsel", SPSel}, {"daifset", DAIFSet}, {"daifclr", DAIFClr} }; -A64PState::PStateMapper::PStateMapper() - : NamedImmMapper(PStatePairs, 0) {} +AArch64PState::PStateMapper::PStateMapper() + : AArch64NamedImmMapper(PStatePairs, 0) {} -const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { {"mdccsr_el0", MDCCSR_EL0}, {"dbgdtrrx_el0", DBGDTRRX_EL0}, {"mdrar_el1", MDRAR_EL1}, @@ -176,16 +176,16 @@ const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { {"id_isar3_el1", ID_ISAR3_EL1}, {"id_isar4_el1", ID_ISAR4_EL1}, {"id_isar5_el1", ID_ISAR5_EL1}, - {"id_aa64pfr0_el1", ID_AA64PFR0_EL1}, - {"id_aa64pfr1_el1", ID_AA64PFR1_EL1}, - {"id_aa64dfr0_el1", ID_AA64DFR0_EL1}, - {"id_aa64dfr1_el1", ID_AA64DFR1_EL1}, - {"id_aa64afr0_el1", ID_AA64AFR0_EL1}, - {"id_aa64afr1_el1", ID_AA64AFR1_EL1}, - {"id_aa64isar0_el1", ID_AA64ISAR0_EL1}, - {"id_aa64isar1_el1", ID_AA64ISAR1_EL1}, - {"id_aa64mmfr0_el1", ID_AA64MMFR0_EL1}, - {"id_aa64mmfr1_el1", ID_AA64MMFR1_EL1}, + {"id_aa64pfr0_el1", ID_A64PFR0_EL1}, + {"id_aa64pfr1_el1", ID_A64PFR1_EL1}, + {"id_aa64dfr0_el1", ID_A64DFR0_EL1}, + {"id_aa64dfr1_el1", ID_A64DFR1_EL1}, + {"id_aa64afr0_el1", ID_A64AFR0_EL1}, + {"id_aa64afr1_el1", ID_A64AFR1_EL1}, + {"id_aa64isar0_el1", ID_A64ISAR0_EL1}, + {"id_aa64isar1_el1", 
ID_A64ISAR1_EL1}, + {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1}, + {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1}, {"mvfr0_el1", MVFR0_EL1}, {"mvfr1_el1", MVFR1_EL1}, {"mvfr2_el1", MVFR2_EL1}, @@ -245,12 +245,13 @@ const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { {"ich_elsr_el2", ICH_ELSR_EL2} }; -A64SysReg::MRSMapper::MRSMapper() { +AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { InstPairs = &MRSPairs[0]; NumInstPairs = llvm::array_lengthof(MRSPairs); } -const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { {"dbgdtrtx_el0", DBGDTRTX_EL0}, {"oslar_el1", OSLAR_EL1}, {"pmswinc_el0", PMSWINC_EL0}, @@ -268,13 +269,14 @@ const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = { {"icc_sgi0r_el1", ICC_SGI0R_EL1} }; -A64SysReg::MSRMapper::MSRMapper() { +AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { InstPairs = &MSRPairs[0]; NumInstPairs = llvm::array_lengthof(MSRPairs); } -const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = { {"osdtrrx_el1", OSDTRRX_EL1}, {"osdtrtx_el1", OSDTRTX_EL1}, {"teecr32_el1", TEECR32_EL1}, @@ -753,10 +755,16 @@ const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = { {"ich_lr15_el2", ICH_LR15_EL2} }; +const AArch64NamedImmMapper::Mapping +AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = { + {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} +}; + uint32_t -A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { - // First search the registers shared by all +AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { std::string NameLower = Name.lower(); + + // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { if (SysRegPairs[i].Name == NameLower) { Valid = true; @@ -764,6 +772,16 @@ A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { } } + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Name == NameLower) { + Valid = true; + return CycloneSysRegPairs[i].Value; + } + } + } + // Now try the instruction-specific registers (either read-only or // write-only). for (unsigned i = 0; i < NumInstPairs; ++i) { @@ -796,7 +814,8 @@ A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { } std::string -A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { + // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { if (SysRegPairs[i].Value == Bits) { Valid = true; @@ -804,6 +823,18 @@ A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { } } + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Value == Bits) { + Valid = true; + return CycloneSysRegPairs[i].Name; + } + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). 
for (unsigned i = 0; i < NumInstPairs; ++i) { if (InstPairs[i].Value == Bits) { Valid = true; @@ -831,7 +862,7 @@ A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { + "_c" + utostr(CRm) + "_" + utostr(Op2); } -const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { {"ipas2e1is", IPAS2E1IS}, {"ipas2le1is", IPAS2LE1IS}, {"vmalle1is", VMALLE1IS}, @@ -866,308 +897,5 @@ const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = { {"vaale1", VAALE1} }; -A64TLBI::TLBIMapper::TLBIMapper() - : NamedImmMapper(TLBIPairs, 0) {} - -bool A64Imms::isFPImm(const APFloat &Val, uint32_t &Imm8Bits) { - const fltSemantics &Sem = Val.getSemantics(); - unsigned FracBits = APFloat::semanticsPrecision(Sem) - 1; - - uint32_t ExpMask; - switch (FracBits) { - case 10: // IEEE half-precision - ExpMask = 0x1f; - break; - case 23: // IEEE single-precision - ExpMask = 0xff; - break; - case 52: // IEEE double-precision - ExpMask = 0x7ff; - break; - case 112: // IEEE quad-precision - // No immediates are valid for double precision. - return false; - default: - llvm_unreachable("Only half, single and double precision supported"); - } - - uint32_t ExpStart = FracBits; - uint64_t FracMask = (1ULL << FracBits) - 1; - - uint32_t Sign = Val.isNegative(); - - uint64_t Bits= Val.bitcastToAPInt().getLimitedValue(); - uint64_t Fraction = Bits & FracMask; - int32_t Exponent = ((Bits >> ExpStart) & ExpMask); - Exponent -= ExpMask >> 1; - - // S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 5):imm8<5:0>:Zeros(19) - // D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 8):imm8<5:0>:Zeros(48) - // This translates to: only 4 bits of fraction; -3 <= exp <= 4. - uint64_t A64FracStart = FracBits - 4; - uint64_t A64FracMask = 0xf; - - // Are there too many fraction bits? - if (Fraction & ~(A64FracMask << A64FracStart)) - return false; - - if (Exponent < -3 || Exponent > 4) - return false; - - uint32_t PackedFraction = (Fraction >> A64FracStart) & A64FracMask; - uint32_t PackedExp = (Exponent + 7) & 0x7; - - Imm8Bits = (Sign << 7) | (PackedExp << 4) | PackedFraction; - return true; -} - -// Encoding of the immediate for logical (immediate) instructions: -// -// | N | imms | immr | size | R | S | -// |---+--------+--------+------+--------------+--------------| -// | 1 | ssssss | rrrrrr | 64 | UInt(rrrrrr) | UInt(ssssss) | -// | 0 | 0sssss | xrrrrr | 32 | UInt(rrrrr) | UInt(sssss) | -// | 0 | 10ssss | xxrrrr | 16 | UInt(rrrr) | UInt(ssss) | -// | 0 | 110sss | xxxrrr | 8 | UInt(rrr) | UInt(sss) | -// | 0 | 1110ss | xxxxrr | 4 | UInt(rr) | UInt(ss) | -// | 0 | 11110s | xxxxxr | 2 | UInt(r) | UInt(s) | -// | 0 | 11111x | - | | UNALLOCATED | | -// -// Columns 'R', 'S' and 'size' specify a "bitmask immediate" of size bits in -// which the lower S+1 bits are ones and the remaining bits are zero, then -// rotated right by R bits, which is then replicated across the datapath. -// -// + Values of 'N', 'imms' and 'immr' which do not match the above table are -// RESERVED. -// + If all 's' bits in the imms field are set then the instruction is -// RESERVED. -// + The 'x' bits in the 'immr' field are IGNORED. - -bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) { - int RepeatWidth; - int Rotation = 0; - int Num1s = 0; - - // Because there are S+1 ones in the replicated mask, an immediate of all - // zeros is not allowed. Filtering it here is probably more efficient. 
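
// Illustration only (not part of the patch): a minimal standalone sketch of the
// "bitmask immediate" encoding documented in the table above, assuming the
// 8-bit element form (N=0, imms=110sss). The helper name expandBitmaskImm8 is
// hypothetical and exists only for this example.
#include <cassert>
#include <cstdint>

static uint64_t expandBitmaskImm8(unsigned S, unsigned R) {
  uint64_t Elt = (1ULL << (S + 1)) - 1;                // S+1 trailing ones
  if (R != 0)
    Elt = ((Elt >> R) | (Elt << (8 - R))) & 0xffULL;   // rotate right by R
  uint64_t Imm = 0;
  for (unsigned i = 0; i < 64; i += 8)
    Imm |= Elt << i;                                   // replicate across the register
  return Imm;
}

int main() {
  // imms = 0b110000 (S=0), immr = 0b000001 (R=1): a single one bit rotated to
  // the top of each byte, i.e. 0x8080808080808080.
  assert(expandBitmaskImm8(0, 1) == 0x8080808080808080ULL);
  return 0;
}
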
- if (Imm == 0) return false; - - for (RepeatWidth = RegWidth; RepeatWidth > 1; RepeatWidth /= 2) { - uint64_t RepeatMask = RepeatWidth == 64 ? -1 : (1ULL << RepeatWidth) - 1; - uint64_t ReplicatedMask = Imm & RepeatMask; - - if (ReplicatedMask == 0) continue; - - // First we have to make sure the mask is actually repeated in each slot for - // this width-specifier. - bool IsReplicatedMask = true; - for (unsigned i = RepeatWidth; i < RegWidth; i += RepeatWidth) { - if (((Imm >> i) & RepeatMask) != ReplicatedMask) { - IsReplicatedMask = false; - break; - } - } - if (!IsReplicatedMask) continue; - - // Now we have to work out the amount of rotation needed. The first part of - // this calculation is actually independent of RepeatWidth, but the complex - // case will depend on it. - Rotation = countTrailingZeros(Imm); - if (Rotation == 0) { - // There were no leading zeros, which means it's either in place or there - // are 1s at each end (e.g. 0x8003 needs rotating). - Rotation = RegWidth == 64 ? CountLeadingOnes_64(Imm) - : CountLeadingOnes_32(Imm); - Rotation = RepeatWidth - Rotation; - } - - uint64_t ReplicatedOnes = ReplicatedMask; - if (Rotation != 0 && Rotation != 64) - ReplicatedOnes = (ReplicatedMask >> Rotation) - | ((ReplicatedMask << (RepeatWidth - Rotation)) & RepeatMask); - - // Of course, they may not actually be ones, so we have to check that: - if (!isMask_64(ReplicatedOnes)) - continue; - - Num1s = CountTrailingOnes_64(ReplicatedOnes); - - // We know we've got an almost valid encoding (certainly, if this is invalid - // no other parameters would work). - break; - } - - // The encodings which would produce all 1s are RESERVED. - if (RepeatWidth == 1 || Num1s == RepeatWidth) return false; - - uint32_t N = RepeatWidth == 64; - uint32_t ImmR = RepeatWidth - Rotation; - uint32_t ImmS = Num1s - 1; - - switch (RepeatWidth) { - default: break; // No action required for other valid rotations. - case 16: ImmS |= 0x20; break; // 10ssss - case 8: ImmS |= 0x30; break; // 110sss - case 4: ImmS |= 0x38; break; // 1110ss - case 2: ImmS |= 0x3c; break; // 11110s - } - - Bits = ImmS | (ImmR << 6) | (N << 12); - - return true; -} - - -bool A64Imms::isLogicalImmBits(unsigned RegWidth, uint32_t Bits, - uint64_t &Imm) { - uint32_t N = Bits >> 12; - uint32_t ImmR = (Bits >> 6) & 0x3f; - uint32_t ImmS = Bits & 0x3f; - - // N=1 encodes a 64-bit replication and is invalid for the 32-bit - // instructions. - if (RegWidth == 32 && N != 0) return false; - - int Width = 0; - if (N == 1) - Width = 64; - else if ((ImmS & 0x20) == 0) - Width = 32; - else if ((ImmS & 0x10) == 0) - Width = 16; - else if ((ImmS & 0x08) == 0) - Width = 8; - else if ((ImmS & 0x04) == 0) - Width = 4; - else if ((ImmS & 0x02) == 0) - Width = 2; - else { - // ImmS is 0b11111x: UNALLOCATED - return false; - } - - int Num1s = (ImmS & (Width - 1)) + 1; - - // All encodings which would map to -1 (signed) are RESERVED. - if (Num1s == Width) return false; - - int Rotation = (ImmR & (Width - 1)); - uint64_t Mask = (1ULL << Num1s) - 1; - uint64_t WidthMask = Width == 64 ? -1 : (1ULL << Width) - 1; - if (Rotation != 0 && Rotation != 64) - Mask = (Mask >> Rotation) - | ((Mask << (Width - Rotation)) & WidthMask); - - Imm = Mask; - for (unsigned i = 1; i < RegWidth / Width; ++i) { - Mask <<= Width; - Imm |= Mask; - } - - return true; -} - -bool A64Imms::isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) { - // If high bits are set then a 32-bit MOVZ can't possibly work. 
- if (RegWidth == 32 && (Value & ~0xffffffffULL)) - return false; - - for (int i = 0; i < RegWidth; i += 16) { - // If the value is 0 when we mask out all the bits that could be set with - // the current LSL value then it's representable. - if ((Value & ~(0xffffULL << i)) == 0) { - Shift = i / 16; - UImm16 = (Value >> i) & 0xffff; - return true; - } - } - return false; -} - -bool A64Imms::isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) { - // MOVN is defined to set its register to NOT(LSL(imm16, shift)). - - // We have to be a little careful about a 32-bit register: 0xffff_1234 *is* - // representable, but ~0xffff_1234 == 0xffff_ffff_0000_edcb which is not - // a valid input for isMOVZImm. - if (RegWidth == 32 && (Value & ~0xffffffffULL)) - return false; - - uint64_t MOVZEquivalent = RegWidth == 32 ? ~Value & 0xffffffff : ~Value; - - return isMOVZImm(RegWidth, MOVZEquivalent, UImm16, Shift); -} - -bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value, - int &UImm16, int &Shift) { - if (isMOVZImm(RegWidth, Value, UImm16, Shift)) - return false; - - return isMOVNImm(RegWidth, Value, UImm16, Shift); -} - -// decodeNeonModShiftImm - Decode a Neon OpCmode value into the -// the shift amount and the shift type (shift zeros or ones in) and -// returns whether the OpCmode value implies a shift operation. -bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, - unsigned &ShiftOnesIn) { - ShiftImm = 0; - ShiftOnesIn = false; - bool HasShift = true; - - if (OpCmode == 0xe) { - // movi byte - HasShift = false; - } else if (OpCmode == 0x1e) { - // movi 64-bit bytemask - HasShift = false; - } else if ((OpCmode & 0xc) == 0x8) { - // shift zeros, per halfword - ShiftImm = ((OpCmode & 0x2) >> 1); - } else if ((OpCmode & 0x8) == 0) { - // shift zeros, per word - ShiftImm = ((OpCmode & 0x6) >> 1); - } else if ((OpCmode & 0xe) == 0xc) { - // shift ones, per word - ShiftOnesIn = true; - ShiftImm = (OpCmode & 0x1); - } else { - // per byte, per bytemask - llvm_unreachable("Unsupported Neon modified immediate"); - } - - return HasShift; -} - -// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values -// into the element value and the element size in bits. 
-uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode, - unsigned &EltBits) { - uint64_t DecodedVal = Val; - EltBits = 0; - - if (OpCmode == 0xe) { - // movi byte - EltBits = 8; - } else if (OpCmode == 0x1e) { - // movi 64-bit bytemask - DecodedVal = 0; - for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { - if ((Val >> ByteNum) & 1) - DecodedVal |= (uint64_t)0xff << (8 * ByteNum); - } - EltBits = 64; - } else if ((OpCmode & 0xc) == 0x8) { - // shift zeros, per halfword - EltBits = 16; - } else if ((OpCmode & 0x8) == 0) { - // shift zeros, per word - EltBits = 32; - } else if ((OpCmode & 0xe) == 0xc) { - // shift ones, per word - EltBits = 32; - } else { - llvm_unreachable("Unsupported Neon modified immediate"); - } - return DecodedVal; -} +AArch64TLBI::TLBIMapper::TLBIMapper() + : AArch64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 39b042b7208a..9d2ce21c9626 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -1,4 +1,4 @@ -//===-- AArch64BaseInfo.h - Top level definitions for AArch64- --*- C++ -*-===// +//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,95 +14,256 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64_BASEINFO_H -#define LLVM_AARCH64_BASEINFO_H +#ifndef AArch64BASEINFO_H +#define AArch64BASEINFO_H +// FIXME: Is it easiest to fix this layering violation by moving the .inc +// #includes from AArch64MCTargetDesc.h to here? +#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { -// // Enums corresponding to AArch64 condition codes -namespace A64CC { - // The CondCodes constants map directly to the 4-bit encoding of the - // condition field for predicated instructions. - enum CondCodes { // Meaning (integer) Meaning (floating-point) - EQ = 0, // Equal Equal - NE, // Not equal Not equal, or unordered - HS, // Unsigned higher or same >, ==, or unordered - LO, // Unsigned lower or same Less than - MI, // Minus, negative Less than - PL, // Plus, positive or zero >, ==, or unordered - VS, // Overflow Unordered - VC, // No overflow Ordered - HI, // Unsigned higher Greater than, or unordered - LS, // Unsigned lower or same Less than or equal - GE, // Greater than or equal Greater than or equal - LT, // Less than Less than, or unordered - GT, // Signed greater than Greater than - LE, // Signed less than or equal <, ==, or unordered - AL, // Always (unconditional) Always (unconditional) - NV, // Always (unconditional) Always (unconditional) - // Note the NV exists purely to disassemble 0b1111. Execution - // is "always". 
- Invalid - }; +inline static unsigned getWRegFromXReg(unsigned Reg) { + switch (Reg) { + case AArch64::X0: return AArch64::W0; + case AArch64::X1: return AArch64::W1; + case AArch64::X2: return AArch64::W2; + case AArch64::X3: return AArch64::W3; + case AArch64::X4: return AArch64::W4; + case AArch64::X5: return AArch64::W5; + case AArch64::X6: return AArch64::W6; + case AArch64::X7: return AArch64::W7; + case AArch64::X8: return AArch64::W8; + case AArch64::X9: return AArch64::W9; + case AArch64::X10: return AArch64::W10; + case AArch64::X11: return AArch64::W11; + case AArch64::X12: return AArch64::W12; + case AArch64::X13: return AArch64::W13; + case AArch64::X14: return AArch64::W14; + case AArch64::X15: return AArch64::W15; + case AArch64::X16: return AArch64::W16; + case AArch64::X17: return AArch64::W17; + case AArch64::X18: return AArch64::W18; + case AArch64::X19: return AArch64::W19; + case AArch64::X20: return AArch64::W20; + case AArch64::X21: return AArch64::W21; + case AArch64::X22: return AArch64::W22; + case AArch64::X23: return AArch64::W23; + case AArch64::X24: return AArch64::W24; + case AArch64::X25: return AArch64::W25; + case AArch64::X26: return AArch64::W26; + case AArch64::X27: return AArch64::W27; + case AArch64::X28: return AArch64::W28; + case AArch64::FP: return AArch64::W29; + case AArch64::LR: return AArch64::W30; + case AArch64::SP: return AArch64::WSP; + case AArch64::XZR: return AArch64::WZR; + } + // For anything else, return it unchanged. + return Reg; +} -} // namespace A64CC +inline static unsigned getXRegFromWReg(unsigned Reg) { + switch (Reg) { + case AArch64::W0: return AArch64::X0; + case AArch64::W1: return AArch64::X1; + case AArch64::W2: return AArch64::X2; + case AArch64::W3: return AArch64::X3; + case AArch64::W4: return AArch64::X4; + case AArch64::W5: return AArch64::X5; + case AArch64::W6: return AArch64::X6; + case AArch64::W7: return AArch64::X7; + case AArch64::W8: return AArch64::X8; + case AArch64::W9: return AArch64::X9; + case AArch64::W10: return AArch64::X10; + case AArch64::W11: return AArch64::X11; + case AArch64::W12: return AArch64::X12; + case AArch64::W13: return AArch64::X13; + case AArch64::W14: return AArch64::X14; + case AArch64::W15: return AArch64::X15; + case AArch64::W16: return AArch64::X16; + case AArch64::W17: return AArch64::X17; + case AArch64::W18: return AArch64::X18; + case AArch64::W19: return AArch64::X19; + case AArch64::W20: return AArch64::X20; + case AArch64::W21: return AArch64::X21; + case AArch64::W22: return AArch64::X22; + case AArch64::W23: return AArch64::X23; + case AArch64::W24: return AArch64::X24; + case AArch64::W25: return AArch64::X25; + case AArch64::W26: return AArch64::X26; + case AArch64::W27: return AArch64::X27; + case AArch64::W28: return AArch64::X28; + case AArch64::W29: return AArch64::FP; + case AArch64::W30: return AArch64::LR; + case AArch64::WSP: return AArch64::SP; + case AArch64::WZR: return AArch64::XZR; + } + // For anything else, return it unchanged. 
+ return Reg; +} -inline static const char *A64CondCodeToString(A64CC::CondCodes CC) { - switch (CC) { - default: llvm_unreachable("Unknown condition code"); - case A64CC::EQ: return "eq"; - case A64CC::NE: return "ne"; - case A64CC::HS: return "hs"; - case A64CC::LO: return "lo"; - case A64CC::MI: return "mi"; - case A64CC::PL: return "pl"; - case A64CC::VS: return "vs"; - case A64CC::VC: return "vc"; - case A64CC::HI: return "hi"; - case A64CC::LS: return "ls"; - case A64CC::GE: return "ge"; - case A64CC::LT: return "lt"; - case A64CC::GT: return "gt"; - case A64CC::LE: return "le"; - case A64CC::AL: return "al"; - case A64CC::NV: return "nv"; +static inline unsigned getBRegFromDReg(unsigned Reg) { + switch (Reg) { + case AArch64::D0: return AArch64::B0; + case AArch64::D1: return AArch64::B1; + case AArch64::D2: return AArch64::B2; + case AArch64::D3: return AArch64::B3; + case AArch64::D4: return AArch64::B4; + case AArch64::D5: return AArch64::B5; + case AArch64::D6: return AArch64::B6; + case AArch64::D7: return AArch64::B7; + case AArch64::D8: return AArch64::B8; + case AArch64::D9: return AArch64::B9; + case AArch64::D10: return AArch64::B10; + case AArch64::D11: return AArch64::B11; + case AArch64::D12: return AArch64::B12; + case AArch64::D13: return AArch64::B13; + case AArch64::D14: return AArch64::B14; + case AArch64::D15: return AArch64::B15; + case AArch64::D16: return AArch64::B16; + case AArch64::D17: return AArch64::B17; + case AArch64::D18: return AArch64::B18; + case AArch64::D19: return AArch64::B19; + case AArch64::D20: return AArch64::B20; + case AArch64::D21: return AArch64::B21; + case AArch64::D22: return AArch64::B22; + case AArch64::D23: return AArch64::B23; + case AArch64::D24: return AArch64::B24; + case AArch64::D25: return AArch64::B25; + case AArch64::D26: return AArch64::B26; + case AArch64::D27: return AArch64::B27; + case AArch64::D28: return AArch64::B28; + case AArch64::D29: return AArch64::B29; + case AArch64::D30: return AArch64::B30; + case AArch64::D31: return AArch64::B31; + } + // For anything else, return it unchanged. 
+ return Reg; +} + + +static inline unsigned getDRegFromBReg(unsigned Reg) { + switch (Reg) { + case AArch64::B0: return AArch64::D0; + case AArch64::B1: return AArch64::D1; + case AArch64::B2: return AArch64::D2; + case AArch64::B3: return AArch64::D3; + case AArch64::B4: return AArch64::D4; + case AArch64::B5: return AArch64::D5; + case AArch64::B6: return AArch64::D6; + case AArch64::B7: return AArch64::D7; + case AArch64::B8: return AArch64::D8; + case AArch64::B9: return AArch64::D9; + case AArch64::B10: return AArch64::D10; + case AArch64::B11: return AArch64::D11; + case AArch64::B12: return AArch64::D12; + case AArch64::B13: return AArch64::D13; + case AArch64::B14: return AArch64::D14; + case AArch64::B15: return AArch64::D15; + case AArch64::B16: return AArch64::D16; + case AArch64::B17: return AArch64::D17; + case AArch64::B18: return AArch64::D18; + case AArch64::B19: return AArch64::D19; + case AArch64::B20: return AArch64::D20; + case AArch64::B21: return AArch64::D21; + case AArch64::B22: return AArch64::D22; + case AArch64::B23: return AArch64::D23; + case AArch64::B24: return AArch64::D24; + case AArch64::B25: return AArch64::D25; + case AArch64::B26: return AArch64::D26; + case AArch64::B27: return AArch64::D27; + case AArch64::B28: return AArch64::D28; + case AArch64::B29: return AArch64::D29; + case AArch64::B30: return AArch64::D30; + case AArch64::B31: return AArch64::D31; } + // For anything else, return it unchanged. + return Reg; } -inline static A64CC::CondCodes A64StringToCondCode(StringRef CondStr) { - return StringSwitch(CondStr.lower()) - .Case("eq", A64CC::EQ) - .Case("ne", A64CC::NE) - .Case("ne", A64CC::NE) - .Case("hs", A64CC::HS) - .Case("cs", A64CC::HS) - .Case("lo", A64CC::LO) - .Case("cc", A64CC::LO) - .Case("mi", A64CC::MI) - .Case("pl", A64CC::PL) - .Case("vs", A64CC::VS) - .Case("vc", A64CC::VC) - .Case("hi", A64CC::HI) - .Case("ls", A64CC::LS) - .Case("ge", A64CC::GE) - .Case("lt", A64CC::LT) - .Case("gt", A64CC::GT) - .Case("le", A64CC::LE) - .Case("al", A64CC::AL) - .Case("nv", A64CC::NV) - .Default(A64CC::Invalid); +namespace AArch64CC { + +// The CondCodes constants map directly to the 4-bit encoding of the condition +// field for predicated instructions. +enum CondCode { // Meaning (integer) Meaning (floating-point) + EQ = 0x0, // Equal Equal + NE = 0x1, // Not equal Not equal, or unordered + HS = 0x2, // Unsigned higher or same >, ==, or unordered + LO = 0x3, // Unsigned lower Less than + MI = 0x4, // Minus, negative Less than + PL = 0x5, // Plus, positive or zero >, ==, or unordered + VS = 0x6, // Overflow Unordered + VC = 0x7, // No overflow Not unordered + HI = 0x8, // Unsigned higher Greater than, or unordered + LS = 0x9, // Unsigned lower or same Less than or equal + GE = 0xa, // Greater than or equal Greater than or equal + LT = 0xb, // Less than Less than, or unordered + GT = 0xc, // Greater than Greater than + LE = 0xd, // Less than or equal <, ==, or unordered + AL = 0xe, // Always (unconditional) Always (unconditional) + NV = 0xf, // Always (unconditional) Always (unconditional) + // Note the NV exists purely to disassemble 0b1111. Execution is "always". 
+ Invalid +}; + +inline static const char *getCondCodeName(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return "eq"; + case NE: return "ne"; + case HS: return "hs"; + case LO: return "lo"; + case MI: return "mi"; + case PL: return "pl"; + case VS: return "vs"; + case VC: return "vc"; + case HI: return "hi"; + case LS: return "ls"; + case GE: return "ge"; + case LT: return "lt"; + case GT: return "gt"; + case LE: return "le"; + case AL: return "al"; + case NV: return "nv"; + } } -inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) { - // It turns out that the condition codes have been designed so that in order - // to reverse the intent of the condition you only have to invert the low bit: +inline static CondCode getInvertedCondCode(CondCode Code) { + // To reverse a condition it's necessary to only invert the low bit: + + return static_cast(static_cast(Code) ^ 0x1); +} - return static_cast(static_cast(CC) ^ 0x1); +/// Given a condition code, return NZCV flags that would satisfy that condition. +/// The flag bits are in the format expected by the ccmp instructions. +/// Note that many different flag settings can satisfy a given condition code, +/// this function just returns one of them. +inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { + // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. + enum { N = 8, Z = 4, C = 2, V = 1 }; + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return Z; // Z == 1 + case NE: return 0; // Z == 0 + case HS: return C; // C == 1 + case LO: return 0; // C == 0 + case MI: return N; // N == 1 + case PL: return 0; // N == 0 + case VS: return V; // V == 1 + case VC: return 0; // V == 0 + case HI: return C; // C == 1 && Z == 0 + case LS: return 0; // C == 0 || Z == 1 + case GE: return 0; // N == V + case LT: return N; // N != V + case GT: return 0; // Z == 0 && N == V + case LE: return Z; // Z == 1 || N != V + } } +} // end namespace AArch64CC /// Instances of this class can perform bidirectional mapping from random /// identifier strings to operand encodings. For example "MSR" takes a named @@ -115,14 +276,14 @@ inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) { /// out just how often these instructions are emitted before working on it. It /// might even be optimal to just reorder the tables for the common instructions /// rather than changing the algorithm. 
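
// Illustration only (not part of the patch): the low-bit trick used by
// getInvertedCondCode above, shown with a self-contained copy of a few of the
// 4-bit encodings so the snippet compiles on its own.
#include <cassert>

enum CondCode { EQ = 0x0, NE = 0x1, GE = 0xa, LT = 0xb, GT = 0xc, LE = 0xd };

static CondCode invert(CondCode C) {
  return static_cast<CondCode>(static_cast<unsigned>(C) ^ 0x1); // flip the low bit
}

int main() {
  assert(invert(EQ) == NE && invert(NE) == EQ);
  assert(invert(GE) == LT && invert(GT) == LE);
  return 0;
}
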
-struct NamedImmMapper { +struct AArch64NamedImmMapper { struct Mapping { const char *Name; uint32_t Value; }; template - NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) + AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} StringRef toString(uint32_t Value, bool &Valid) const; @@ -138,7 +299,7 @@ struct NamedImmMapper { uint32_t TooBigImm; }; -namespace A64AT { +namespace AArch64AT { enum ATValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 S1E1R = 0x43c0, // 01 000 0111 1000 000 @@ -155,14 +316,14 @@ namespace A64AT { S12E0W = 0x63c7 // 01 100 0111 1000 111 }; - struct ATMapper : NamedImmMapper { + struct ATMapper : AArch64NamedImmMapper { const static Mapping ATPairs[]; ATMapper(); }; } -namespace A64DB { +namespace AArch64DB { enum DBValues { Invalid = -1, OSHLD = 0x1, @@ -179,14 +340,14 @@ namespace A64DB { SY = 0xf }; - struct DBarrierMapper : NamedImmMapper { + struct DBarrierMapper : AArch64NamedImmMapper { const static Mapping DBarrierPairs[]; DBarrierMapper(); }; } -namespace A64DC { +namespace AArch64DC { enum DCValues { Invalid = -1, // Op1 CRn CRm Op2 ZVA = 0x5ba1, // 01 011 0111 0100 001 @@ -199,7 +360,7 @@ namespace A64DC { CISW = 0x43f2 // 01 000 0111 1110 010 }; - struct DCMapper : NamedImmMapper { + struct DCMapper : AArch64NamedImmMapper { const static Mapping DCPairs[]; DCMapper(); @@ -207,7 +368,7 @@ namespace A64DC { } -namespace A64IC { +namespace AArch64IC { enum ICValues { Invalid = -1, // Op1 CRn CRm Op2 IALLUIS = 0x0388, // 000 0111 0001 000 @@ -216,7 +377,7 @@ namespace A64IC { }; - struct ICMapper : NamedImmMapper { + struct ICMapper : AArch64NamedImmMapper { const static Mapping ICPairs[]; ICMapper(); @@ -227,19 +388,19 @@ namespace A64IC { } } -namespace A64ISB { +namespace AArch64ISB { enum ISBValues { Invalid = -1, SY = 0xf }; - struct ISBMapper : NamedImmMapper { + struct ISBMapper : AArch64NamedImmMapper { const static Mapping ISBPairs[]; ISBMapper(); }; } -namespace A64PRFM { +namespace AArch64PRFM { enum PRFMValues { Invalid = -1, PLDL1KEEP = 0x00, @@ -262,14 +423,14 @@ namespace A64PRFM { PSTL3STRM = 0x15 }; - struct PRFMMapper : NamedImmMapper { + struct PRFMMapper : AArch64NamedImmMapper { const static Mapping PRFMPairs[]; PRFMMapper(); }; } -namespace A64PState { +namespace AArch64PState { enum PStateValues { Invalid = -1, SPSel = 0x05, @@ -277,7 +438,7 @@ namespace A64PState { DAIFClr = 0x1f }; - struct PStateMapper : NamedImmMapper { + struct PStateMapper : AArch64NamedImmMapper { const static Mapping PStatePairs[]; PStateMapper(); @@ -285,7 +446,7 @@ namespace A64PState { } -namespace A64SE { +namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, LSL, @@ -306,7 +467,7 @@ namespace A64SE { }; } -namespace A64Layout { +namespace AArch64Layout { enum VectorLayout { Invalid = -1, VL_8B, @@ -329,43 +490,43 @@ namespace A64Layout { } inline static const char * -A64VectorLayoutToString(A64Layout::VectorLayout Layout) { +AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { switch (Layout) { - case A64Layout::VL_8B: return ".8b"; - case A64Layout::VL_4H: return ".4h"; - case A64Layout::VL_2S: return ".2s"; - case A64Layout::VL_1D: return ".1d"; - case A64Layout::VL_16B: return ".16b"; - case A64Layout::VL_8H: return ".8h"; - case A64Layout::VL_4S: return ".4s"; - case A64Layout::VL_2D: return ".2d"; - case A64Layout::VL_B: return ".b"; - case A64Layout::VL_H: return ".h"; - case A64Layout::VL_S: return ".s"; - case A64Layout::VL_D: return ".d"; + case 
AArch64Layout::VL_8B: return ".8b"; + case AArch64Layout::VL_4H: return ".4h"; + case AArch64Layout::VL_2S: return ".2s"; + case AArch64Layout::VL_1D: return ".1d"; + case AArch64Layout::VL_16B: return ".16b"; + case AArch64Layout::VL_8H: return ".8h"; + case AArch64Layout::VL_4S: return ".4s"; + case AArch64Layout::VL_2D: return ".2d"; + case AArch64Layout::VL_B: return ".b"; + case AArch64Layout::VL_H: return ".h"; + case AArch64Layout::VL_S: return ".s"; + case AArch64Layout::VL_D: return ".d"; default: llvm_unreachable("Unknown Vector Layout"); } } -inline static A64Layout::VectorLayout -A64StringToVectorLayout(StringRef LayoutStr) { - return StringSwitch(LayoutStr) - .Case(".8b", A64Layout::VL_8B) - .Case(".4h", A64Layout::VL_4H) - .Case(".2s", A64Layout::VL_2S) - .Case(".1d", A64Layout::VL_1D) - .Case(".16b", A64Layout::VL_16B) - .Case(".8h", A64Layout::VL_8H) - .Case(".4s", A64Layout::VL_4S) - .Case(".2d", A64Layout::VL_2D) - .Case(".b", A64Layout::VL_B) - .Case(".h", A64Layout::VL_H) - .Case(".s", A64Layout::VL_S) - .Case(".d", A64Layout::VL_D) - .Default(A64Layout::Invalid); +inline static AArch64Layout::VectorLayout +AArch64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch(LayoutStr) + .Case(".8b", AArch64Layout::VL_8B) + .Case(".4h", AArch64Layout::VL_4H) + .Case(".2s", AArch64Layout::VL_2S) + .Case(".1d", AArch64Layout::VL_1D) + .Case(".16b", AArch64Layout::VL_16B) + .Case(".8h", AArch64Layout::VL_8H) + .Case(".4s", AArch64Layout::VL_4S) + .Case(".2d", AArch64Layout::VL_2D) + .Case(".b", AArch64Layout::VL_B) + .Case(".h", AArch64Layout::VL_H) + .Case(".s", AArch64Layout::VL_S) + .Case(".d", AArch64Layout::VL_D) + .Default(AArch64Layout::Invalid); } -namespace A64SysReg { +namespace AArch64SysReg { enum SysRegROValues { MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 @@ -396,16 +557,16 @@ namespace A64SysReg { ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_AA64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_AA64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_AA64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_AA64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_AA64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - ID_AA64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_AA64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_AA64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_AA64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_AA64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 + ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 + ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 + ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 + ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 + ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 + ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 + ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 + ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 + ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 @@ -960,38 +1121,45 @@ namespace A64SysReg { ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f // 11 100 1100 1101 111 + ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 + }; + + // Cyclone specific system registers + 
enum CycloneSysRegValues { + CPM_IOACC_CTL_EL3 = 0xff90 }; - // Note that these do not inherit from NamedImmMapper. This class is + // Note that these do not inherit from AArch64NamedImmMapper. This class is // sufficiently different in its behaviour that I don't believe it's worth - // burdening the common NamedImmMapper with abstractions only needed in + // burdening the common AArch64NamedImmMapper with abstractions only needed in // this one case. struct SysRegMapper { - static const NamedImmMapper::Mapping SysRegPairs[]; + static const AArch64NamedImmMapper::Mapping SysRegPairs[]; + static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; - const NamedImmMapper::Mapping *InstPairs; + const AArch64NamedImmMapper::Mapping *InstPairs; size_t NumInstPairs; + uint64_t FeatureBits; - SysRegMapper() {} + SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } uint32_t fromString(StringRef Name, bool &Valid) const; std::string toString(uint32_t Bits, bool &Valid) const; }; struct MSRMapper : SysRegMapper { - static const NamedImmMapper::Mapping MSRPairs[]; - MSRMapper(); + static const AArch64NamedImmMapper::Mapping MSRPairs[]; + MSRMapper(uint64_t FeatureBits); }; struct MRSMapper : SysRegMapper { - static const NamedImmMapper::Mapping MRSPairs[]; - MRSMapper(); + static const AArch64NamedImmMapper::Mapping MRSPairs[]; + MRSMapper(uint64_t FeatureBits); }; uint32_t ParseGenericRegister(StringRef Name, bool &Valid); } -namespace A64TLBI { +namespace AArch64TLBI { enum TLBIValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 @@ -1028,7 +1196,7 @@ namespace A64TLBI { VAALE1 = 0x443f // 01 000 1000 0111 111 }; - struct TLBIMapper : NamedImmMapper { + struct TLBIMapper : AArch64NamedImmMapper { const static Mapping TLBIPairs[]; TLBIMapper(); @@ -1051,88 +1219,62 @@ namespace A64TLBI { return true; } } -} +} namespace AArch64II { - + /// Target Operand Flag enum. enum TOF { - //===--------------------------------------------------------------===// + //===------------------------------------------------------------------===// // AArch64 Specific MachineOperand flags. MO_NO_FLAG, - // MO_GOT - Represents a relocation referring to the GOT entry of a given - // symbol. Used in adrp. - MO_GOT, - - // MO_GOT_LO12 - Represents a relocation referring to the low 12 bits of the - // GOT entry of a given symbol. Used in ldr only. - MO_GOT_LO12, - - // MO_DTPREL_* - Represents a relocation referring to the offset from a - // module's dynamic thread pointer. Used in the local-dynamic TLS access - // model. - MO_DTPREL_G1, - MO_DTPREL_G0_NC, - - // MO_GOTTPREL_* - Represents a relocation referring to a GOT entry - // providing the offset of a variable from the thread-pointer. Used in - // initial-exec TLS model where this offset is assigned in the static thread - // block and thus known by the dynamic linker. - MO_GOTTPREL, - MO_GOTTPREL_LO12, - - // MO_TLSDESC_* - Represents a relocation referring to a GOT entry providing - // a TLS descriptor chosen by the dynamic linker. Used for the - // general-dynamic and local-dynamic TLS access models where very littls is - // known at link-time. - MO_TLSDESC, - MO_TLSDESC_LO12, - - // MO_TPREL_* - Represents a relocation referring to the offset of a - // variable from the thread pointer itself. Used in the local-exec TLS - // access model. - MO_TPREL_G1, - MO_TPREL_G0_NC, - - // MO_LO12 - On a symbol operand, this represents a relocation containing - // lower 12 bits of the address. Used in add/sub/ldr/str. 
- MO_LO12, - - // MO_ABS_G* - Represent the 16-bit granules of an absolute reference using - // movz/movk instructions. - MO_ABS_G3, - MO_ABS_G2_NC, - MO_ABS_G1_NC, - MO_ABS_G0_NC + MO_FRAGMENT = 0x7, + + /// MO_PAGE - A symbol operand with this flag represents the pc-relative + /// offset of the 4K page containing the symbol. This is used with the + /// ADRP instruction. + MO_PAGE = 1, + + /// MO_PAGEOFF - A symbol operand with this flag represents the offset of + /// that symbol within a 4K page. This offset is added to the page address + /// to produce the complete address. + MO_PAGEOFF = 2, + + /// MO_G3 - A symbol operand with this flag (granule 3) represents the high + /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G3 = 3, + + /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits + /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G2 = 4, + + /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits + /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G1 = 5, + + /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits + /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G0 = 6, + + /// MO_GOT - This flag indicates that a symbol operand represents the + /// address of the GOT entry for the symbol, rather than the address of + /// the symbol itself. + MO_GOT = 8, + + /// MO_NC - Indicates whether the linker is expected to check the symbol + /// reference for overflow. For example in an ADRP/ADD pair of relocations + /// the ADRP usually does check, but not the ADD. + MO_NC = 0x10, + + /// MO_TLS - Indicates that the operand being accessed is some kind of + /// thread-local symbol. On Darwin, only one type of thread-local access + /// exists (pre linker-relaxation), but on ELF the TLSModel used for the + /// referee will affect interpretation. + MO_TLS = 0x20 }; -} - -class APFloat; - -namespace A64Imms { - bool isFPImm(const APFloat &Val, uint32_t &Imm8Bits); - - inline bool isFPImm(const APFloat &Val) { - uint32_t Imm8; - return isFPImm(Val, Imm8); - } - - bool isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits); - bool isLogicalImmBits(unsigned RegWidth, uint32_t Bits, uint64_t &Imm); - - bool isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); - bool isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); - - // We sometimes want to know whether the immediate is representable with a - // MOVN but *not* with a MOVZ (because that would take priority). 
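
// Illustration only (not part of the patch): how the reworked target-operand
// flags above compose. The low three bits (MO_FRAGMENT) select which fragment
// of the address is wanted, while MO_GOT/MO_NC/MO_TLS are independent modifier
// bits. The values are copied from the enum in the hunk above.
#include <cassert>

enum { MO_FRAGMENT = 0x7, MO_PAGE = 1, MO_PAGEOFF = 2, MO_GOT = 8, MO_NC = 0x10 };

int main() {
  unsigned Flags = MO_PAGEOFF | MO_NC;          // e.g. the ADD of an ADRP/ADD pair
  assert((Flags & MO_FRAGMENT) == MO_PAGEOFF);  // fragment selector
  assert((Flags & MO_NC) != 0);                 // linker overflow check suppressed
  assert((Flags & MO_GOT) == 0);                // direct, not GOT-indirect
  return 0;
}
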
- bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); - - uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits); - bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, - unsigned &ShiftOnesIn); - } +} // end namespace AArch64II -} // end namespace llvm; +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt index 4acecc935e2a..bcefeb672f76 100644 --- a/lib/Target/AArch64/Utils/LLVMBuild.txt +++ b/lib/Target/AArch64/Utils/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch646/Utils/LLVMBuild.txt ----------------*- Conf -*--===; +;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile index 0f4a64527123..0b80f82f2b99 100644 --- a/lib/Target/AArch64/Utils/Makefile +++ b/lib/Target/AArch64/Utils/Makefile @@ -9,7 +9,8 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64Utils -# Hack: we need to include 'main' AArch64 target directory to grab private headers -#CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. +# Hack: we need to include 'main' AArch64 target directory to grab private +# headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 2b97e015bf41..55e9fe5f5c57 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -671,6 +671,20 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitFPU(ARM::VFPV2); } + if (TM.getRelocationModel() == Reloc::PIC_) { + // PIC specific attributes. + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data, + ARMBuildAttrs::AddressRWPCRel); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data, + ARMBuildAttrs::AddressROPCRel); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use, + ARMBuildAttrs::AddressGOT); + } else { + // Allow direct addressing of imported data for all other relocation models. + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use, + ARMBuildAttrs::AddressDirect); + } + // Signal various FP modes. if (!TM.Options.UnsafeFPMath) { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index bc266e88b2df..eec5d14100c2 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -102,14 +103,15 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) // Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl // currently defaults to no prepass hazard recognizer. 
-ScheduleHazardRecognizer *ARMBaseInstrInfo:: -CreateTargetHazardRecognizer(const TargetMachine *TM, - const ScheduleDAG *DAG) const { +ScheduleHazardRecognizer * +ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { if (usePreRAHazardRecognizer()) { - const InstrItineraryData *II = TM->getInstrItineraryData(); + const InstrItineraryData *II = + &static_cast(STI)->getInstrItineraryData(); return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched"); } - return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG); + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); } ScheduleHazardRecognizer *ARMBaseInstrInfo:: @@ -4358,6 +4360,29 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, MI->addRegisterKilled(DReg, TRI, true); } +void ARMBaseInstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + if (Subtarget.isThumb()) + Branch.setOpcode(ARM::tB); + else if (Subtarget.isThumb2()) + Branch.setOpcode(ARM::t2B); + else + Branch.setOpcode(ARM::Bcc); + + Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); + Branch.addOperand(MCOperand::CreateImm(ARMCC::AL)); + Branch.addOperand(MCOperand::CreateReg(0)); +} + +void ARMBaseInstrInfo::getTrap(MCInst &MI) const { + if (Subtarget.isThumb()) + MI.setOpcode(ARM::tTRAP); + else if (Subtarget.useNaClTrap()) + MI.setOpcode(ARM::TRAPNaCl); + else + MI.setOpcode(ARM::TRAP); +} + bool ARMBaseInstrInfo::hasNOP() const { return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0; } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 4b3e74023ac9..b8d675806b25 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -50,7 +50,7 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * - CreateTargetHazardRecognizer(const TargetMachine *TM, + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * @@ -229,6 +229,13 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { const TargetRegisterInfo*) const override; void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned, const TargetRegisterInfo *TRI) const override; + + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + /// Get the number of addresses by LDM or VLDM or zero for unknown. unsigned getNumLDMAddresses(const MachineInstr *MI) const; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index a2eee9ff3048..cdd91c7a7036 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -45,9 +45,12 @@ using namespace llvm; ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti) : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) { - if (STI.isTargetMachO()) - FramePtr = ARM::R7; - else if (STI.isTargetWindows()) + if (STI.isTargetMachO()) { + if (STI.isTargetDarwin() || STI.isThumb1Only()) + FramePtr = ARM::R7; + else + FramePtr = ARM::R11; + } else if (STI.isTargetWindows()) FramePtr = ARM::R11; else // ARM EABI FramePtr = STI.isThumb() ? 
ARM::R7 : ARM::R11; diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 8e0fd8935282..dc41c1c14bbb 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -177,9 +177,8 @@ static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { SmallVectorImpl &PendingHAMembers = State.getPendingLocs(); - // AAPCS HFAs must have 1-4 elements, all of the same type - assert(PendingHAMembers.size() < 4); + assert(PendingHAMembers.size() < 8); if (PendingHAMembers.size() > 0) assert(PendingHAMembers[0].getLocVT() == LocVT); @@ -189,7 +188,7 @@ static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); if (ArgFlags.isInConsecutiveRegsLast()) { - assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 && + assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 8 && "Homogeneous aggregates must have between 1 and 4 members"); // Try to allocate a contiguous block of registers, each of the correct @@ -197,6 +196,7 @@ static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT, const uint16_t *RegList; unsigned NumRegs; switch (LocVT.SimpleTy) { + case MVT::i32: case MVT::f32: RegList = SRegList; NumRegs = 16; @@ -235,11 +235,20 @@ static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State.AllocateReg(SRegList[regNo]); unsigned Size = LocVT.getSizeInBits() / 8; - unsigned Align = LocVT.SimpleTy == MVT::v2f64 ? 8 : Size; + unsigned Align = Size; + + if (LocVT.SimpleTy == MVT::v2f64 || LocVT.SimpleTy == MVT::i32) { + // Vectors are always aligned to 8 bytes. If we've seen an i32 here + // it's because it's been split from a larger type, also with align 8. + Align = 8; + } for (auto It : PendingHAMembers) { It.convertToMem(State.AllocateStack(Size, Align)); State.addLoc(It); + + // Only the first member needs to be aligned. 
+ Align = 1; } // All pending members have now been allocated diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 93357fe525ad..6045738e2e34 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -22,6 +22,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/CommandLine.h" @@ -697,9 +698,6 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16Opc = ARM::MOVTi16; } - if (RequiresBundling) - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(TargetOpcode::BUNDLE)); - LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) @@ -735,10 +733,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); - if (RequiresBundling) { - LO16->bundleWithPred(); - HI16->bundleWithPred(); - } + if (RequiresBundling) + finalizeBundle(MBB, &*LO16, &*MBBI); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index c0f8a8d90253..6888ae994c58 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -220,7 +220,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetMachO()) { + if (STI.isTargetDarwin()) { GPRCS2Size += 4; break; } @@ -380,7 +380,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetMachO()) + if (STI.isTargetDarwin()) break; // fallthrough case ARM::R0: @@ -445,7 +445,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetMachO()) { + if (STI.isTargetDarwin()) { unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned Offset = MFI->getObjectOffset(FI); unsigned CFIIndex = MMI.addFrameInst( @@ -810,7 +810,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, unsigned LastReg = 0; for (; i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (!(Func)(Reg, STI.isTargetMachO())) continue; + if (!(Func)(Reg, STI.isTargetDarwin())) continue; // D-registers in the aligned area DPRCS2 are NOT spilled here. if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) @@ -888,7 +888,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, bool DeleteRet = false; for (; i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (!(Func)(Reg, STI.isTargetMachO())) continue; + if (!(Func)(Reg, STI.isTargetDarwin())) continue; // The aligned reloads from area DPRCS2 are not inserted here. 
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) @@ -1438,7 +1438,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (Spilled) { NumGPRSpills++; - if (!STI.isTargetMachO()) { + if (!STI.isTargetDarwin()) { if (Reg == ARM::LR) LRSpilled = true; CS1Spilled = true; @@ -1460,7 +1460,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, break; } } else { - if (!STI.isTargetMachO()) { + if (!STI.isTargetDarwin()) { UnspilledCS1GPRs.push_back(Reg); continue; } @@ -1746,6 +1746,12 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { ARMFunctionInfo *ARMFI = MF.getInfo(); DebugLoc DL; + uint64_t StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + // Use R4 and R5 as scratch registers. // We save R4 and R5 before use and restore them before leaving the function. unsigned ScratchReg0 = ARM::R4; @@ -1775,8 +1781,6 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.push_front(PrevStackMBB); // The required stack size that is aligned to ARM constant criterion. - uint64_t StackSize = MFI->getStackSize(); - AlignedStackSize = alignToARMConstant(StackSize); // When the frame size is less than 256 we just compare the stack diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 8577c8af47d5..08d598d7c5a6 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -73,6 +73,13 @@ class ARMDAGToDAGISel : public SelectionDAGISel { Subtarget(&TM.getSubtarget()) { } + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + Subtarget = &TM.getSubtarget(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + const char *getPassName() const override { return "ARM Instruction Selection"; } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 5beb752d3a42..4a762ce6d3bd 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -155,16 +155,16 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } -static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { - if (TM.getSubtarget().isTargetMachO()) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) return new TargetLoweringObjectFileMachO(); - if (TM.getSubtarget().isTargetWindows()) + if (TT.isOSWindows()) return new TargetLoweringObjectFileCOFF(); return new ARMElfTargetObjectFile(); } ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget(); RegInfo = TM.getRegisterInfo(); Itins = TM.getInstrItineraryData(); @@ -710,7 +710,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setExceptionSelectorRegister(ARM::R1); } - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. 
if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { @@ -983,6 +987,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; + case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; case ARMISD::VCGE: return "ARMISD::VCGE"; @@ -1199,7 +1205,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; - else if (Subtarget->hasVFP2() && + else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) return CallingConv::ARM_AAPCS_VFP; @@ -1207,10 +1213,10 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, return CallingConv::ARM_AAPCS; case CallingConv::Fast: if (!Subtarget->isAAPCS_ABI()) { - if (Subtarget->hasVFP2() && !isVarArg) + if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; return CallingConv::ARM_APCS; - } else if (Subtarget->hasVFP2() && !isVarArg) + } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; @@ -1598,8 +1604,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMFunctionInfo *AFI = MF.getInfo(); if (EnableARMLongCalls) { - assert (getTargetMachine().getRelocationModel() == Reloc::Static - && "long-calls with non-static relocation model!"); + assert((Subtarget->isTargetWindows() || + getTargetMachine().getRelocationModel() == Reloc::Static) && + "long-calls with non-static relocation model!"); // Handle a global address or an external symbol. If it's not one of // those, the target's already in a register, so we don't need to do // anything extra. @@ -2535,6 +2542,11 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::arm_rbit: { + assert(Op.getOperand(0).getValueType() == MVT::i32 && + "RBIT intrinsic must have i32 type!"); + return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(0)); + } case Intrinsic::arm_thread_pointer: { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -6213,6 +6225,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: + if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) + return LowerDYNAMIC_STACKALLOC(Op, DAG); + llvm_unreachable("Don't know how to custom lower this!"); } } @@ -7111,6 +7127,73 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, return BB; } +MachineBasicBlock * +ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, + MachineBasicBlock *MBB) const { + const TargetMachine &TM = getTargetMachine(); + const TargetInstrInfo &TII = *TM.getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + assert(Subtarget->isTargetWindows() && + "__chkstk is only supported on Windows"); + assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); + + // __chkstk takes the number of words to allocate on the stack in R4, and + // returns the stack adjustment in number of bytes in R4. 
This will not + clobber any other registers (other than the obvious lr). + // + // Although, technically, IP should be considered a register which may be + // clobbered, the call itself will not touch it. Windows on ARM is a pure + // thumb-2 environment, so there is no interworking required. As a result, we + // do not expect a veneer to be emitted by the linker, clobbering IP. + // + // Each module receives its own copy of __chkstk, so no import thunk is + // required, again, ensuring that IP is not clobbered. + // + // Finally, although some linkers may theoretically provide a trampoline for + // out of range calls (which is quite common due to a 32M range limitation of + // branches for Thumb), we can generate the long-call version via + // -mcmodel=large, alleviating the need for the trampoline which may clobber + // IP. + + switch (TM.getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Default: + case CodeModel::Kernel: + BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addExternalSymbol("__chkstk") + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + break; + case CodeModel::Large: + case CodeModel::JITDefault: { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + + BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) + .addExternalSymbol("__chkstk"); + BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addReg(Reg, RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + break; + } + } + + AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), + ARM::SP) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::R4, RegState::Kill))); + + MI->eraseFromParent(); + return MBB; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -7360,6 +7443,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::COPY_STRUCT_BYVAL_I32: ++NumLoopByVals; return EmitStructByval(MI, BB); + case ARM::WIN__CHKSTK: + return EmitLowered__chkstk(MI, BB); } } @@ -8315,6 +8400,8 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); + if (DCI.DAG.getTargetLoweringInfo().isBigEndian()) + std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); DCI.RemoveFromWorklist(LD); DAG.DeleteNode(LD); @@ -8382,7 +8469,8 @@ static SDValue PerformSTORECombine(SDNode *N, SDLoc DL(St); SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio; + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio; // Can't shuffle using an illegal type. 
if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); @@ -10478,6 +10566,32 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { return CallInfo.first; } +SDValue +ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && "unsupported target platform"); + SDLoc DL(Op); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, + DAG.getConstant(2, MVT::i32)); + + SDValue Flag; + Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); + Flag = Chain.getValue(1); + + SDVTList NodeTys = DAG.getVTList(MVT::i32, MVT::Glue); + Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); + + SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); + Chain = NewSP.getValue(1); + + SDValue Ops[2] = { NewSP, Chain }; + return DAG.getMergeValues(Ops, DL); +} + bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. @@ -10635,14 +10749,20 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 64; + // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit + // anything for those. + bool IsMClass = Subtarget->isMClass(); + if (StoreInst *SI = dyn_cast(Inst)) { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return Size == 64 && !IsMClass; + } else if (LoadInst *LI = dyn_cast(Inst)) { + return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass; + } - // For the real atomic operations, we have ldrex/strex up to 64 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 64; + // For the real atomic operations, we have ldrex/strex up to 32 bits, + // and up to 64 bits on the non-M profiles + unsigned AtomicLimit = IsMClass ? 32 : 64; + return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit; } Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, @@ -10778,14 +10898,13 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate. 
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { - if (getEffectiveCallingConv(CallConv, isVarArg) == - CallingConv::ARM_AAPCS_VFP) { - HABaseType Base = HA_UNKNOWN; - uint64_t Members = 0; - bool result = isHomogeneousAggregate(Ty, Base, Members); - DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n"); - return result; - } else { + if (getEffectiveCallingConv(CallConv, isVarArg) != + CallingConv::ARM_AAPCS_VFP) return false; - } + + HABaseType Base = HA_UNKNOWN; + uint64_t Members = 0; + bool result = isHomogeneousAggregate(Ty, Base, Members); + DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n"); + return result; } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index c15305c5e603..1ace0f330f1a 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -95,6 +95,8 @@ namespace llvm { PRELOAD, // Preload + WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. @@ -470,6 +472,7 @@ namespace llvm { const ARMSubtarget *ST) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; unsigned getRegisterByName(const char* RegName, EVT VT) const override; @@ -578,6 +581,9 @@ namespace llvm { MachineBasicBlock *EmitStructByval(MachineInstr *MI, MachineBasicBlock *MBB) const; + + MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index f642893161c3..03eac2e0c84f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -493,7 +493,7 @@ def neon_vcvt_imm32 : Operand { // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. def rot_imm_XFORM: SDNodeXForm<imm, [{ switch (N->getZExtValue()){ - default: assert(0); + default: llvm_unreachable(nullptr); case 0: return CurDAG->getTargetConstant(0, MVT::i32); case 8: return CurDAG->getTargetConstant(1, MVT::i32); case 16: return CurDAG->getTargetConstant(2, MVT::i32); @@ -1969,7 +1969,7 @@ def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", // A8.8.247 UDF - Undefined (Encoding A1) def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary, - "udf", "\t$imm16", []> { + "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> { bits<16> imm16; let Inst{31-28} = 0b1110; // AL let Inst{27-25} = 0b011; @@ -5093,6 +5093,19 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary, let Inst{11-0} = a; } +// Dynamic stack allocation yields a _chkstk for Windows targets. These calls +// are needed to probe the stack when allocating more than +// 4k bytes in one go. Touching the stack at 4K increments is necessary to +// ensure that the guard pages used by the OS virtual memory manager are +// allocated in correct sequence. +// The main point of having a separate instruction is the extra unmodelled effects +// (compared to ordinary calls) like stack pointer change.
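// As a rough sketch of the overall path (illustrative only; the exact
// instructions depend on the code model, as in EmitLowered__chkstk above):
// for a dynamic allocation of Size bytes, LowerDYNAMIC_STACKALLOC puts
// Size >> 2 (the word count) into R4, the WIN__CHKSTK pseudo below expands to
// a call to __chkstk, which hands the byte adjustment back in R4, and the
// stack pointer is then updated with a "sub sp, sp, r4" before the new SP
// value is returned.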
+ +def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, + [SDNPHasChain, SDNPSideEffect]>; +let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in + def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; + //===----------------------------------------------------------------------===// // TLS Instructions // diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index ff3832d98b5e..e17f73af03ec 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1194,8 +1194,8 @@ def tTST : // A8.6.230 Sched<[WriteALU]>; // A8.8.247 UDF - Undefined (Encoding T1) -def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8", []>, - Encoding16 { +def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8", + [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 { bits<8> imm8; let Inst{15-12} = 0b1101; let Inst{11-8} = 0b1110; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 28f528a510e9..c30d6abbb299 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2408,8 +2408,8 @@ def t2UBFX: T2TwoRegBitFI< } // A8.8.247 UDF - Undefined (Encoding T2) -def t2UDF - : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16", []> { +def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16", + [(int_arm_undefined imm0_65535:$imm16)]> { bits<16> imm16; let Inst{31-29} = 0b111; let Inst{28-27} = 0b10; diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ee7df5476c71..a03bcdbddd79 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -505,7 +505,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // Exception: If the base register is in the input reglist, Thumb1 LDM is // non-writeback. Check for this. - if (Opcode == ARM::tLDRi && isThumb1) + if (Opcode == ARM::tLDMIA && isThumb1) for (unsigned I = 0; I < NumRegs; ++I) if (Base == Regs[I].first) { Writeback = false; @@ -519,17 +519,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // Update tLDMIA with writeback if necessary. Opcode = ARM::tLDMIA_UPD; - // The base isn't dead after a merged instruction with writeback. Update - // future uses of the base with the added offset (if possible), or reset - // the base register as necessary. - if (!BaseKill) - UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); - MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); // Thumb1: we might need to set base writeback when building the MI. MIB.addReg(Base, getDefRegState(true)) .addReg(Base, getKillRegState(BaseKill)); + + // The base isn't dead after a merged instruction with writeback. Update + // future uses of the base with the added offset (if possible), or reset + // the base register as necessary. + if (!BaseKill) + UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); } else { // No writeback, simply build the MachineInstr. 
MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); @@ -1734,6 +1734,12 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; + // FIXME: Temporarily disabling for Thumb-1 due to miscompiles + if (isThumb1) { + delete RS; + return false; + } + bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index d7ec6eba941c..7934761af25b 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -220,7 +220,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) - assert(0 && "Duplicate entries!"); + llvm_unreachable("Duplicate entries!"); } unsigned getOriginalCPIdx(unsigned CloneIdx) const { diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 008ad640a476..c3a9131fb767 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -18,10 +18,8 @@ using namespace llvm; #define DEBUG_TYPE "arm-selectiondag-info" -ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) { -} +ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} ARMSelectionDAGInfo::~ARMSelectionDAGInfo() { } @@ -34,6 +32,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { + const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget(); // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. if ((Align & 3) != 0) @@ -44,7 +43,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); unsigned BytesLeft = SizeVal & 3; @@ -54,7 +53,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MAX_LOADS_IN_LDM = Subtarget->isThumb1Only() ? 4 : 6; + const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 
4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; @@ -151,9 +150,10 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { + const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget(); // Use default for non-AAPCS (or MachO) subtargets - if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO() || - Subtarget->isTargetWindows()) + if (!Subtarget.isAAPCS_ABI() || Subtarget.isTargetMachO() || + Subtarget.isTargetWindows()) return SDValue(); const ARMTargetLowering &TLI = diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h index 8c2397b52188..13769dc8efe0 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -36,12 +36,8 @@ namespace ARM_AM { } // end namespace ARM_AM class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when generating code for different targets. - const ARMSubtarget *Subtarget; - public: - explicit ARMSelectionDAGInfo(const TargetMachine &TM); + explicit ARMSelectionDAGInfo(const DataLayout &DL); ~ARMSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 1c9dd4d49d6a..fc842512ef00 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -76,22 +76,80 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), "Allow IT blocks based on ARMv7"), clEnumValEnd)); -ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool IsLittle, - const TargetOptions &Options) - : ARMGenSubtargetInfo(TT, CPU, FS) - , ARMProcFamily(Others) - , ARMProcClass(None) - , stackAlignment(4) - , CPUString(CPU) - , IsLittle(IsLittle) - , TargetTriple(TT) - , Options(Options) - , TargetABI(ARM_ABI_UNKNOWN) { +static std::string computeDataLayout(ARMSubtarget &ST) { + std::string Ret = ""; + + if (ST.isLittle()) + // Little endian. + Ret += "e"; + else + // Big endian. + Ret += "E"; + + Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); + + // Pointers are 32 bits and aligned to 32 bits. + Ret += "-p:32:32"; + + // On thumb, i16, i8 and i1 have natural alignment requirements, but we try to + // align to 32. + if (ST.isThumb()) + Ret += "-i1:8:32-i8:8:32-i16:16:32"; + + // ABIs other than APCS have 64 bit integers with natural alignment. + if (!ST.isAPCS_ABI()) + Ret += "-i64:64"; + + // We have 64 bit floats. The APCS ABI requires them to be aligned to 32 + // bits, others to 64 bits. We always try to align to 64 bits. + if (ST.isAPCS_ABI()) + Ret += "-f64:32:64"; + + // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others + // to 64. We always try to give them natural alignment. + if (ST.isAPCS_ABI()) + Ret += "-v64:32:64-v128:32:128"; + else + Ret += "-v128:64:128"; + + // On thumb and APCS, only try to align aggregates to 32 bits (the default is + // 64 bits). + if (ST.isThumb() || ST.isAPCS_ABI()) + Ret += "-a:0:32"; + + // Integer registers are 32 bits. + Ret += "-n32"; + + // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit + // aligned everywhere else.
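// As a worked example (assuming an ELF mangling component of "m:e"), a
// little-endian Thumb AAPCS subtarget would end up with roughly:
//   "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
// where the trailing -S64 comes from the AAPCS case below.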
+ if (ST.isTargetNaCl()) + Ret += "-S128"; + else if (ST.isAAPCS_ABI()) + Ret += "-S64"; + else + Ret += "-S32"; + + return Ret; +} + +/// initializeSubtargetDependencies - Initializes using a CPU and feature string +/// so that we can use initializer lists for subtarget initialization. +ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); + return *this; } +ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool IsLittle, + const TargetOptions &Options) + : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), + TargetTriple(TT), Options(Options), TargetABI(ARM_ABI_UNKNOWN), + DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))), + TSInfo(DL) {} + void ARMSubtarget::initializeEnvironment() { HasV4TOps = false; HasV5TOps = false; @@ -352,6 +410,13 @@ bool ARMSubtarget::hasSinCos() const { !getTargetTriple().isOSVersionLT(7, 0); } +// Enable the PostMachineScheduler if the target selects it instead of +// PostRAScheduler. Currently only available on the command line via +// -misched-postra. +bool ARMSubtarget::enablePostMachineScheduler() const { + return PostRAScheduler; +} + bool ARMSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index fd106f6fc865..7da80ec0d494 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -14,8 +14,10 @@ #ifndef ARMSUBTARGET_H #define ARMSUBTARGET_H +#include "ARMSelectionDAGInfo.h" #include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/ADT/Triple.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" #include @@ -247,7 +249,18 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// \brief Reset the features for the ARM target. void resetSubtargetFeatures(const MachineFunction *MF) override; + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization. + ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + + const DataLayout *getDataLayout() const { return &DL; } + const ARMSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + private: + const DataLayout DL; + ARMSelectionDAGInfo TSInfo; + void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); public: @@ -379,7 +392,12 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isR9Reserved() const { return IsR9Reserved; } - bool useMovt() const { return UseMovt && !isMinSize(); } + bool useMovt() const { + // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit + // immediates as it is inherently position independent, and may be out of + // range otherwise. + return UseMovt && (isTargetWindows() || !isMinSize()); + } bool supportsTailCall() const { return SupportsTailCall; } bool allowsUnalignedMem() const { return AllowsUnalignedMem; } @@ -396,6 +414,9 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// compiler runtime or math libraries. bool hasSinCos() const; + /// True for some subtargets at > -O0. + bool enablePostMachineScheduler() const; + /// enablePostRAScheduler - True at 'More' optimization. 
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 887622705ed2..a93824230d70 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -28,6 +28,12 @@ DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden, cl::desc("Inhibit optimization of S->D register accesses on A15"), cl::init(false)); +static cl::opt +EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden, + cl::desc("Run SimplifyCFG after expanding atomic operations" + " to make use of cmpxchg flow-based information"), + cl::init(true)); + extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine X(TheARMLETarget); @@ -46,9 +52,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool isLittle) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle, Options), - JITInfo(), - InstrItins(Subtarget.getInstrItineraryData()) { + Subtarget(TT, CPU, FS, isLittle, Options), JITInfo() { // Default to triple-appropriate float ABI if (Options.FloatABIType == FloatABI::Default) @@ -67,62 +71,6 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) { void ARMTargetMachine::anchor() { } -static std::string computeDataLayout(ARMSubtarget &ST) { - std::string Ret = ""; - - if (ST.isLittle()) - // Little endian. - Ret += "e"; - else - // Big endian. - Ret += "E"; - - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); - - // Pointers are 32 bits and aligned to 32 bits. - Ret += "-p:32:32"; - - // On thumb, i16,i18 and i1 have natural aligment requirements, but we try to - // align to 32. - if (ST.isThumb()) - Ret += "-i1:8:32-i8:8:32-i16:16:32"; - - // ABIs other than APCS have 64 bit integers with natural alignment. - if (!ST.isAPCS_ABI()) - Ret += "-i64:64"; - - // We have 64 bits floats. The APCS ABI requires them to be aligned to 32 - // bits, others to 64 bits. We always try to align to 64 bits. - if (ST.isAPCS_ABI()) - Ret += "-f64:32:64"; - - // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others - // to 64. We always ty to give them natural alignment. - if (ST.isAPCS_ABI()) - Ret += "-v64:32:64-v128:32:128"; - else - Ret += "-v128:64:128"; - - // On thumb and APCS, only try to align aggregates to 32 bits (the default is - // 64 bits). - if (ST.isThumb() || ST.isAPCS_ABI()) - Ret += "-a:0:32"; - - // Integer registers are 32 bits. - Ret += "-n32"; - - // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit - // aligned everywhere else. - if (ST.isTargetNaCl()) - Ret += "-S128"; - else if (ST.isAAPCS_ABI()) - Ret += "-S64"; - else - Ret += "-S32"; - - return Ret; -} - ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -131,9 +79,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, bool isLittle) : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle), InstrInfo(Subtarget), - DL(computeDataLayout(Subtarget)), TLInfo(*this), - TSInfo(*this), FrameLowering(Subtarget) { initAsmInfo(); if (!Subtarget.hasARMOps()) @@ -171,9 +117,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, InstrInfo(Subtarget.hasThumb2() ? 
((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), - DL(computeDataLayout(Subtarget)), TLInfo(*this), - TSInfo(*this), FrameLowering(Subtarget.hasThumb2() ? new ARMFrameLowering(Subtarget) : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) { @@ -213,6 +157,7 @@ class ARMPassConfig : public TargetPassConfig { return *getARMTargetMachine().getSubtargetImpl(); } + void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -225,11 +170,22 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { return new ARMPassConfig(this, PM); } -bool ARMPassConfig::addPreISel() { +void ARMPassConfig::addIRPasses() { const ARMSubtarget *Subtarget = &getARMSubtarget(); - if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) + if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) { addPass(createAtomicExpandLoadLinkedPass(TM)); + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) + addPass(createCFGSimplificationPass()); + } + + TargetPassConfig::addIRPasses(); +} + +bool ARMPassConfig::addPreISel() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createGlobalMergePass(TM)); diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 664c992ea165..154927786082 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -32,9 +32,9 @@ namespace llvm { class ARMBaseTargetMachine : public LLVMTargetMachine { protected: ARMSubtarget Subtarget; + private: ARMJITInfo JITInfo; - InstrItineraryData InstrItins; public: ARMBaseTargetMachine(const Target &T, StringRef TT, @@ -51,9 +51,11 @@ class ARMBaseTargetMachine : public LLVMTargetMachine { llvm_unreachable("getTargetLowering not implemented"); } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - /// \brief Register ARM analysis passes with a pass manager. void addAnalysisPasses(PassManagerBase &PM) override; @@ -68,9 +70,7 @@ class ARMBaseTargetMachine : public LLVMTargetMachine { class ARMTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); ARMInstrInfo InstrInfo; - const DataLayout DL; // Calculates type size & alignment ARMTargetLowering TLInfo; - ARMSelectionDAGInfo TSInfo; ARMFrameLowering FrameLowering; public: ARMTargetMachine(const Target &T, StringRef TT, @@ -89,13 +89,12 @@ class ARMTargetMachine : public ARMBaseTargetMachine { } const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } const ARMFrameLowering *getFrameLowering() const override { return &FrameLowering; } const ARMInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } }; /// ARMLETargetMachine - ARM little endian target machine. @@ -128,9 +127,7 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); // Either Thumb1InstrInfo or Thumb2InstrInfo. 
std::unique_ptr InstrInfo; - const DataLayout DL; // Calculates type size & alignment ARMTargetLowering TLInfo; - ARMSelectionDAGInfo TSInfo; // Either Thumb1FrameLowering or ARMFrameLowering. std::unique_ptr FrameLowering; public: @@ -151,7 +148,7 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { } const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } /// returns either Thumb1InstrInfo or Thumb2InstrInfo @@ -162,7 +159,6 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { const ARMFrameLowering *getFrameLowering() const override { return FrameLowering.get(); } - const DataLayout *getDataLayout() const override { return &DL; } }; /// ThumbLETargetMachine - Thumb little endian target machine. diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index ebb25f49fa44..57df7da7f310 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; #define DEBUG_TYPE "armtti" // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. namespace llvm { void initializeARMTTIPass(PassRegistry &); diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 5cdf394443a4..6bb41e4bbe69 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -190,11 +190,11 @@ class ARMAsmParser : public MCTargetAsmParser { } int tryParseRegister(); - bool tryParseRegisterWithWriteBack(SmallVectorImpl &); - int tryParseShiftRegister(SmallVectorImpl &); - bool parseRegisterList(SmallVectorImpl &); - bool parseMemory(SmallVectorImpl &); - bool parseOperand(SmallVectorImpl &, StringRef Mnemonic); + bool tryParseRegisterWithWriteBack(OperandVector &); + int tryParseShiftRegister(OperandVector &); + bool parseRegisterList(OperandVector &); + bool parseMemory(OperandVector &); + bool parseOperand(OperandVector &, StringRef Mnemonic); bool parsePrefix(ARMMCExpr::VariantKind &RefKind); bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType, unsigned &ShiftAmount); @@ -282,54 +282,42 @@ class ARMAsmParser : public MCTargetAsmParser { /// } - OperandMatchResultTy parseITCondCode(SmallVectorImpl&); - OperandMatchResultTy parseCoprocNumOperand( - SmallVectorImpl&); - OperandMatchResultTy parseCoprocRegOperand( - SmallVectorImpl&); - OperandMatchResultTy parseCoprocOptionOperand( - SmallVectorImpl&); - OperandMatchResultTy parseMemBarrierOptOperand( - SmallVectorImpl&); - OperandMatchResultTy parseInstSyncBarrierOptOperand( - SmallVectorImpl&); - OperandMatchResultTy parseProcIFlagsOperand( - SmallVectorImpl&); - OperandMatchResultTy parseMSRMaskOperand( - SmallVectorImpl&); - OperandMatchResultTy parsePKHImm(SmallVectorImpl &O, - StringRef Op, int Low, int High); - OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl &O) { + OperandMatchResultTy parseITCondCode(OperandVector &); + OperandMatchResultTy parseCoprocNumOperand(OperandVector &); + OperandMatchResultTy parseCoprocRegOperand(OperandVector &); + OperandMatchResultTy parseCoprocOptionOperand(OperandVector &); + OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &); + OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &); + 
OperandMatchResultTy parseProcIFlagsOperand(OperandVector &); + OperandMatchResultTy parseMSRMaskOperand(OperandVector &); + OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low, + int High); + OperandMatchResultTy parsePKHLSLImm(OperandVector &O) { return parsePKHImm(O, "lsl", 0, 31); } - OperandMatchResultTy parsePKHASRImm(SmallVectorImpl &O) { + OperandMatchResultTy parsePKHASRImm(OperandVector &O) { return parsePKHImm(O, "asr", 1, 32); } - OperandMatchResultTy parseSetEndImm(SmallVectorImpl&); - OperandMatchResultTy parseShifterImm(SmallVectorImpl&); - OperandMatchResultTy parseRotImm(SmallVectorImpl&); - OperandMatchResultTy parseBitfield(SmallVectorImpl&); - OperandMatchResultTy parsePostIdxReg(SmallVectorImpl&); - OperandMatchResultTy parseAM3Offset(SmallVectorImpl&); - OperandMatchResultTy parseFPImm(SmallVectorImpl&); - OperandMatchResultTy parseVectorList(SmallVectorImpl&); + OperandMatchResultTy parseSetEndImm(OperandVector &); + OperandMatchResultTy parseShifterImm(OperandVector &); + OperandMatchResultTy parseRotImm(OperandVector &); + OperandMatchResultTy parseBitfield(OperandVector &); + OperandMatchResultTy parsePostIdxReg(OperandVector &); + OperandMatchResultTy parseAM3Offset(OperandVector &); + OperandMatchResultTy parseFPImm(OperandVector &); + OperandMatchResultTy parseVectorList(OperandVector &); OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc); // Asm Match Converter Methods - void cvtThumbMultiply(MCInst &Inst, - const SmallVectorImpl &); - void cvtThumbBranches(MCInst &Inst, - const SmallVectorImpl &); - - bool validateInstruction(MCInst &Inst, - const SmallVectorImpl &Ops); - bool processInstruction(MCInst &Inst, - const SmallVectorImpl &Ops); - bool shouldOmitCCOutOperand(StringRef Mnemonic, - SmallVectorImpl &Operands); - bool shouldOmitPredicateOperand(StringRef Mnemonic, - SmallVectorImpl &Operands); + void cvtThumbMultiply(MCInst &Inst, const OperandVector &); + void cvtThumbBranches(MCInst &Inst, const OperandVector &); + + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); + bool processInstruction(MCInst &Inst, const OperandVector &Ops); + bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands); + bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands); + public: enum ARMMatchResultTy { Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY, @@ -361,19 +349,17 @@ class ARMAsmParser : public MCTargetAsmParser { // Implementation of the MCTargetAsmParser interface: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; void onLabelParsed(MCSymbol *Symbol) override; }; @@ -545,8 +531,8 @@ class ARMOperand : public MCParsedAsmOperand { struct BitfieldOp Bitfield; }; - ARMOperand(KindTy K) : 
MCParsedAsmOperand(), Kind(K) {} public: + ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { Kind = o.Kind; StartLoc = o.StartLoc; @@ -2481,56 +2467,58 @@ class ARMOperand : public MCParsedAsmOperand { void print(raw_ostream &OS) const override; - static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_ITCondMask); + static std::unique_ptr CreateITMask(unsigned Mask, SMLoc S) { + auto Op = make_unique(k_ITCondMask); Op->ITMask.Mask = Mask; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CondCode); + static std::unique_ptr CreateCondCode(ARMCC::CondCodes CC, + SMLoc S) { + auto Op = make_unique(k_CondCode); Op->CC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CoprocNum); + static std::unique_ptr CreateCoprocNum(unsigned CopVal, SMLoc S) { + auto Op = make_unique(k_CoprocNum); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CoprocReg); + static std::unique_ptr CreateCoprocReg(unsigned CopVal, SMLoc S) { + auto Op = make_unique(k_CoprocReg); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_CoprocOption); + static std::unique_ptr CreateCoprocOption(unsigned Val, SMLoc S, + SMLoc E) { + auto Op = make_unique(k_CoprocOption); Op->Cop.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CCOut); + static std::unique_ptr CreateCCOut(unsigned RegNum, SMLoc S) { + auto Op = make_unique(k_CCOut); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateToken(StringRef Str, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_Token); + static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { + auto Op = make_unique(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -2538,20 +2526,20 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_Register); + static std::unique_ptr CreateReg(unsigned RegNum, SMLoc S, + SMLoc E) { + auto Op = make_unique(k_Register); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, - unsigned SrcReg, - unsigned ShiftReg, - unsigned ShiftImm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_ShiftedRegister); + static std::unique_ptr + CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, + unsigned ShiftReg, unsigned ShiftImm, SMLoc S, + SMLoc E) { + auto Op = make_unique(k_ShiftedRegister); Op->RegShiftedReg.ShiftTy = ShTy; Op->RegShiftedReg.SrcReg = SrcReg; Op->RegShiftedReg.ShiftReg = ShiftReg; @@ -2561,11 +2549,10 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, - unsigned SrcReg, - unsigned ShiftImm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_ShiftedImmediate); + static std::unique_ptr + CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, 
unsigned SrcReg, + unsigned ShiftImm, SMLoc S, SMLoc E) { + auto Op = make_unique(k_ShiftedImmediate); Op->RegShiftedImm.ShiftTy = ShTy; Op->RegShiftedImm.SrcReg = SrcReg; Op->RegShiftedImm.ShiftImm = ShiftImm; @@ -2574,9 +2561,9 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_ShifterImmediate); + static std::unique_ptr CreateShifterImm(bool isASR, unsigned Imm, + SMLoc S, SMLoc E) { + auto Op = make_unique(k_ShifterImmediate); Op->ShifterImm.isASR = isASR; Op->ShifterImm.Imm = Imm; Op->StartLoc = S; @@ -2584,17 +2571,18 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_RotateImmediate); + static std::unique_ptr CreateRotImm(unsigned Imm, SMLoc S, + SMLoc E) { + auto Op = make_unique(k_RotateImmediate); Op->RotImm.Imm = Imm; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor); + static std::unique_ptr + CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) { + auto Op = make_unique(k_BitfieldDescriptor); Op->Bitfield.LSB = LSB; Op->Bitfield.Width = Width; Op->StartLoc = S; @@ -2602,8 +2590,8 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand * - CreateRegList(SmallVectorImpl > &Regs, + static std::unique_ptr + CreateRegList(SmallVectorImpl> &Regs, SMLoc StartLoc, SMLoc EndLoc) { assert (Regs.size() > 0 && "RegList contains no registers?"); KindTy Kind = k_RegisterList; @@ -2617,7 +2605,7 @@ class ARMOperand : public MCParsedAsmOperand { // Sort based on the register encoding values. 
array_pod_sort(Regs.begin(), Regs.end()); - ARMOperand *Op = new ARMOperand(Kind); + auto Op = make_unique(Kind); for (SmallVectorImpl >::const_iterator I = Regs.begin(), E = Regs.end(); I != E; ++I) Op->Registers.push_back(I->second); @@ -2626,9 +2614,11 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateVectorList(unsigned RegNum, unsigned Count, - bool isDoubleSpaced, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_VectorList); + static std::unique_ptr CreateVectorList(unsigned RegNum, + unsigned Count, + bool isDoubleSpaced, + SMLoc S, SMLoc E) { + auto Op = make_unique(k_VectorList); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -2637,10 +2627,10 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateVectorListAllLanes(unsigned RegNum, unsigned Count, - bool isDoubleSpaced, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_VectorListAllLanes); + static std::unique_ptr + CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced, + SMLoc S, SMLoc E) { + auto Op = make_unique(k_VectorListAllLanes); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -2649,11 +2639,10 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateVectorListIndexed(unsigned RegNum, unsigned Count, - unsigned Index, - bool isDoubleSpaced, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_VectorListIndexed); + static std::unique_ptr + CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index, + bool isDoubleSpaced, SMLoc S, SMLoc E) { + auto Op = make_unique(k_VectorListIndexed); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.LaneIndex = Index; @@ -2663,33 +2652,30 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARMOperand *Op = new ARMOperand(k_VectorIndex); + static std::unique_ptr + CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique(k_VectorIndex); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_Immediate); + static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + auto Op = make_unique(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateMem(unsigned BaseRegNum, - const MCConstantExpr *OffsetImm, - unsigned OffsetRegNum, - ARM_AM::ShiftOpc ShiftType, - unsigned ShiftImm, - unsigned Alignment, - bool isNegative, - SMLoc S, SMLoc E, - SMLoc AlignmentLoc = SMLoc()) { - ARMOperand *Op = new ARMOperand(k_Memory); + static std::unique_ptr + CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm, + unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType, + unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S, + SMLoc E, SMLoc AlignmentLoc = SMLoc()) { + auto Op = make_unique(k_Memory); Op->Memory.BaseRegNum = BaseRegNum; Op->Memory.OffsetImm = OffsetImm; Op->Memory.OffsetRegNum = OffsetRegNum; @@ -2703,11 +2689,10 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd, - ARM_AM::ShiftOpc ShiftTy, - unsigned ShiftImm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new 
ARMOperand(k_PostIndexRegister); + static std::unique_ptr + CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy, + unsigned ShiftImm, SMLoc S, SMLoc E) { + auto Op = make_unique(k_PostIndexRegister); Op->PostIdxReg.RegNum = RegNum; Op->PostIdxReg.isAdd = isAdd; Op->PostIdxReg.ShiftTy = ShiftTy; @@ -2717,33 +2702,35 @@ class ARMOperand : public MCParsedAsmOperand { return Op; } - static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_MemBarrierOpt); + static std::unique_ptr CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, + SMLoc S) { + auto Op = make_unique(k_MemBarrierOpt); Op->MBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, - SMLoc S) { - ARMOperand *Op = new ARMOperand(k_InstSyncBarrierOpt); + static std::unique_ptr + CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) { + auto Op = make_unique(k_InstSyncBarrierOpt); Op->ISBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_ProcIFlags); + static std::unique_ptr CreateProcIFlags(ARM_PROC::IFlags IFlags, + SMLoc S) { + auto Op = make_unique(k_ProcIFlags); Op->IFlags.Val = IFlags; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_MSRMask); + static std::unique_ptr CreateMSRMask(unsigned MMask, SMLoc S) { + auto Op = make_unique(k_MSRMask); Op->MMask.Val = MMask; Op->StartLoc = S; Op->EndLoc = S; @@ -2947,8 +2934,7 @@ int ARMAsmParser::tryParseRegister() { // occurs, return -1. An irrecoverable error is one where tokens have been // consumed in the process of trying to parse the shifter (i.e., when it is // indeed a shifter operand, but malformed). -int ARMAsmParser::tryParseShiftRegister( - SmallVectorImpl &Operands) { +int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -2972,7 +2958,8 @@ int ARMAsmParser::tryParseShiftRegister( // The source register for the shift has already been added to the // operand list, so we need to pop it off and combine it into the shifted // register operand instead. - std::unique_ptr PrevOp((ARMOperand*)Operands.pop_back_val()); + std::unique_ptr PrevOp( + (ARMOperand *)Operands.pop_back_val().release()); if (!PrevOp->isReg()) return Error(PrevOp->getStartLoc(), "shift must be of a register"); int SrcReg = PrevOp->getReg(); @@ -3049,8 +3036,7 @@ int ARMAsmParser::tryParseShiftRegister( /// /// TODO this is likely to change to allow different register types and or to /// parse for a specific register type. -bool ARMAsmParser:: -tryParseRegisterWithWriteBack(SmallVectorImpl &Operands) { +bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) { const AsmToken &RegTok = Parser.getTok(); int RegNo = tryParseRegister(); if (RegNo == -1) @@ -3136,8 +3122,8 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { } /// parseITCondCode - Try to parse a condition code for an IT instruction. 
-ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseITCondCode(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseITCondCode(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) @@ -3173,8 +3159,8 @@ parseITCondCode(SmallVectorImpl &Operands) { /// parseCoprocNumOperand - Try to parse an coprocessor number operand. The /// token must be an Identifier when called, and if it is a coprocessor /// number, the token is eaten and the operand is added to the operand list. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseCoprocNumOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -3192,8 +3178,8 @@ parseCoprocNumOperand(SmallVectorImpl &Operands) { /// parseCoprocRegOperand - Try to parse an coprocessor register operand. The /// token must be an Identifier when called, and if it is a coprocessor /// number, the token is eaten and the operand is added to the operand list. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseCoprocRegOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -3210,8 +3196,8 @@ parseCoprocRegOperand(SmallVectorImpl &Operands) { /// parseCoprocOptionOperand - Try to parse an coprocessor option operand. /// coproc_option : '{' imm0_255 '}' -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseCoprocOptionOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); // If this isn't a '{', this isn't a coprocessor immediate operand. @@ -3288,8 +3274,7 @@ static unsigned getDRegFromQReg(unsigned QReg) { } /// Parse a register list. -bool ARMAsmParser:: -parseRegisterList(SmallVectorImpl &Operands) { +bool ARMAsmParser::parseRegisterList(OperandVector &Operands) { assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Curly Brace"); SMLoc S = Parser.getTok().getLoc(); @@ -3470,8 +3455,8 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) { } // parse a vector register list -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseVectorList(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseVectorList(OperandVector &Operands) { VectorLaneTy LaneKind; unsigned LaneIndex; SMLoc S = Parser.getTok().getLoc(); @@ -3721,8 +3706,8 @@ parseVectorList(SmallVectorImpl &Operands) { } /// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseMemBarrierOptOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); unsigned Opt; @@ -3792,8 +3777,8 @@ parseMemBarrierOptOperand(SmallVectorImpl &Operands) { } /// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options. 
-ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseInstSyncBarrierOptOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); unsigned Opt; @@ -3843,8 +3828,8 @@ parseInstSyncBarrierOptOperand(SmallVectorImpl &Operands) { /// parseProcIFlagsOperand - Try to parse iflags from CPS instruction. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseProcIFlagsOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) @@ -3877,8 +3862,8 @@ parseProcIFlagsOperand(SmallVectorImpl &Operands) { } /// parseMSRMaskOperand - Try to parse mask flags from MSR instruction. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseMSRMaskOperand(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) @@ -4005,9 +3990,9 @@ parseMSRMaskOperand(SmallVectorImpl &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parsePKHImm(SmallVectorImpl &Operands, StringRef Op, - int Low, int High) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low, + int High) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) { Error(Parser.getTok().getLoc(), Op + " operand expected."); @@ -4053,8 +4038,8 @@ parsePKHImm(SmallVectorImpl &Operands, StringRef Op, return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseSetEndImm(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseSetEndImm(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); SMLoc S = Tok.getLoc(); if (Tok.isNot(AsmToken::Identifier)) { @@ -4082,8 +4067,8 @@ parseSetEndImm(SmallVectorImpl &Operands) { /// lsl #n 'n' in [0,31] /// asr #n 'n' in [1,32] /// n == 32 encoded as n == 0. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseShifterImm(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseShifterImm(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); SMLoc S = Tok.getLoc(); if (Tok.isNot(AsmToken::Identifier)) { @@ -4152,8 +4137,8 @@ parseShifterImm(SmallVectorImpl &Operands) { /// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family /// of instructions. Legal values are: /// ror #n 'n' in {0, 8, 16, 24} -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseRotImm(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseRotImm(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); SMLoc S = Tok.getLoc(); if (Tok.isNot(AsmToken::Identifier)) @@ -4198,8 +4183,8 @@ parseRotImm(SmallVectorImpl &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseBitfield(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseBitfield(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); // The bitfield descriptor is really two operands, the LSB and the width. 
if (Parser.getTok().isNot(AsmToken::Hash) && @@ -4266,8 +4251,8 @@ parseBitfield(SmallVectorImpl &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parsePostIdxReg(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parsePostIdxReg(OperandVector &Operands) { // Check for a post-index addressing register operand. Specifically: // postidx_reg := '+' register {, shift} // | '-' register {, shift} @@ -4315,8 +4300,8 @@ parsePostIdxReg(SmallVectorImpl &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseAM3Offset(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseAM3Offset(OperandVector &Operands) { // Check for a post-index addressing register operand. Specifically: // am3offset := '+' register // | '-' register @@ -4388,26 +4373,24 @@ parseAM3Offset(SmallVectorImpl &Operands) { /// Convert parsed operands to MCInst. Needed here because this instruction /// only has two register operands, but multiplication is commutative so /// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN". -void ARMAsmParser:: -cvtThumbMultiply(MCInst &Inst, - const SmallVectorImpl &Operands) { - ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); - ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1); +void ARMAsmParser::cvtThumbMultiply(MCInst &Inst, + const OperandVector &Operands) { + ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1); + ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1); // If we have a three-operand form, make sure to set Rn to be the operand // that isn't the same as Rd. unsigned RegOp = 4; if (Operands.size() == 6 && - ((ARMOperand*)Operands[4])->getReg() == - ((ARMOperand*)Operands[3])->getReg()) + ((ARMOperand &)*Operands[4]).getReg() == + ((ARMOperand &)*Operands[3]).getReg()) RegOp = 5; - ((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1); + ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1); Inst.addOperand(Inst.getOperand(0)); - ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2); + ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2); } -void ARMAsmParser:: -cvtThumbBranches(MCInst &Inst, - const SmallVectorImpl &Operands) { +void ARMAsmParser::cvtThumbBranches(MCInst &Inst, + const OperandVector &Operands) { int CondOp = -1, ImmOp = -1; switch(Inst.getOpcode()) { case ARM::tB: @@ -4430,7 +4413,7 @@ cvtThumbBranches(MCInst &Inst, } else { // outside IT blocks we can only have unconditional branches with AL // condition code or conditional branches with non-AL condition code - unsigned Cond = static_cast(Operands[CondOp])->getCondCode(); + unsigned Cond = static_cast(*Operands[CondOp]).getCondCode(); switch(Inst.getOpcode()) { case ARM::tB: case ARM::tBcc: @@ -4447,27 +4430,26 @@ cvtThumbBranches(MCInst &Inst, switch(Inst.getOpcode()) { // classify tB as either t2B or t1B based on range of immediate operand case ARM::tB: { - ARMOperand* op = static_cast(Operands[ImmOp]); - if(!op->isSignedOffset<11, 1>() && isThumbTwo()) + ARMOperand &op = static_cast(*Operands[ImmOp]); + if (!op.isSignedOffset<11, 1>() && isThumbTwo()) Inst.setOpcode(ARM::t2B); break; } // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand case ARM::tBcc: { - ARMOperand* op = static_cast(Operands[ImmOp]); - if(!op->isSignedOffset<8, 1>() && isThumbTwo()) + ARMOperand &op = static_cast(*Operands[ImmOp]); + if (!op.isSignedOffset<8, 1>() && isThumbTwo()) Inst.setOpcode(ARM::t2Bcc); break; } } - 
((ARMOperand*)Operands[ImmOp])->addImmOperands(Inst, 1); - ((ARMOperand*)Operands[CondOp])->addCondCodeOperands(Inst, 2); + ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1); + ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2); } /// Parse an ARM memory expression, return false if successful else return true /// or an error. The first token must be a '[' when called. -bool ARMAsmParser:: -parseMemory(SmallVectorImpl &Operands) { +bool ARMAsmParser::parseMemory(OperandVector &Operands) { SMLoc S, E; assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a Left Bracket"); @@ -4717,8 +4699,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, } /// parseFPImm - A floating point immediate expression operand. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseFPImm(SmallVectorImpl &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseFPImm(OperandVector &Operands) { // Anything that can accept a floating point constant as an operand // needs to go through here, as the regular parseExpression is // integer only. @@ -4744,12 +4726,12 @@ parseFPImm(SmallVectorImpl &Operands) { // integer constant. Make sure we don't try to parse an FPImm // for these: // vmov.i{8|16|32|64} , #imm - ARMOperand *TyOp = static_cast(Operands[2]); - bool isVmovf = TyOp->isToken() && (TyOp->getToken() == ".f32" || - TyOp->getToken() == ".f64"); - ARMOperand *Mnemonic = static_cast(Operands[0]); - bool isFconst = Mnemonic->isToken() && (Mnemonic->getToken() == "fconstd" || - Mnemonic->getToken() == "fconsts"); + ARMOperand &TyOp = static_cast(*Operands[2]); + bool isVmovf = TyOp.isToken() && + (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64"); + ARMOperand &Mnemonic = static_cast(*Operands[0]); + bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" || + Mnemonic.getToken() == "fconsts"); if (!(isVmovf || isFconst)) return MatchOperand_NoMatch; @@ -4798,8 +4780,7 @@ parseFPImm(SmallVectorImpl &Operands) { /// Parse a arm instruction operand. For now this parses the operand regardless /// of the mnemonic. -bool ARMAsmParser::parseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic) { +bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { SMLoc S, E; // Check if the current operand has a custom associated parser, if so, try to @@ -5125,7 +5106,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, } bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, - SmallVectorImpl &Operands) { + OperandVector &Operands) { // FIXME: This is all horribly hacky. We really need a better way to deal // with optional operands like this in the matcher table. @@ -5138,17 +5119,17 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // conditionally adding the cc_out in the first place because we need // to check the type of the parsed immediate operand. if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() && - !static_cast(Operands[4])->isARMSOImm() && - static_cast(Operands[4])->isImm0_65535Expr() && - static_cast(Operands[1])->getReg() == 0) + !static_cast(*Operands[4]).isARMSOImm() && + static_cast(*Operands[4]).isImm0_65535Expr() && + static_cast(*Operands[1]).getReg() == 0) return true; // Register-register 'add' for thumb does not have a cc_out operand // when there are only two register operands. 
if (isThumb() && Mnemonic == "add" && Operands.size() == 5 && - static_cast(Operands[3])->isReg() && - static_cast(Operands[4])->isReg() && - static_cast(Operands[1])->getReg() == 0) + static_cast(*Operands[3]).isReg() && + static_cast(*Operands[4]).isReg() && + static_cast(*Operands[1]).getReg() == 0) return true; // Register-register 'add' for thumb does not have a cc_out operand // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do @@ -5156,13 +5137,12 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // that can handle a different range and has a cc_out operand. if (((isThumb() && Mnemonic == "add") || (isThumbTwo() && Mnemonic == "sub")) && - Operands.size() == 6 && - static_cast(Operands[3])->isReg() && - static_cast(Operands[4])->isReg() && - static_cast(Operands[4])->getReg() == ARM::SP && - static_cast(Operands[1])->getReg() == 0 && - ((Mnemonic == "add" &&static_cast(Operands[5])->isReg()) || - static_cast(Operands[5])->isImm0_1020s4())) + Operands.size() == 6 && static_cast(*Operands[3]).isReg() && + static_cast(*Operands[4]).isReg() && + static_cast(*Operands[4]).getReg() == ARM::SP && + static_cast(*Operands[1]).getReg() == 0 && + ((Mnemonic == "add" && static_cast(*Operands[5]).isReg()) || + static_cast(*Operands[5]).isImm0_1020s4())) return true; // For Thumb2, add/sub immediate does not have a cc_out operand for the // imm0_4095 variant. That's the least-preferred variant when @@ -5170,23 +5150,22 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // should remove the cc_out operand, we have to explicitly check that // it's not one of the other variants. Ugh. if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") && - Operands.size() == 6 && - static_cast(Operands[3])->isReg() && - static_cast(Operands[4])->isReg() && - static_cast(Operands[5])->isImm()) { + Operands.size() == 6 && static_cast(*Operands[3]).isReg() && + static_cast(*Operands[4]).isReg() && + static_cast(*Operands[5]).isImm()) { // Nest conditions rather than one big 'if' statement for readability. // // If both registers are low, we're in an IT block, and the immediate is // in range, we should use encoding T1 instead, which has a cc_out. if (inITBlock() && - isARMLowRegister(static_cast(Operands[3])->getReg()) && - isARMLowRegister(static_cast(Operands[4])->getReg()) && - static_cast(Operands[5])->isImm0_7()) + isARMLowRegister(static_cast(*Operands[3]).getReg()) && + isARMLowRegister(static_cast(*Operands[4]).getReg()) && + static_cast(*Operands[5]).isImm0_7()) return false; // Check against T3. If the second register is the PC, this is an // alternate form of ADR, which uses encoding T4, so check for that too. - if (static_cast(Operands[4])->getReg() != ARM::PC && - static_cast(Operands[5])->isT2SOImm()) + if (static_cast(*Operands[4]).getReg() != ARM::PC && + static_cast(*Operands[5]).isT2SOImm()) return false; // Otherwise, we use encoding T4, which does not have a cc_out @@ -5198,35 +5177,34 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to // use the 16-bit encoding or not. 
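The condition that follows implements the comment just above: the 16-bit Thumb MULS form is only usable when every register is a low register, the destination reuses one of the sources, and the flag-setting behaviour matches the IT-block context. A rough standalone predicate capturing the same rule (names are illustrative stand-ins, not the LLVM helpers):

    // Rough sketch of when the narrow (16-bit) Thumb multiply can be used,
    // per the comment above. Register numbers here are raw encodings
    // (r0..r7 are the "low" registers).
    static bool isLowReg(unsigned R) { return R <= 7; }

    static bool canUseNarrowMul(unsigned Rd, unsigned Rn, unsigned Rm,
                                bool InITBlock, bool SetsFlags) {
      if (!isLowReg(Rd) || !isLowReg(Rn) || !isLowReg(Rm))
        return false;
      if (Rd != Rn && Rd != Rm)                  // destination must reuse a source
        return false;
      return InITBlock ? !SetsFlags : SetsFlags; // flags only outside an IT block
    }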
if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 && - static_cast(Operands[1])->getReg() == 0 && - static_cast(Operands[3])->isReg() && - static_cast(Operands[4])->isReg() && - static_cast(Operands[5])->isReg() && + static_cast(*Operands[1]).getReg() == 0 && + static_cast(*Operands[3]).isReg() && + static_cast(*Operands[4]).isReg() && + static_cast(*Operands[5]).isReg() && // If the registers aren't low regs, the destination reg isn't the // same as one of the source regs, or the cc_out operand is zero // outside of an IT block, we have to use the 32-bit encoding, so // remove the cc_out operand. - (!isARMLowRegister(static_cast(Operands[3])->getReg()) || - !isARMLowRegister(static_cast(Operands[4])->getReg()) || - !isARMLowRegister(static_cast(Operands[5])->getReg()) || - !inITBlock() || - (static_cast(Operands[3])->getReg() != - static_cast(Operands[5])->getReg() && - static_cast(Operands[3])->getReg() != - static_cast(Operands[4])->getReg()))) + (!isARMLowRegister(static_cast(*Operands[3]).getReg()) || + !isARMLowRegister(static_cast(*Operands[4]).getReg()) || + !isARMLowRegister(static_cast(*Operands[5]).getReg()) || + !inITBlock() || (static_cast(*Operands[3]).getReg() != + static_cast(*Operands[5]).getReg() && + static_cast(*Operands[3]).getReg() != + static_cast(*Operands[4]).getReg()))) return true; // Also check the 'mul' syntax variant that doesn't specify an explicit // destination register. if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 && - static_cast(Operands[1])->getReg() == 0 && - static_cast(Operands[3])->isReg() && - static_cast(Operands[4])->isReg() && + static_cast(*Operands[1]).getReg() == 0 && + static_cast(*Operands[3]).isReg() && + static_cast(*Operands[4]).isReg() && // If the registers aren't low regs or the cc_out operand is zero // outside of an IT block, we have to use the 32-bit encoding, so // remove the cc_out operand. - (!isARMLowRegister(static_cast(Operands[3])->getReg()) || - !isARMLowRegister(static_cast(Operands[4])->getReg()) || + (!isARMLowRegister(static_cast(*Operands[3]).getReg()) || + !isARMLowRegister(static_cast(*Operands[4]).getReg()) || !inITBlock())) return true; @@ -5239,32 +5217,32 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // anyway. 
if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") && (Operands.size() == 5 || Operands.size() == 6) && - static_cast(Operands[3])->isReg() && - static_cast(Operands[3])->getReg() == ARM::SP && - static_cast(Operands[1])->getReg() == 0 && - (static_cast(Operands[4])->isImm() || + static_cast(*Operands[3]).isReg() && + static_cast(*Operands[3]).getReg() == ARM::SP && + static_cast(*Operands[1]).getReg() == 0 && + (static_cast(*Operands[4]).isImm() || (Operands.size() == 6 && - static_cast(Operands[5])->isImm()))) + static_cast(*Operands[5]).isImm()))) return true; return false; } -bool ARMAsmParser::shouldOmitPredicateOperand( - StringRef Mnemonic, SmallVectorImpl &Operands) { +bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, + OperandVector &Operands) { // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON unsigned RegIdx = 3; if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") && - static_cast(Operands[2])->getToken() == ".f32") { - if (static_cast(Operands[3])->isToken() && - static_cast(Operands[3])->getToken() == ".f32") + static_cast(*Operands[2]).getToken() == ".f32") { + if (static_cast(*Operands[3]).isToken() && + static_cast(*Operands[3]).getToken() == ".f32") RegIdx = 4; - if (static_cast(Operands[RegIdx])->isReg() && - (ARMMCRegisterClasses[ARM::DPRRegClassID] - .contains(static_cast(Operands[RegIdx])->getReg()) || - ARMMCRegisterClasses[ARM::QPRRegClassID] - .contains(static_cast(Operands[RegIdx])->getReg()))) + if (static_cast(*Operands[RegIdx]).isReg() && + (ARMMCRegisterClasses[ARM::DPRRegClassID].contains( + static_cast(*Operands[RegIdx]).getReg()) || + ARMMCRegisterClasses[ARM::QPRRegClassID].contains( + static_cast(*Operands[RegIdx]).getReg()))) return true; } return false; @@ -5309,8 +5287,7 @@ static bool RequiresVFPRegListValidation(StringRef Inst, /// Parse an arm instruction mnemonic followed by its operands. bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands) { + SMLoc NameLoc, OperandVector &Operands) { // FIXME: Can this be done via tablegen in some fashion? bool RequireVFPRegisterListCheck; bool AcceptSinglePrecisionOnly; @@ -5489,12 +5466,12 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Parser.Lex(); // Consume the EndOfStatement if (RequireVFPRegisterListCheck) { - ARMOperand *Op = static_cast(Operands.back()); - if (AcceptSinglePrecisionOnly && !Op->isSPRRegList()) - return Error(Op->getStartLoc(), + ARMOperand &Op = static_cast(*Operands.back()); + if (AcceptSinglePrecisionOnly && !Op.isSPRRegList()) + return Error(Op.getStartLoc(), "VFP/Neon single precision register expected"); - if (AcceptDoublePrecisionOnly && !Op->isDPRRegList()) - return Error(Op->getStartLoc(), + if (AcceptDoublePrecisionOnly && !Op.isDPRRegList()) + return Error(Op.getStartLoc(), "VFP/Neon double precision register expected"); } @@ -5505,20 +5482,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // try to remove a cc_out operand that was explicitly set on the the // mnemonic, of course (CarrySetting == true). Reason number #317 the // table driven matcher doesn't fit well with the ARM instruction set. 
- if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) { - ARMOperand *Op = static_cast(Operands[1]); + if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) Operands.erase(Operands.begin() + 1); - delete Op; - } // Some instructions have the same mnemonic, but don't always // have a predicate. Distinguish them here and delete the // predicate if needed. - if (shouldOmitPredicateOperand(Mnemonic, Operands)) { - ARMOperand *Op = static_cast(Operands[1]); + if (shouldOmitPredicateOperand(Mnemonic, Operands)) Operands.erase(Operands.begin() + 1); - delete Op; - } // ARM mode 'blx' need special handling, as the register operand version // is predicable, but the label operand version is not. So, we can't rely @@ -5526,11 +5497,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // a k_CondCode operand in the list. If we're trying to match the label // version, remove the k_CondCode operand here. if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 && - static_cast(Operands[2])->isImm()) { - ARMOperand *Op = static_cast(Operands[1]); + static_cast(*Operands[2]).isImm()) Operands.erase(Operands.begin() + 1); - delete Op; - } // Adjust operands of ldrexd/strexd to MCK_GPRPair. // ldrexd/strexd require even/odd GPR pair. To enforce this constraint, @@ -5543,53 +5511,50 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Mnemonic == "stlexd")) { bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd"); unsigned Idx = isLoad ? 2 : 3; - ARMOperand* Op1 = static_cast(Operands[Idx]); - ARMOperand* Op2 = static_cast(Operands[Idx+1]); + ARMOperand &Op1 = static_cast(*Operands[Idx]); + ARMOperand &Op2 = static_cast(*Operands[Idx + 1]); const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID); // Adjust only if Op1 and Op2 are GPRs. - if (Op1->isReg() && Op2->isReg() && MRC.contains(Op1->getReg()) && - MRC.contains(Op2->getReg())) { - unsigned Reg1 = Op1->getReg(); - unsigned Reg2 = Op2->getReg(); + if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) && + MRC.contains(Op2.getReg())) { + unsigned Reg1 = Op1.getReg(); + unsigned Reg2 = Op2.getReg(); unsigned Rt = MRI->getEncodingValue(Reg1); unsigned Rt2 = MRI->getEncodingValue(Reg2); // Rt2 must be Rt + 1 and Rt must be even. if (Rt + 1 != Rt2 || (Rt & 1)) { - Error(Op2->getStartLoc(), isLoad ? - "destination operands must be sequential" : - "source operands must be sequential"); + Error(Op2.getStartLoc(), isLoad + ? 
"destination operands must be sequential" + : "source operands must be sequential"); return true; } unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID))); - Operands.erase(Operands.begin() + Idx, Operands.begin() + Idx + 2); - Operands.insert(Operands.begin() + Idx, ARMOperand::CreateReg( - NewReg, Op1->getStartLoc(), Op2->getEndLoc())); - delete Op1; - delete Op2; + Operands[Idx] = + ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc()); + Operands.erase(Operands.begin() + Idx + 1); } } // GNU Assembler extension (compatibility) if ((Mnemonic == "ldrd" || Mnemonic == "strd")) { - ARMOperand *Op2 = static_cast(Operands[2]); - ARMOperand *Op3 = static_cast(Operands[3]); - if (Op3->isMem()) { - assert(Op2->isReg() && "expected register argument"); + ARMOperand &Op2 = static_cast(*Operands[2]); + ARMOperand &Op3 = static_cast(*Operands[3]); + if (Op3.isMem()) { + assert(Op2.isReg() && "expected register argument"); unsigned SuperReg = MRI->getMatchingSuperReg( - Op2->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID)); + Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID)); assert(SuperReg && "expected register pair"); unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1); - Operands.insert(Operands.begin() + 3, - ARMOperand::CreateReg(PairedReg, - Op2->getStartLoc(), - Op2->getEndLoc())); + Operands.insert( + Operands.begin() + 3, + ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc())); } } @@ -5599,19 +5564,13 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // so the Mnemonic is the original name "subs" and delete the predicate // operand so it will match the table entry. if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 && - static_cast(Operands[3])->isReg() && - static_cast(Operands[3])->getReg() == ARM::PC && - static_cast(Operands[4])->isReg() && - static_cast(Operands[4])->getReg() == ARM::LR && - static_cast(Operands[5])->isImm()) { - ARMOperand *Op0 = static_cast(Operands[0]); - Operands.erase(Operands.begin()); - delete Op0; - Operands.insert(Operands.begin(), ARMOperand::CreateToken(Name, NameLoc)); - - ARMOperand *Op1 = static_cast(Operands[1]); + static_cast(*Operands[3]).isReg() && + static_cast(*Operands[3]).getReg() == ARM::PC && + static_cast(*Operands[4]).isReg() && + static_cast(*Operands[4]).getReg() == ARM::LR && + static_cast(*Operands[5]).isImm()) { + Operands.front() = ARMOperand::CreateToken(Name, NameLoc); Operands.erase(Operands.begin() + 1); - delete Op1; } return false; } @@ -5657,9 +5616,8 @@ static bool instIsBreakpoint(const MCInst &Inst) { } // FIXME: We would really like to be able to tablegen'erate this. -bool ARMAsmParser:: -validateInstruction(MCInst &Inst, - const SmallVectorImpl &Operands) { +bool ARMAsmParser::validateInstruction(MCInst &Inst, + const OperandVector &Operands) { const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); SMLoc Loc = Operands[0]->getStartLoc(); @@ -5682,7 +5640,7 @@ validateInstruction(MCInst &Inst, // Find the condition code Operand to get its SMLoc information. SMLoc CondLoc; for (unsigned I = 1; I < Operands.size(); ++I) - if (static_cast(Operands[I])->isCondCode()) + if (static_cast(*Operands[I]).isCondCode()) CondLoc = Operands[I]->getStartLoc(); return Error(CondLoc, "incorrect condition in IT block; got '" + StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) + @@ -5782,8 +5740,8 @@ validateInstruction(MCInst &Inst, // in the register list. 
unsigned Rn = Inst.getOperand(0).getReg(); bool HasWritebackToken = - (static_cast(Operands[3])->isToken() && - static_cast(Operands[3])->getToken() == "!"); + (static_cast(*Operands[3]).isToken() && + static_cast(*Operands[3]).getToken() == "!"); bool ListContainsBase; if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo()) return Error(Operands[3 + HasWritebackToken]->getStartLoc(), @@ -5843,11 +5801,10 @@ validateInstruction(MCInst &Inst, // this first statement is always true for the new Inst. Essentially, the // destination is unconditionally copied into the second source operand // without checking to see if it matches what we actually parsed. - if (Operands.size() == 6 && - (((ARMOperand*)Operands[3])->getReg() != - ((ARMOperand*)Operands[5])->getReg()) && - (((ARMOperand*)Operands[3])->getReg() != - ((ARMOperand*)Operands[4])->getReg())) { + if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() != + ((ARMOperand &)*Operands[5]).getReg()) && + (((ARMOperand &)*Operands[3]).getReg() != + ((ARMOperand &)*Operands[4]).getReg())) { return Error(Operands[3]->getStartLoc(), "destination register must match source register"); } @@ -5900,23 +5857,23 @@ validateInstruction(MCInst &Inst, } // Final range checking for Thumb unconditional branch instructions. case ARM::tB: - if (!(static_cast(Operands[2]))->isSignedOffset<11, 1>()) + if (!(static_cast(*Operands[2])).isSignedOffset<11, 1>()) return Error(Operands[2]->getStartLoc(), "branch target out of range"); break; case ARM::t2B: { int op = (Operands[2]->isImm()) ? 2 : 3; - if (!(static_cast(Operands[op]))->isSignedOffset<24, 1>()) + if (!static_cast(*Operands[op]).isSignedOffset<24, 1>()) return Error(Operands[op]->getStartLoc(), "branch target out of range"); break; } // Final range checking for Thumb conditional branch instructions. case ARM::tBcc: - if (!(static_cast(Operands[2]))->isSignedOffset<8, 1>()) + if (!static_cast(*Operands[2]).isSignedOffset<8, 1>()) return Error(Operands[2]->getStartLoc(), "branch target out of range"); break; case ARM::t2Bcc: { int Op = (Operands[2]->isImm()) ? 2 : 3; - if (!(static_cast(Operands[Op]))->isSignedOffset<20, 1>()) + if (!static_cast(*Operands[Op]).isSignedOffset<20, 1>()) return Error(Operands[Op]->getStartLoc(), "branch target out of range"); break; } @@ -5931,19 +5888,19 @@ validateInstruction(MCInst &Inst, // lead to bugs that are difficult to find since this is an easy mistake // to make. int i = (Operands[3]->isImm()) ? 
3 : 4; - ARMOperand *Op = static_cast(Operands[i]); - const MCConstantExpr *CE = dyn_cast(Op->getImm()); + ARMOperand &Op = static_cast(*Operands[i]); + const MCConstantExpr *CE = dyn_cast(Op.getImm()); if (CE) break; - const MCExpr *E = dyn_cast(Op->getImm()); + const MCExpr *E = dyn_cast(Op.getImm()); if (!E) break; const ARMMCExpr *ARM16Expr = dyn_cast(E); if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 && - ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) { - return Error(Op->getStartLoc(), - "immediate expression for mov requires :lower16: or :upper16"); - break; - } - } + ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) + return Error( + Op.getStartLoc(), + "immediate expression for mov requires :lower16: or :upper16"); + break; + } } return false; @@ -6205,9 +6162,8 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { } } -bool ARMAsmParser:: -processInstruction(MCInst &Inst, - const SmallVectorImpl &Operands) { +bool ARMAsmParser::processInstruction(MCInst &Inst, + const OperandVector &Operands) { switch (Inst.getOpcode()) { // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. case ARM::LDRT_POST: @@ -6264,8 +6220,8 @@ processInstruction(MCInst &Inst, // Select the narrow version if the immediate will fit. if (Inst.getOperand(1).getImm() > 0 && Inst.getOperand(1).getImm() <= 0xff && - !(static_cast(Operands[2])->isToken() && - static_cast(Operands[2])->getToken() == ".w")) + !(static_cast(*Operands[2]).isToken() && + static_cast(*Operands[2]).getToken() == ".w")) Inst.setOpcode(ARM::tLDRpci); else Inst.setOpcode(ARM::t2LDRpci); @@ -7355,8 +7311,8 @@ processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && - !(static_cast(Operands[3])->isToken() && - static_cast(Operands[3])->getToken() == ".w")) { + !(static_cast(*Operands[3]).isToken() && + static_cast(*Operands[3]).getToken() == ".w")) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -7559,7 +7515,7 @@ processInstruction(MCInst &Inst, case ARM::LDMIA_UPD: // If this is a load of a single register via a 'pop', then we should use // a post-indexed LDR instruction instead, per the ARM ARM. - if (static_cast(Operands[0])->getToken() == "pop" && + if (static_cast(*Operands[0]).getToken() == "pop" && Inst.getNumOperands() == 5) { MCInst TmpInst; TmpInst.setOpcode(ARM::LDR_POST_IMM); @@ -7577,7 +7533,7 @@ processInstruction(MCInst &Inst, case ARM::STMDB_UPD: // If this is a store of a single register via a 'push', then we should use // a pre-indexed STR instruction instead, per the ARM ARM. - if (static_cast(Operands[0])->getToken() == "push" && + if (static_cast(*Operands[0]).getToken() == "push" && Inst.getNumOperands() == 5) { MCInst TmpInst; TmpInst.setOpcode(ARM::STR_PRE_IMM); @@ -7593,7 +7549,7 @@ processInstruction(MCInst &Inst, case ARM::t2ADDri12: // If the immediate fits for encoding T3 (t2ADDri) and the generic "add" // mnemonic was used (not "addw"), encoding T3 is preferred. 
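The check just below encodes a simple preference: the addw/subw spellings keep the 12-bit immediate encoding (T4), while plain add/sub switches to the modified-immediate encoding (T3) whenever ARM_AM::getT2SOImmVal accepts the constant. A small sketch of that decision, with the immediate test left as a stand-in:

    // Which Thumb-2 add/sub encoding to emit for an immediate form, per the
    // comment above. FitsModImm stands in for ARM_AM::getT2SOImmVal(Imm) != -1;
    // WideMnemonic is true when the user wrote "addw"/"subw".
    enum class AddSubEncoding { T3_ModImm, T4_Imm12 };

    static AddSubEncoding pickEncoding(bool WideMnemonic, bool FitsModImm) {
      if (!WideMnemonic && FitsModImm)
        return AddSubEncoding::T3_ModImm; // preferred for plain "add"/"sub"
      return AddSubEncoding::T4_Imm12;    // keep the explicit imm12 form
    }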
- if (static_cast(Operands[0])->getToken() != "add" || + if (static_cast(*Operands[0]).getToken() != "add" || ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) break; Inst.setOpcode(ARM::t2ADDri); @@ -7602,7 +7558,7 @@ processInstruction(MCInst &Inst, case ARM::t2SUBri12: // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub" // mnemonic was used (not "subw"), encoding T3 is preferred. - if (static_cast(Operands[0])->getToken() != "sub" || + if (static_cast(*Operands[0]).getToken() != "sub" || ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) break; Inst.setOpcode(ARM::t2SUBri); @@ -7638,9 +7594,9 @@ processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (unsigned)Inst.getOperand(2).getImm() > 255 || ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != 0)) || - (static_cast(Operands[3])->isToken() && - static_cast(Operands[3])->getToken() == ".w")) + (inITBlock() && Inst.getOperand(5).getReg() != 0)) || + (static_cast(*Operands[3]).isToken() && + static_cast(*Operands[3]).getToken() == ".w")) break; MCInst TmpInst; TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? @@ -7661,8 +7617,8 @@ processInstruction(MCInst &Inst, // 'as' behaviour. Make sure the wide encoding wasn't explicit. if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || Inst.getOperand(5).getReg() != 0 || - (static_cast(Operands[3])->isToken() && - static_cast(Operands[3])->getToken() == ".w")) + (static_cast(*Operands[3]).isToken() && + static_cast(*Operands[3]).getToken() == ".w")) break; MCInst TmpInst; TmpInst.setOpcode(ARM::tADDhirr); @@ -7719,8 +7675,8 @@ processInstruction(MCInst &Inst, // an error in validateInstruction(). unsigned Rn = Inst.getOperand(0).getReg(); bool hasWritebackToken = - (static_cast(Operands[3])->isToken() && - static_cast(Operands[3])->getToken() == "!"); + (static_cast(*Operands[3]).isToken() && + static_cast(*Operands[3]).getToken() == "!"); bool listContainsBase; if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) || (!listContainsBase && !hasWritebackToken) || @@ -7782,10 +7738,10 @@ processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (unsigned)Inst.getOperand(1).getImm() <= 255 && ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && - Inst.getOperand(4).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(4).getReg() == 0)) && - (!static_cast(Operands[2])->isToken() || - static_cast(Operands[2])->getToken() != ".w")) { + Inst.getOperand(4).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(4).getReg() == 0)) && + (!static_cast(*Operands[2]).isToken() || + static_cast(*Operands[2]).getToken() != ".w")) { // The operands aren't in the same order for tMOVi8... MCInst TmpInst; TmpInst.setOpcode(ARM::tMOVi8); @@ -7806,8 +7762,8 @@ processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == ARMCC::AL && Inst.getOperand(4).getReg() == ARM::CPSR && - (!static_cast(Operands[2])->isToken() || - static_cast(Operands[2])->getToken() != ".w")) { + (!static_cast(*Operands[2]).isToken() || + static_cast(*Operands[2]).getToken() != ".w")) { // The operands aren't the same for tMOV[S]r... (no cc_out) MCInst TmpInst; TmpInst.setOpcode(Inst.getOperand(4).getReg() ? 
ARM::tMOVSr : ARM::tMOVr); @@ -7829,8 +7785,8 @@ processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == 0 && - (!static_cast(Operands[2])->isToken() || - static_cast(Operands[2])->getToken() != ".w")) { + (!static_cast(*Operands[2]).isToken() || + static_cast(*Operands[2]).getToken() != ".w")) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Illegal opcode!"); @@ -7942,9 +7898,10 @@ processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast(Operands[3])->isToken() || - !static_cast(Operands[3])->getToken().equals_lower(".w"))) { + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast(*Operands[3]).isToken() || + !static_cast(*Operands[3]).getToken().equals_lower( + ".w"))) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -7981,9 +7938,10 @@ processInstruction(MCInst &Inst, (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast(Operands[3])->isToken() || - !static_cast(Operands[3])->getToken().equals_lower(".w"))) { + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast(*Operands[3]).isToken() || + !static_cast(*Operands[3]).getToken().equals_lower( + ".w"))) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8063,11 +8021,10 @@ template <> inline bool IsCPSRDead(MCInst *Instr) { } static const char *getSubtargetFeatureName(unsigned Val); -bool ARMAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; unsigned MatchResult; @@ -8136,7 +8093,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -8144,7 +8101,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } case Match_MnemonicFail: return Error(IDLoc, "invalid instruction", - ((ARMOperand*)Operands[0])->getLocRange()); + ((ARMOperand &)*Operands[0]).getLocRange()); case Match_RequiresNotITBlock: return Error(IDLoc, "flag setting instruction only valid outside IT block"); case Match_RequiresITBlock: @@ -8154,12 +8111,12 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); case Match_ImmRange0_15: { - SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; return Error(ErrorLoc, "immediate operand must be in the range [0,15]"); } case 
Match_ImmRange0_239: { - SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; return Error(ErrorLoc, "immediate operand must be in the range [0,239]"); } @@ -8175,7 +8132,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_DupAlignedMemoryRequires64or128: case Match_AlignedMemoryRequires64or128or256: { - SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getAlignmentLoc(); + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; switch (MatchResult) { default: @@ -8923,28 +8880,22 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { } // RAII object to make sure parsed operands are deleted. - struct CleanupObject { - SmallVector Operands; - ~CleanupObject() { - for (unsigned I = 0, E = Operands.size(); I != E; ++I) - delete Operands[I]; - } - } CO; + SmallVector, 1> Operands; // Parse the register list - if (parseRegisterList(CO.Operands)) + if (parseRegisterList(Operands)) return false; - ARMOperand *Op = (ARMOperand*)CO.Operands[0]; - if (!IsVector && !Op->isRegList()) { + ARMOperand &Op = (ARMOperand &)*Operands[0]; + if (!IsVector && !Op.isRegList()) { Error(L, ".save expects GPR registers"); return false; } - if (IsVector && !Op->isDPRRegList()) { + if (IsVector && !Op.isDPRRegList()) { Error(L, ".vsave expects DPR registers"); return false; } - getTargetStreamer().emitRegSave(Op->getRegList(), IsVector); + getTargetStreamer().emitRegSave(Op.getRegList(), IsVector); return false; } @@ -9468,23 +9419,23 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { // Define this matcher function after the auto-generated include so we // have the match class enum definitions. -unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, +unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, unsigned Kind) { - ARMOperand *Op = static_cast(AsmOp); + ARMOperand &Op = static_cast(AsmOp); // If the kind is a token for a literal immediate, check if our asm // operand matches. This is for InstAliases which have a fixed-value // immediate in the syntax. 
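The validateTargetOperandClass switch that follows has an MCK_ARMSOImm case; for context, an A32 so_imm ("modified immediate") is an 8-bit value rotated right by an even amount, so a constant qualifies exactly when some even left-rotation of it fits in 8 bits. A standalone reference check (not LLVM's ARM_AM helper):

    #include <cassert>
    #include <cstdint>

    // Reference check for an A32 modified immediate: some even left-rotation
    // of the value fits in 8 bits.
    static uint32_t rotl32(uint32_t V, unsigned N) {
      N &= 31;
      return N ? (V << N) | (V >> (32 - N)) : V;
    }

    static bool isARMModifiedImm(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2)
        if (rotl32(V, Rot) <= 0xffu)
          return true;
      return false;
    }

    int main() {
      assert(isARMModifiedImm(0xff));       // plain 8-bit value
      assert(isARMModifiedImm(0xff000000)); // 0xff rotated right by 8
      assert(!isARMModifiedImm(0x101));     // needs 9 significant bits
      return 0;
    }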
switch (Kind) { default: break; case MCK__35_0: - if (Op->isImm()) - if (const MCConstantExpr *CE = dyn_cast(Op->getImm())) + if (Op.isImm()) + if (const MCConstantExpr *CE = dyn_cast(Op.getImm())) if (CE->getValue() == 0) return Match_Success; break; case MCK_ARMSOImm: - if (Op->isImm()) { - const MCExpr *SOExpr = Op->getImm(); + if (Op.isImm()) { + const MCExpr *SOExpr = Op.getImm(); int64_t Value; if (!SOExpr->EvaluateAsAbsolute(Value)) return Match_Success; @@ -9493,8 +9444,8 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, } break; case MCK_GPRPair: - if (Op->isReg() && - MRI->getRegClass(ARM::GPRRegClassID).contains(Op->getReg())) + if (Op.isReg() && + MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg())) return Match_Success; break; } diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index e4b785def87c..228fb5756ca8 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -1092,13 +1092,13 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, if (isSub) { O << ", " << markup(""); } else if (AlwaysPrintImm0 || OffImm > 0) { O << ", " << markup(""); } O << "]" << markup(">"); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 701a6320d487..5b51a52f828a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1029,9 +1029,6 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, switch (ARM16Expr->getKind()) { default: llvm_unreachable("Unsupported ARMFixup"); case ARMMCExpr::VK_ARM_HI16: - if (Triple(STI.getTargetTriple()).isOSWindows()) - return 0; - Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movt_hi16 : ARM::fixup_arm_movt_hi16); break; diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index e3cfb05b379d..0cb795ba3ccc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -11,147 +11,12 @@ // //===----------------------------------------------------------------------===// #include "llvm/ADT/MapVector.h" +#include "llvm/MC/ConstantPools.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" using namespace llvm; - -namespace { -// A class to keep track of assembler-generated constant pools that are use to -// implement the ldr-pseudo. -class ConstantPool { - typedef SmallVector, 4> EntryVecTy; - EntryVecTy Entries; - -public: - // Initialize a new empty constant pool - ConstantPool() {} - - // Add a new entry to the constant pool in the next slot. - // \param Value is the new entry to put in the constant pool. - // - // \returns a MCExpr that references the newly inserted value - const MCExpr *addEntry(const MCExpr *Value, MCContext &Context); - - // Emit the contents of the constant pool using the provided streamer. - void emitEntries(MCStreamer &Streamer); - - // Return true if the constant pool is empty - bool empty(); -}; -} - -namespace llvm { -class AssemblerConstantPools { - // Map type used to keep track of per-Section constant pools used by the - // ldr-pseudo opcode. The map associates a section to its constant pool. The - // constant pool is a vector of (label, value) pairs. 
When the ldr - // pseudo is parsed we insert a new (label, value) pair into the constant pool - // for the current section and add MCSymbolRefExpr to the new label as - // an opcode to the ldr. After we have parsed all the user input we - // output the (label, value) pairs in each constant pool at the end of the - // section. - // - // We use the MapVector for the map type to ensure stable iteration of - // the sections at the end of the parse. We need to iterate over the - // sections in a stable order to ensure that we have print the - // constant pools in a deterministic order when printing an assembly - // file. - typedef MapVector ConstantPoolMapTy; - ConstantPoolMapTy ConstantPools; - -public: - AssemblerConstantPools() {} - ~AssemblerConstantPools() {} - - void emitAll(MCStreamer &Streamer); - void emitForCurrentSection(MCStreamer &Streamer); - const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr); - -private: - ConstantPool *getConstantPool(const MCSection *Section); - ConstantPool &getOrCreateConstantPool(const MCSection *Section); -}; -} - -// -// ConstantPool implementation -// -// Emit the contents of the constant pool using the provided streamer. -void ConstantPool::emitEntries(MCStreamer &Streamer) { - if (Entries.empty()) - return; - Streamer.EmitCodeAlignment(4); // align to 4-byte address - Streamer.EmitDataRegion(MCDR_DataRegion); - for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end(); - I != E; ++I) { - Streamer.EmitLabel(I->first); - Streamer.EmitValue(I->second, 4); - } - Streamer.EmitDataRegion(MCDR_DataRegionEnd); - Entries.clear(); -} - -const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) { - MCSymbol *CPEntryLabel = Context.CreateTempSymbol(); - - Entries.push_back(std::make_pair(CPEntryLabel, Value)); - return MCSymbolRefExpr::Create(CPEntryLabel, Context); -} - -bool ConstantPool::empty() { return Entries.empty(); } - -// -// AssemblerConstantPools implementation -// -ConstantPool * -AssemblerConstantPools::getConstantPool(const MCSection *Section) { - ConstantPoolMapTy::iterator CP = ConstantPools.find(Section); - if (CP == ConstantPools.end()) - return nullptr; - - return &CP->second; -} - -ConstantPool & -AssemblerConstantPools::getOrCreateConstantPool(const MCSection *Section) { - return ConstantPools[Section]; -} - -static void emitConstantPool(MCStreamer &Streamer, const MCSection *Section, - ConstantPool &CP) { - if (!CP.empty()) { - Streamer.SwitchSection(Section); - CP.emitEntries(Streamer); - } -} - -void AssemblerConstantPools::emitAll(MCStreamer &Streamer) { - // Dump contents of assembler constant pools. 
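The comment above on the per-section pool map calls out why MapVector is used: the pools have to be emitted in a deterministic order, and MapVector gives map-style lookup while iterating in insertion order. A tiny illustration of that property (uses the LLVM ADT header directly; the int key is just a stand-in for a section pointer):

    #include "llvm/ADT/MapVector.h"
    #include <cstdio>

    int main() {
      llvm::MapVector<int, const char *> Pools;
      Pools[3] = "pool for section C";
      Pools[1] = "pool for section A";
      Pools[2] = "pool for section B";
      // Iterates in insertion order (3, 1, 2), not key or hash order, so
      // the emitted constant pools come out in a stable sequence.
      for (auto &KV : Pools)
        std::printf("%d -> %s\n", KV.first, KV.second);
      return 0;
    }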
- for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(), - CPE = ConstantPools.end(); - CPI != CPE; ++CPI) { - const MCSection *Section = CPI->first; - ConstantPool &CP = CPI->second; - - emitConstantPool(Streamer, Section, CP); - } -} - -void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) { - const MCSection *Section = Streamer.getCurrentSection().first; - if (ConstantPool *CP = getConstantPool(Section)) { - emitConstantPool(Streamer, Section, *CP); - } -} - -const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer, - const MCExpr *Expr) { - const MCSection *Section = Streamer.getCurrentSection().first; - return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext()); -} - // // ARMTargetStreamer Implemenation // diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index ba9df6e962c9..d31f1f41c697 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -27,6 +27,8 @@ class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection) const override; + + bool recordRelocation(const MCFixup &) const override; }; unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, @@ -61,12 +63,14 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, case ARM::fixup_arm_thumb_blx: return COFF::IMAGE_REL_ARM_BLX23T; case ARM::fixup_t2_movw_lo16: - return COFF::IMAGE_REL_ARM_MOV32T; case ARM::fixup_t2_movt_hi16: - llvm_unreachable("High-word for pair-wise relocations are contiguously " - "addressed as an IMAGE_REL_ARM_MOV32T relocation"); + return COFF::IMAGE_REL_ARM_MOV32T; } } + +bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { + return static_cast(Fixup.getKind()) != ARM::fixup_t2_movt_hi16; +} } namespace llvm { diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index f61874b1cb0a..5916954850ae 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __THUMB_FRAMEINFO_H_ -#define __THUMB_FRAMEINFO_H_ +#ifndef LLVM_ARM_THUMB1FRAMELOWERING_H +#define LLVM_ARM_THUMB1FRAMELOWERING_H #include "ARMFrameLowering.h" #include "ARMSubtarget.h" diff --git a/lib/Target/ARM64/ARM64.h b/lib/Target/ARM64/ARM64.h deleted file mode 100644 index debb9002eb4b..000000000000 --- a/lib/Target/ARM64/ARM64.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the entry points for global functions defined in the LLVM -// ARM64 back-end. 
-// -//===----------------------------------------------------------------------===// - -#ifndef TARGET_ARM64_H -#define TARGET_ARM64_H - -#include "Utils/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/DataTypes.h" - -namespace llvm { - -class ARM64TargetMachine; -class FunctionPass; -class MachineFunctionPass; - -FunctionPass *createARM64DeadRegisterDefinitions(); -FunctionPass *createARM64ConditionalCompares(); -FunctionPass *createARM64AdvSIMDScalar(); -FunctionPass *createARM64BranchRelaxation(); -FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel); -FunctionPass *createARM64StorePairSuppressPass(); -FunctionPass *createARM64ExpandPseudoPass(); -FunctionPass *createARM64LoadStoreOptimizationPass(); -ModulePass *createARM64PromoteConstantPass(); -FunctionPass *createARM64AddressTypePromotionPass(); -/// \brief Creates an ARM-specific Target Transformation Info pass. -ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM); - -FunctionPass *createARM64CleanupLocalDynamicTLSPass(); - -FunctionPass *createARM64CollectLOHPass(); -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64.td b/lib/Target/ARM64/ARM64.td deleted file mode 100644 index c473205f17ca..000000000000 --- a/lib/Target/ARM64/ARM64.td +++ /dev/null @@ -1,134 +0,0 @@ -//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// ARM64 Subtarget features. -// - -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", - "Enable ARMv8 FP">; - -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; - -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable cryptographic instructions">; - -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable ARMv8 CRC-32 checksum instructions">; - -/// Cyclone has register move instructions which are "free". -def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", - "Has zero-cycle register moves">; - -/// Cyclone has instructions which zero registers for "free". 
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "ARM64RegisterInfo.td" -include "ARM64CallingConvention.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "ARM64Schedule.td" -include "ARM64InstrInfo.td" - -def ARM64InstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// ARM64 Processors supported. -// -include "ARM64SchedA53.td" -include "ARM64SchedCyclone.td" - -def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC]>; - -def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC]>; - -def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC, - FeatureZCRegMove, FeatureZCZeroing]>; - -def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, - FeatureNEON, - FeatureCRC]>; - -def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; -def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; - -//===----------------------------------------------------------------------===// -// Assembly parser -//===----------------------------------------------------------------------===// - -def GenericAsmParserVariant : AsmParserVariant { - int Variant = 0; - string Name = "generic"; -} - -def AppleAsmParserVariant : AsmParserVariant { - int Variant = 1; - string Name = "apple-neon"; -} - -//===----------------------------------------------------------------------===// -// Assembly printer -//===----------------------------------------------------------------------===// -// ARM64 Uses the MC printer for asm output, so make sure the TableGen -// AsmWriter bits get associated with the correct class. -def GenericAsmWriter : AsmWriter { - string AsmWriterClassName = "InstPrinter"; - int Variant = 0; - bit isMCAsmWriter = 1; -} - -def AppleAsmWriter : AsmWriter { - let AsmWriterClassName = "AppleInstPrinter"; - int Variant = 1; - int isMCAsmWriter = 1; -} - -//===----------------------------------------------------------------------===// -// Target Declaration -//===----------------------------------------------------------------------===// - -def ARM64 : Target { - let InstructionSet = ARM64InstrInfo; - let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; - let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; -} diff --git a/lib/Target/ARM64/ARM64AsmPrinter.cpp b/lib/Target/ARM64/ARM64AsmPrinter.cpp deleted file mode 100644 index 5531101fe2f4..000000000000 --- a/lib/Target/ARM64/ARM64AsmPrinter.cpp +++ /dev/null @@ -1,604 +0,0 @@ -//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to the ARM64 assembly language. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64MCInstLower.h" -#include "ARM64RegisterInfo.h" -#include "ARM64Subtarget.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -namespace { - -class ARM64AsmPrinter : public AsmPrinter { - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when printing asm code for different targets. - const ARM64Subtarget *Subtarget; - - ARM64MCInstLower MCInstLowering; - StackMaps SM; - -public: - ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), Subtarget(&TM.getSubtarget()), - MCInstLowering(OutContext, *Mang, *this), SM(*this), ARM64FI(nullptr), - LOHLabelCounter(0) {} - - const char *getPassName() const override { return "ARM64 Assembly Printer"; } - - /// \brief Wrapper for MCInstLowering.lowerOperand() for the - /// tblgen'erated pseudo lowering. - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return MCInstLowering.lowerOperand(MO, MCOp); - } - - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - /// \brief tblgen'erated driver function for lowering simple MI->MC - /// pseudo instructions. 
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, - const MachineInstr *MI); - - void EmitInstruction(const MachineInstr *MI) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AsmPrinter::getAnalysisUsage(AU); - AU.setPreservesAll(); - } - - bool runOnMachineFunction(MachineFunction &F) override { - ARM64FI = F.getInfo(); - return AsmPrinter::runOnMachineFunction(F); - } - -private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; - void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); - bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); - bool printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, bool isVector, - raw_ostream &O); - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - void EmitFunctionBodyEnd() override; - - MCSymbol *GetCPISymbol(unsigned CPID) const override; - void EmitEndOfAsmFile(Module &M) override; - ARM64FunctionInfo *ARM64FI; - - /// \brief Emit the LOHs contained in ARM64FI. - void EmitLOHs(); - - typedef std::map MInstToMCSymbol; - MInstToMCSymbol LOHInstToLabel; - unsigned LOHLabelCounter; -}; - -} // end of anonymous namespace - -//===----------------------------------------------------------------------===// - -void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMachO()) { - // Funny Darwin hack: This flag tells the linker that no global symbols - // contain code that falls through to other global symbols (e.g. the obvious - // implementation of multiple entry points). If this doesn't occur, the - // linker can safely perform dead code stripping. Since LLVM never - // generates code that does this, it is always safe to set. - OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - SM.serializeToStackMapSection(); - } - - // Emit a .data.rel section containing any stubs that were created. - if (Subtarget->isTargetELF()) { - const TargetLoweringObjectFileELF &TLOFELF = - static_cast(getObjFileLowering()); - - MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); - - // Output stubs for external and common global variables. - MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(0)); - } - Stubs.clear(); - } - } - -} - -MachineLocation -ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! 
" << *MI << "\n"); - } - return Location; -} - -void ARM64AsmPrinter::EmitLOHs() { - SmallVector MCArgs; - - for (const auto &D : ARM64FI->getLOHContainer()) { - for (const MachineInstr *MI : D.getArgs()) { - MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); - assert(LabelIt != LOHInstToLabel.end() && - "Label hasn't been inserted for LOH related instruction"); - MCArgs.push_back(LabelIt->second); - } - OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); - MCArgs.clear(); - } -} - -void ARM64AsmPrinter::EmitFunctionBodyEnd() { - if (!ARM64FI->getLOHRelated().empty()) - EmitLOHs(); -} - -/// GetCPISymbol - Return the symbol for the specified constant pool entry. -MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const { - // Darwin uses a linker-private symbol name for constant-pools (to - // avoid addends on the relocation?), ELF has no such concept and - // uses a normal private symbol. - if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); - - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); -} - -void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - assert(0 && ""); - case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - O << ARM64InstPrinter::getRegisterName(Reg); - break; - } - case MachineOperand::MO_Immediate: { - int64_t Imm = MO.getImm(); - O << '#' << Imm; - break; - } - } -} - -bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, - raw_ostream &O) { - unsigned Reg = MO.getReg(); - switch (Mode) { - default: - return true; // Unknown mode. - case 'w': - Reg = getWRegFromXReg(Reg); - break; - case 'x': - Reg = getXRegFromWReg(Reg); - break; - } - - O << ARM64InstPrinter::getRegisterName(Reg); - return false; -} - -// Prints the register in MO using class RC using the offset in the -// new register class. This should not be used for cross class -// printing. -bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, - bool isVector, raw_ostream &O) { - assert(MO.isReg() && "Should only get here with a register!"); - const ARM64RegisterInfo *RI = - static_cast(TM.getRegisterInfo()); - unsigned Reg = MO.getReg(); - unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); - assert(RI->regsOverlap(RegToPrint, Reg)); - O << ARM64InstPrinter::getRegisterName( - RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName); - return false; -} - -bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. - - switch (ExtraCode[0]) { - default: - return true; // Unknown modifier. - case 'w': // Print W register - case 'x': // Print X register - if (MO.isReg()) - return printAsmMRegister(MO, ExtraCode[0], O); - if (MO.isImm() && MO.getImm() == 0) { - unsigned Reg = ExtraCode[0] == 'w' ? 
ARM64::WZR : ARM64::XZR; - O << ARM64InstPrinter::getRegisterName(Reg); - return false; - } - printOperand(MI, OpNum, O); - return false; - case 'b': // Print B register. - case 'h': // Print H register. - case 's': // Print S register. - case 'd': // Print D register. - case 'q': // Print Q register. - if (MO.isReg()) { - const TargetRegisterClass *RC; - switch (ExtraCode[0]) { - case 'b': - RC = &ARM64::FPR8RegClass; - break; - case 'h': - RC = &ARM64::FPR16RegClass; - break; - case 's': - RC = &ARM64::FPR32RegClass; - break; - case 'd': - RC = &ARM64::FPR64RegClass; - break; - case 'q': - RC = &ARM64::FPR128RegClass; - break; - default: - return true; - } - return printAsmRegInClass(MO, RC, false /* vector */, O); - } - printOperand(MI, OpNum, O); - return false; - } - } - - // According to ARM, we should emit x and v registers unless we have a - // modifier. - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - - // If this is a w or x register, print an x register. - if (ARM64::GPR32allRegClass.contains(Reg) || - ARM64::GPR64allRegClass.contains(Reg)) - return printAsmMRegister(MO, 'x', O); - - // If this is a b, h, s, d, or q register, print it as a v register. - return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O); - } - - printOperand(MI, OpNum, O); - return false; -} - -bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, - const char *ExtraCode, - raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. - - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isReg() && "unexpected inline asm memory operand"); - O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]"; - return false; -} - -void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps == 4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '['; - printOperand(MI, 0, OS); - OS << '+'; - printOperand(MI, 1, OS); - OS << ']'; - OS << "+"; - printOperand(MI, NOps - 2, OS); -} - -void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - unsigned NumNOPBytes = MI.getOperand(1).getImm(); - - SM.recordStackMap(MI); - // Emit padding. 
- assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); - for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Lower a patchpoint of the form: -// [<def>], <id>, <numBytes>, <target>, <numArgs> -void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - SM.recordPatchPoint(MI); - - PatchPointOpers Opers(&MI); - - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); - unsigned EncodedBytes = 0; - if (CallTarget) { - assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && - "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 16; - // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi) - .addReg(ScratchReg) - .addImm((CallTarget >> 32) & 0xFFFF) - .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm((CallTarget >> 16) & 0xFFFF) - .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(CallTarget & 0xFFFF) - .addImm(0)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg)); - } - // Emit padding. - unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); - assert(NumBytes >= EncodedBytes && - "Patchpoint can't request size less than the length of a call."); - assert((NumBytes - EncodedBytes) % 4 == 0 && - "Invalid number of NOP bytes requested!"); - for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Simple pseudo-instructions have their lowering (with expansion to real -// instructions) auto-generated. 
-#include "ARM64GenMCPseudoLowering.inc" - -static unsigned getRealIndexedOpcode(unsigned Opc) { - switch (Opc) { - case ARM64::LDRXpre_isel: return ARM64::LDRXpre; - case ARM64::LDRWpre_isel: return ARM64::LDRWpre; - case ARM64::LDRQpre_isel: return ARM64::LDRQpre; - case ARM64::LDRDpre_isel: return ARM64::LDRDpre; - case ARM64::LDRSpre_isel: return ARM64::LDRSpre; - case ARM64::LDRBBpre_isel: return ARM64::LDRBBpre; - case ARM64::LDRHHpre_isel: return ARM64::LDRHHpre; - case ARM64::LDRSBWpre_isel: return ARM64::LDRSBWpre; - case ARM64::LDRSBXpre_isel: return ARM64::LDRSBXpre; - case ARM64::LDRSHWpre_isel: return ARM64::LDRSHWpre; - case ARM64::LDRSHXpre_isel: return ARM64::LDRSHXpre; - case ARM64::LDRSWpre_isel: return ARM64::LDRSWpre; - - case ARM64::LDRQpost_isel: return ARM64::LDRQpost; - case ARM64::LDRDpost_isel: return ARM64::LDRDpost; - case ARM64::LDRSpost_isel: return ARM64::LDRSpost; - case ARM64::LDRXpost_isel: return ARM64::LDRXpost; - case ARM64::LDRWpost_isel: return ARM64::LDRWpost; - case ARM64::LDRHHpost_isel: return ARM64::LDRHHpost; - case ARM64::LDRBBpost_isel: return ARM64::LDRBBpost; - case ARM64::LDRSWpost_isel: return ARM64::LDRSWpost; - case ARM64::LDRSHWpost_isel: return ARM64::LDRSHWpost; - case ARM64::LDRSHXpost_isel: return ARM64::LDRSHXpost; - case ARM64::LDRSBWpost_isel: return ARM64::LDRSBWpost; - case ARM64::LDRSBXpost_isel: return ARM64::LDRSBXpost; - - case ARM64::STRXpre_isel: return ARM64::STRXpre; - case ARM64::STRWpre_isel: return ARM64::STRWpre; - case ARM64::STRHHpre_isel: return ARM64::STRHHpre; - case ARM64::STRBBpre_isel: return ARM64::STRBBpre; - case ARM64::STRQpre_isel: return ARM64::STRQpre; - case ARM64::STRDpre_isel: return ARM64::STRDpre; - case ARM64::STRSpre_isel: return ARM64::STRSpre; - } - llvm_unreachable("Unexpected pre-indexed opcode!"); -} - -void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) { - // Do any auto-generated pseudo lowerings. - if (emitPseudoExpansionLowering(OutStreamer, MI)) - return; - - if (ARM64FI->getLOHRelated().count(MI)) { - // Generate a label for LOH related instruction - MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); - // Associate the instruction with the label - LOHInstToLabel[MI] = LOHLabel; - OutStreamer.EmitLabel(LOHLabel); - } - - // Do any manual lowerings. - switch (MI->getOpcode()) { - default: - break; - case ARM64::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } - // Indexed loads and stores use a pseudo to handle complex operand - // tricks and writeback to the base register. We strip off the writeback - // operand and switch the opcode here. Post-indexed stores were handled by the - // tablegen'erated pseudos above. (The complex operand <--> simple - // operand isel is beyond tablegen's ability, so we do these manually). 
- case ARM64::LDRHHpre_isel: - case ARM64::LDRBBpre_isel: - case ARM64::LDRXpre_isel: - case ARM64::LDRWpre_isel: - case ARM64::LDRQpre_isel: - case ARM64::LDRDpre_isel: - case ARM64::LDRSpre_isel: - case ARM64::LDRSBWpre_isel: - case ARM64::LDRSBXpre_isel: - case ARM64::LDRSHWpre_isel: - case ARM64::LDRSHXpre_isel: - case ARM64::LDRSWpre_isel: - case ARM64::LDRQpost_isel: - case ARM64::LDRDpost_isel: - case ARM64::LDRSpost_isel: - case ARM64::LDRXpost_isel: - case ARM64::LDRWpost_isel: - case ARM64::LDRHHpost_isel: - case ARM64::LDRBBpost_isel: - case ARM64::LDRSWpost_isel: - case ARM64::LDRSHWpost_isel: - case ARM64::LDRSHXpost_isel: - case ARM64::LDRSBWpost_isel: - case ARM64::LDRSBXpost_isel: { - MCInst TmpInst; - // For loads, the writeback operand to be skipped is the second. - TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg())); - TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm())); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::STRXpre_isel: - case ARM64::STRWpre_isel: - case ARM64::STRHHpre_isel: - case ARM64::STRBBpre_isel: - case ARM64::STRQpre_isel: - case ARM64::STRDpre_isel: - case ARM64::STRSpre_isel: { - MCInst TmpInst; - // For loads, the writeback operand to be skipped is the first. - TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg())); - TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm())); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - - // Tail calls use pseudo instructions so they have the proper code-gen - // attributes (isCall, isReturn, etc.). We lower them to the real - // instruction here. - case ARM64::TCRETURNri: { - MCInst TmpInst; - TmpInst.setOpcode(ARM64::BR); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::TCRETURNdi: { - MCOperand Dest; - MCInstLowering.lowerOperand(MI->getOperand(0), Dest); - MCInst TmpInst; - TmpInst.setOpcode(ARM64::B); - TmpInst.addOperand(Dest); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::TLSDESC_BLR: { - MCOperand Callee, Sym; - MCInstLowering.lowerOperand(MI->getOperand(0), Callee); - MCInstLowering.lowerOperand(MI->getOperand(1), Sym); - - // First emit a relocation-annotation. This expands to no code, but requests - // the following instruction gets an R_AARCH64_TLSDESC_CALL. - MCInst TLSDescCall; - TLSDescCall.setOpcode(ARM64::TLSDESCCALL); - TLSDescCall.addOperand(Sym); - EmitToStreamer(OutStreamer, TLSDescCall); - - // Other than that it's just a normal indirect call to the function loaded - // from the descriptor. - MCInst BLR; - BLR.setOpcode(ARM64::BLR); - BLR.addOperand(Callee); - EmitToStreamer(OutStreamer, BLR); - - return; - } - - case TargetOpcode::STACKMAP: - return LowerSTACKMAP(OutStreamer, SM, *MI); - - case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(OutStreamer, SM, *MI); - } - - // Finally, do the automated lowerings for everything else. - MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(OutStreamer, TmpInst); -} - -// Force static initialization. 
-extern "C" void LLVMInitializeARM64AsmPrinter() { - RegisterAsmPrinter X(TheARM64leTarget); - RegisterAsmPrinter Y(TheARM64beTarget); -} diff --git a/lib/Target/ARM64/ARM64CallingConv.h b/lib/Target/ARM64/ARM64CallingConv.h deleted file mode 100644 index f24ba59dfb9b..000000000000 --- a/lib/Target/ARM64/ARM64CallingConv.h +++ /dev/null @@ -1,94 +0,0 @@ -//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the custom routines for the ARM64 Calling Convention that -// aren't done by tablegen. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64CALLINGCONV_H -#define ARM64CALLINGCONV_H - -#include "ARM64InstrInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetInstrInfo.h" - -namespace llvm { - -/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via -/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the -/// argument is already promoted and LocVT is i1/i8/i16. We only promote the -/// argument to i32 if we are sure this argument will be passed in register. -static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State, - bool IsWebKitJS = false) { - static const MCPhysReg RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2, - ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7 }; - static const MCPhysReg RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const MCPhysReg WebKitRegList1[] = { ARM64::W0 }; - static const MCPhysReg WebKitRegList2[] = { ARM64::X0 }; - - const MCPhysReg *List1 = IsWebKitJS ? WebKitRegList1 : RegList1; - const MCPhysReg *List2 = IsWebKitJS ? WebKitRegList2 : RegList2; - - if (unsigned Reg = State.AllocateReg(List1, List2, 8)) { - // Customized extra section for handling i1/i8/i16: - // We need to promote the argument to i32 if it is not done already. - if (ValVT != MVT::i32) { - if (ArgFlags.isSExt()) - LocInfo = CCValAssign::SExt; - else if (ArgFlags.isZExt()) - LocInfo = CCValAssign::ZExt; - else - LocInfo = CCValAssign::AExt; - ValVT = MVT::i32; - } - // Set LocVT to i32 as well if passing via register. - LocVT = MVT::i32; - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return true; - } - return false; -} - -/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16 -/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only -/// uses the first register. -static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State, true); -} - -/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on -/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument -/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted, -/// it will be truncated back to i1/i8/i16. 
-static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2); - unsigned Offset12 = State.AllocateStack(Space, Space); - ValVT = LocVT; - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo)); - return true; -} - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp deleted file mode 100644 index 3b14649c487b..000000000000 --- a/lib/Target/ARM64/ARM64FrameLowering.cpp +++ /dev/null @@ -1,878 +0,0 @@ -//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64Subtarget.h" -#include "ARM64TargetMachine.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "frame-info" - -static cl::opt EnableRedZone("arm64-redzone", - cl::desc("enable use of redzone on ARM64"), - cl::init(false), cl::Hidden); - -STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); - -static unsigned estimateStackSize(MachineFunction &MF) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) - Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset + Align - 1) / Align * Align; - } - // This does not include the 16 bytes used for fp and lr. - return (unsigned)Offset; -} - -bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const { - if (!EnableRedZone) - return false; - // Don't use the red zone if the function explicitly asks us not to. - // This is typically used for kernel code. - if (MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoRedZone)) - return false; - - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - unsigned NumBytes = AFI->getLocalStackSize(); - - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. 
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; -} - -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. -bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - -#ifndef NDEBUG - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); - assert(!RegInfo->needsStackRealignment(MF) && - "No stack realignment on ARM64!"); -#endif - - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - -/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is -/// not required, we reserve argument space for call sites in the function -/// immediately on entry to the current function. This eliminates the need for -/// add/sub sp brackets around call sites. Returns true if the call frame is -/// included as part of the stack frame. -bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -void ARM64FrameLowering::eliminateCallFramePseudoInstr( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const ARM64InstrInfo *TII = - static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo()); - DebugLoc DL = I->getDebugLoc(); - int Opc = I->getOpcode(); - bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); - uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; - - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - if (!TFI->hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - - int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); - if (!IsDestroy) - Amount = -Amount; - - // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it - // doesn't have to pop anything), then the first operand will be zero too so - // this adjustment is a no-op. - if (CalleePopAmount == 0) { - // FIXME: in-function stack adjustment for calls is limited to 24-bits - // because there's no guaranteed temporary register available. - // - // ADD/SUB (immediate) has only LSL #0 and LSL #12 available. - // 1) For offset <= 12-bit, we use LSL #0 - // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses - // LSL #0, and the other uses LSL #12. - // - // Mostly call frames will be allocated at the start of a function so - // this is OK, but it is a limitation that needs dealing with. - assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII); - } - } else if (CalleePopAmount != 0) { - // If the calling convention demands that the callee pops arguments from the - // stack, we want to add it back if we have a reserved call frame. - assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII); - } - MBB.erase(I); -} - -void -ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - // Add callee saved registers to move list. 
- const std::vector &CSI = MFI->getCalleeSavedInfo(); - if (CSI.empty()) - return; - - const DataLayout *TD = MF.getTarget().getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; - for (const auto &Info : CSI) { - unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } -} - -void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. - MachineBasicBlock::iterator MBBI = MBB.begin(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *Fn = MF.getFunction(); - const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - ARM64FunctionInfo *AFI = MF.getInfo(); - bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); - bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - int NumBytes = (int)MFI->getStackSize(); - if (!AFI->hasStackFrame()) { - assert(!HasFP && "unexpected function without stack frame but with FP"); - - // All of the stack allocation is for locals. - AFI->setLocalStackSize(NumBytes); - - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); - - // REDZONE: If the stack size is less than 128 bytes, we don't need - // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { - emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); - - // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else if (NumBytes) { - ++NumRedZoneFunctions; - } - - return; - } - - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert((MBBI->getOpcode() == ARM64::STPXpre || - MBBI->getOpcode() == ARM64::STPDpre) && - MBBI->getOperand(2).getReg() == ARM64::SP && - MBBI->getOperand(3).getImm() < 0 && - (MBBI->getOperand(3).getImm() & 1) == 0); - - // Frame pointer is fp = sp - 16. 
Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - FPOffset = -(MBBI->getOperand(3).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - } - - // Move past the saves of the callee-saved registers. - while (MBBI->getOpcode() == ARM64::STPXi || - MBBI->getOpcode() == ARM64::STPDi || - MBBI->getOpcode() == ARM64::STPXpre || - MBBI->getOpcode() == ARM64::STPDpre) { - ++MBBI; - NumBytes -= 16; - } - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - if (HasFP) { - // Issue sub fp, sp, FPOffset or - // mov fp,sp when FPOffset is zero. - // Note: All stores of callee-saved registers are marked as "FrameSetup". - // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII, - MachineInstr::FrameSetup); - } - - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - - // Allocate space for the rest of the frame. - if (NumBytes) { - // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); - } - - // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. - // - // FIXME: Clarify FrameSetup flags here. - // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is - // needed. - // - if (RegInfo->hasBasePointer(MF)) - TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false); - - if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); - - // An example of the prologue: - // - // .globl __foo - // .align 2 - // __foo: - // Ltmp0: - // .cfi_startproc - // .cfi_personality 155, ___gxx_personality_v0 - // Leh_func_begin: - // .cfi_lsda 16, Lexception33 - // - // stp xa,bx, [sp, -#offset]! - // ... - // stp x28, x27, [sp, #offset-32] - // stp fp, lr, [sp, #offset-16] - // add fp, sp, #offset - 16 - // sub sp, sp, #1360 - // - // The Stack: - // +-------------------------------------------+ - // 10000 | ........ | ........ | ........ | ........ | - // 10004 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10008 | ........ | ........ | ........ | ........ | - // 1000c | ........ | ........ | ........ | ........ | - // +===========================================+ - // 10010 | X28 Register | - // 10014 | X28 Register | - // +-------------------------------------------+ - // 10018 | X27 Register | - // 1001c | X27 Register | - // +===========================================+ - // 10020 | Frame Pointer | - // 10024 | Frame Pointer | - // +-------------------------------------------+ - // 10028 | Link Register | - // 1002c | Link Register | - // +===========================================+ - // 10030 | ........ | ........ | ........ | ........ | - // 10034 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10038 | ........ | ........ | ........ | ........ | - // 1003c | ........ | ........ | ........ | ........ 
| - // +-------------------------------------------+ - // - // [sp] = 10030 :: >>initial value<< - // sp = 10020 :: stp fp, lr, [sp, #-16]! - // fp = sp == 10020 :: mov fp, sp - // [sp] == 10020 :: stp x28, x27, [sp, #-16]! - // sp == 10010 :: >>final value<< - // - // The frame pointer (w29) points to address 10020. If we use an offset of - // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 - // for w27, and -32 for w28: - // - // Ltmp1: - // .cfi_def_cfa w29, 16 - // Ltmp2: - // .cfi_offset w30, -8 - // Ltmp3: - // .cfi_offset w29, -16 - // Ltmp4: - // .cfi_offset w27, -24 - // Ltmp5: - // .cfi_offset w28, -32 - - if (HasFP) { - // Define the current CFA rule to use the provided FP. - unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } - - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { - if (MI->getOpcode() == ARM64::LDPXpost || - MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi || - MI->getOpcode() == ARM64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(1).getReg(), CSRegs) || - MI->getOperand(2).getReg() != ARM64::SP) - return false; - return true; - } - - return false; -} - -void ARM64FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64InstrInfo *TII = - static_cast(MF.getTarget().getInstrInfo()); - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - DebugLoc DL = MBBI->getDebugLoc(); - unsigned RetOpcode = MBBI->getOpcode(); - - int NumBytes = MFI->getStackSize(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - - // Initial and residual are named for consitency with the prologue. Note that - // in the epilogue, the residual adjustment is executed first. 
- uint64_t ArgumentPopSize = 0; - if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) { - MachineOperand &StackAdjust = MBBI->getOperand(1); - - // For a tail-call in a callee-pops-arguments environment, some or all of - // the stack may actually be in use for the call's arguments, this is - // calculated during LowerCall and consumed here... - ArgumentPopSize = StackAdjust.getImm(); - } else { - // ... otherwise the amount to pop is *all* of the argument space, - // conveniently stored in the MachineFunctionInfo by - // LowerFormalArguments. This will, of course, be zero for the C calling - // convention. - ArgumentPopSize = AFI->getArgumentStackToRestore(); - } - - // The stack frame should be like below, - // - // ---------------------- --- - // | | | - // | BytesInStackArgArea| CalleeArgStackSize - // | (NumReusableBytes) | (of tail call) - // | | --- - // | | | - // ---------------------| --- | - // | | | | - // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | - // | | | | - // ---------------------| | NumBytes - // | | StackSize (StackAdjustUp) - // | LocalStackSize | | | - // | (covering callee | | | - // | args) | | | - // | | | | - // ---------------------- --- --- - // - // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize - // = StackSize + ArgumentPopSize - // - // ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps - // it as the 2nd argument of ARM64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - - unsigned NumRestores = 0; - // Move past the restores of the callee-saved registers. - MachineBasicBlock::iterator LastPopI = MBBI; - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { - ++LastPopI; - --NumRestores; - } - } - NumBytes -= NumRestores * 16; - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - - if (!hasFP(MF)) { - // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII); - return; - } - - // Restore the original stack pointer. - // FIXME: Rather than doing the math here, we should instead just use - // non-post-indexed loads for the restores if we aren't actually going to - // be able to save any instructions. - if (NumBytes || MFI->hasVarSizedObjects()) - emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. -int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - -/// getFrameIndexReference - Provide a base+offset reference to an FI slot for -/// debug info. It's the same as what we use for resolving the code-gen -/// references for now. FIXME: This can go wrong when references are -/// SP-relative and simple call frames aren't used. 
-int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - return resolveFrameIndexReference(MF, FI, FrameReg); -} - -int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - const ARM64FunctionInfo *AFI = MF.getInfo(); - int FPOffset = MFI->getObjectOffset(FI) + 16; - int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); - bool isFixed = MFI->isFixedObjectIndex(FI); - - // Use frame pointer to reference fixed objects. Use it for locals if - // there are VLAs (and thus the SP isn't reliable as a base). - // Make sure useFPForScavengingIndex() does the right thing for the emergency - // spill slot. - bool UseFP = false; - if (AFI->hasStackFrame()) { - // Note: Keeping the following as multiple 'if' statements rather than - // merging to a single expression for readability. - // - // Argument access should always use the FP. - if (isFixed) { - UseFP = hasFP(MF); - } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) { - // Use SP or FP, whichever gives us the best chance of the offset - // being in range for direct access. If the FPOffset is positive, - // that'll always be best, as the SP will be even further away. - // If the FPOffset is negative, we have to keep in mind that the - // available offset range for negative offsets is smaller than for - // positive ones. If we have variable sized objects, we're stuck with - // using the FP regardless, though, as the SP offset is unknown - // and we don't have a base pointer available. If an offset is - // available via the FP and the SP, use whichever is closest. - if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || - (FPOffset >= -256 && Offset > -FPOffset)) - UseFP = true; - } - } - - if (UseFP) { - FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; - } - - // Use the base pointer if we have one. - if (RegInfo->hasBasePointer(MF)) - FrameReg = RegInfo->getBaseRegister(); - else { - FrameReg = ARM64::SP; - // If we're using the red zone for this function, the SP won't actually - // be adjusted, so the offsets will be negative. They're also all - // within range of the signed 9-bit immediate instructions. - if (canUseRedZone(MF)) - Offset -= AFI->getLocalStackSize(); - } - - return Offset; -} - -static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != ARM64::LR) - return getKillRegState(true); - - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); -} - -bool ARM64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. 
We expect the CSI - list to come in sorted by frame index so that we can issue the store - pair instructions directly. Assert if we see anything otherwise. - // - // The order of the registers in the list is controlled by - // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && - "Out of order callee saved regs!"); - unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. - // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) - // stp x20, x19, [sp, #16] // addImm(+2) - // stp fp, lr, [sp, #32] // addImm(+4) - // Rationale: This sequence saves uop updates compared to a sequence of - // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rationale and sequence for restores in epilog. - if (ARM64::GPR64RegClass.contains(Reg1)) { - assert(ARM64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = ARM64::STPXpre; - else - StrOpc = ARM64::STPXi; - } else if (ARM64::FPR64RegClass.contains(Reg1)) { - assert(ARM64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = ARM64::STPDpre; - else - StrOpc = ARM64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - BuildMI(MBB, MI, DL, TII.get(StrOpc)) - .addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) - .addReg(ARM64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit - .setMIFlag(MachineInstr::FrameSetup); - } - return true; -} - -bool ARM64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. 
Only - // the last load is sp-pi post-increment and de-allocates the stack: - // For example: - // ldp fp, lr, [sp, #32] // addImm(+4) - // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) - // Note: see comment in spillCalleeSavedRegisters() - unsigned LdrOpc; - - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (ARM64::GPR64RegClass.contains(Reg1)) { - assert(ARM64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = ARM64::LDPXpost; - else - LdrOpc = ARM64::LDPXi; - } else if (ARM64::FPR64RegClass.contains(Reg1)) { - assert(ARM64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = ARM64::LDPDpost; - else - LdrOpc = ARM64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); - BuildMI(MBB, MI, DL, TII.get(LdrOpc)) - .addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) - .addReg(ARM64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit - } - return true; -} - -void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - ARM64FunctionInfo *AFI = MF.getInfo(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); - SmallVector UnspilledCSGPRs; - SmallVector UnspilledCSFPRs; - - // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF)) { - MRI->setPhysRegUsed(ARM64::FP); - MRI->setPhysRegUsed(ARM64::LR); - } - - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. - if (RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(RegInfo->getBaseRegister()); - - // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; - bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((ARM64::GPR64RegClass.contains(OddReg) && - ARM64::GPR64RegClass.contains(EvenReg)) ^ - (ARM64::FPR64RegClass.contains(OddReg) && - ARM64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); - const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. 
- if (!OddRegUsed && !EvenRegUsed) { - if (ARM64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); - } - continue; - } - - unsigned Reg = ARM64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - MRI->setPhysRegUsed(Reg); - } - - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (ARM64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; - - CanEliminateFrame = false; - } - - // FIXME: Set BigStack if any stack slot references may be out of range. - // For now, just conservatively guestimate based on unscaled indexing - // range. We'll end up allocating an unnecessary spill slot a lot, but - // realistically that's not a big deal at this stage of the game. - // The CSR spill slots have not been allocated yet, so estimateStackSize - // won't include them. - MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); - DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); - bool BigStack = (CFSize >= 256); - if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) - AFI->setHasStackFrame(true); - - // Estimate if we might need to scavenge a register at some point in order - // to materialize a stack offset. If so, either spill one additional - // callee-saved register or reserve a special spill slot to facilitate - // register scavenging. If we already spilled an extra callee-saved register - // above to keep the number of spills even, we don't need to do anything else - // here. - if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - MRI->setPhysRegUsed(Reg); - ExtraCSSpill = true; - ++Count; - } - - // If we didn't find an extra callee-saved register to spill, create - // an emergency spill slot. 
- if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &ARM64::GPR64RegClass; - int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); - RS->addScavengingFrameIndex(FI); - DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI - << " as the emergency spill slot.\n"); - } - } -} diff --git a/lib/Target/ARM64/ARM64FrameLowering.h b/lib/Target/ARM64/ARM64FrameLowering.h deleted file mode 100644 index 1991a0a18dd6..000000000000 --- a/lib/Target/ARM64/ARM64FrameLowering.h +++ /dev/null @@ -1,75 +0,0 @@ -//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_FRAMELOWERING_H -#define ARM64_FRAMELOWERING_H - -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64FrameLowering : public TargetFrameLowering { - const ARM64TargetMachine &TM; - -public: - explicit ARM64FrameLowering(const ARM64TargetMachine &TM, - const ARM64Subtarget &STI) - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, - false /*StackRealignable*/), - TM(TM) {} - - void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, - bool PreferFP = false) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - /// \brief Can this function use the red zone for local allocations. - bool canUseRedZone(const MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const override; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp deleted file mode 100644 index 4a1f9717bf73..000000000000 --- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ /dev/null @@ -1,2987 +0,0 @@ -//===-- ARM64ISelDAGToDAG.cpp - A dag to dag inst selector for ARM64 ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the ARM64 target. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/APSInt.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" // To access function attributes. -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-isel" - -//===--------------------------------------------------------------------===// -/// ARM64DAGToDAGISel - ARM64 specific code to select ARM64 machine -/// instructions for SelectionDAG operations. -/// -namespace { - -class ARM64DAGToDAGISel : public SelectionDAGISel { - ARM64TargetMachine &TM; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - - bool ForCodeSize; - -public: - explicit ARM64DAGToDAGISel(ARM64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(&TM.getSubtarget()), ForCodeSize(false) {} - - const char *getPassName() const override { - return "ARM64 Instruction Selection"; - } - - bool runOnMachineFunction(MachineFunction &MF) override { - AttributeSet FnAttrs = MF.getFunction()->getAttributes(); - ForCodeSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); - return SelectionDAGISel::runOnMachineFunction(MF); - } - - SDNode *Select(SDNode *Node) override; - - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. 
- bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps) override; - - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); - bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); - bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, false, Reg, Shift); - } - bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, true, Reg, Shift); - } - bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 1, Base, OffImm); - } - bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 2, Base, OffImm); - } - bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 4, Base, OffImm); - } - bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 8, Base, OffImm); - } - bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 16, Base, OffImm); - } - bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 1, Base, OffImm); - } - bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 2, Base, OffImm); - } - bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 4, Base, OffImm); - } - bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 8, Base, OffImm); - } - bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 16, Base, OffImm); - } - - bool SelectAddrModeRO8(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 1, Base, Offset, Imm); - } - bool SelectAddrModeRO16(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 2, Base, Offset, Imm); - } - bool SelectAddrModeRO32(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 4, Base, Offset, Imm); - } - bool SelectAddrModeRO64(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 8, Base, Offset, Imm); - } - bool SelectAddrModeRO128(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 16, Base, Offset, Imm); - } - bool SelectAddrModeNoIndex(SDValue N, SDValue &Val); - - /// Form sequences of consecutive 64/128-bit registers for use in NEON - /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have - /// between 1 and 4 elements. If it contains a single element that is returned - /// unchanged; otherwise a REG_SEQUENCE value is returned. - SDValue createDTuple(ArrayRef Vecs); - SDValue createQTuple(ArrayRef Vecs); - - /// Generic helper for the createDTuple/createQTuple - /// functions. Those should almost always be called instead. 
- SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], - unsigned SubRegs[]); - - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); - - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); - SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); - - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); - -// Include the pieces autogenerated from the target description. -#include "ARM64GenDAGISel.inc" - -private: - bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, - SDValue &Shift); - bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeRO(SDValue N, unsigned Size, SDValue &Base, - SDValue &Offset, SDValue &Imm); - bool isWorthFolding(SDValue V) const; - bool SelectExtendedSHL(SDValue N, unsigned Size, SDValue &Offset, - SDValue &Imm); - - template - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { - return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); - } - - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); -}; -} // end anonymous namespace - -/// isIntImmediate - This method tests to see if the node is a constant -/// operand. If so Imm will receive the 32-bit value. -static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { - if (const ConstantSDNode *C = dyn_cast(N)) { - Imm = C->getZExtValue(); - return true; - } - return false; -} - -// isIntImmediate - This method tests to see if a constant operand. -// If so Imm will receive the value. -static bool isIntImmediate(SDValue N, uint64_t &Imm) { - return isIntImmediate(N.getNode(), Imm); -} - -// isOpcWithIntImmediate - This method tests to see if the node is a specific -// opcode and that it has a immediate integer right operand. -// If so Imm will receive the 32 bit value. -static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, - uint64_t &Imm) { - return N->getOpcode() == Opc && - isIntImmediate(N->getOperand(1).getNode(), Imm); -} - -bool ARM64DAGToDAGISel::SelectAddrModeNoIndex(SDValue N, SDValue &Val) { - EVT ValTy = N.getValueType(); - if (ValTy != MVT::i64) - return false; - Val = N; - return true; -} - -bool ARM64DAGToDAGISel::SelectInlineAsmMemoryOperand( - const SDValue &Op, char ConstraintCode, std::vector &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - // Require the address to be in a register. That is safe for all ARM64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); - return false; -} - -/// SelectArithImmed - Select an immediate value that can be represented as -/// a 12-bit value shifted left by either 0 or 12. 
If so, return true with -/// Val set to the 12-bit value and Shift set to the shifter operand. -bool ARM64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, - SDValue &Shift) { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - if (!isa(N.getNode())) - return false; - - uint64_t Immed = cast(N.getNode())->getZExtValue(); - unsigned ShiftAmt; - - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else - return false; - - unsigned ShVal = ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt); - Val = CurDAG->getTargetConstant(Immed, MVT::i32); - Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); - return true; -} - -/// SelectNegArithImmed - As above, but negates the value before trying to -/// select it. -bool ARM64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, - SDValue &Shift) { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - if (!isa(N.getNode())) - return false; - - // The immediate operand must be a 24-bit zero-extended immediate. - uint64_t Immed = cast(N.getNode())->getZExtValue(); - - // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" - // have the opposite effect on the C flag, so this pattern mustn't match under - // those circumstances. - if (Immed == 0) - return false; - - if (N.getValueType() == MVT::i32) - Immed = ~((uint32_t)Immed) + 1; - else - Immed = ~Immed + 1ULL; - if (Immed & 0xFFFFFFFFFF000000ULL) - return false; - - Immed &= 0xFFFFFFULL; - return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift); -} - -/// getShiftTypeForNode - Translate a shift node to the corresponding -/// ShiftType value. -static ARM64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { - switch (N.getOpcode()) { - default: - return ARM64_AM::InvalidShiftExtend; - case ISD::SHL: - return ARM64_AM::LSL; - case ISD::SRL: - return ARM64_AM::LSR; - case ISD::SRA: - return ARM64_AM::ASR; - case ISD::ROTR: - return ARM64_AM::ROR; - } -} - -/// \brief Determine wether it is worth to fold V into an extended register. -bool ARM64DAGToDAGISel::isWorthFolding(SDValue V) const { - // it hurts if the a value is used at least twice, unless we are optimizing - // for code size. - if (ForCodeSize || V.hasOneUse()) - return true; - return false; -} - -/// SelectShiftedRegister - Select a "shifted register" operand. If the value -/// is not shifted, set the Shift operand to default of "LSL 0". The logical -/// instructions allow the shifted register to be rotated, but the arithmetic -/// instructions do not. The AllowROR parameter specifies whether ROR is -/// supported. 
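// Editorial aside -- not part of the patch. A minimal standalone sketch of the
// immediate check performed by SelectArithImmed above: an add/sub immediate is
// directly encodable only as a 12-bit value optionally shifted left by 12.
// (SelectNegArithImmed simply negates the value first and rejects anything
// wider than 24 bits.) The function names below are illustrative only and do
// not exist in the backend.
#include <cassert>
#include <cstdint>

// Returns true and sets Val/ShiftAmt if Imm fits the "uimm12, LSL #0 or #12" form.
static bool encodeArithImmed(uint64_t Imm, unsigned &Val, unsigned &ShiftAmt) {
  if (Imm >> 12 == 0) {                       // fits in the low 12 bits
    Val = unsigned(Imm); ShiftAmt = 0; return true;
  }
  if ((Imm & 0xfff) == 0 && Imm >> 24 == 0) { // 12-bit value shifted left by 12
    Val = unsigned(Imm >> 12); ShiftAmt = 12; return true;
  }
  return false;
}

int main() {
  unsigned V, S;
  assert(encodeArithImmed(0xabc, V, S) && S == 0);     // add x0, x1, #0xabc
  assert(encodeArithImmed(0xabc000, V, S) && S == 12); // add x0, x1, #0xabc, lsl #12
  assert(!encodeArithImmed(0xabc00, V, S));            // low bits set and too wide: needs a register
  assert(!encodeArithImmed(0x1000000, V, S));          // too wide even when shifted
  return 0;
}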
-bool ARM64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, - SDValue &Reg, SDValue &Shift) { - ARM64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); - if (ShType == ARM64_AM::InvalidShiftExtend) - return false; - if (!AllowROR && ShType == ARM64_AM::ROR) - return false; - - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - unsigned BitSize = N.getValueType().getSizeInBits(); - unsigned Val = RHS->getZExtValue() & (BitSize - 1); - unsigned ShVal = ARM64_AM::getShifterImm(ShType, Val); - - Reg = N.getOperand(0); - Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); - return isWorthFolding(N); - } - - return false; -} - -/// getExtendTypeForNode - Translate an extend node to the corresponding -/// ExtendType value. -static ARM64_AM::ShiftExtendType -getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { - if (N.getOpcode() == ISD::SIGN_EXTEND || - N.getOpcode() == ISD::SIGN_EXTEND_INREG) { - EVT SrcVT; - if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) - SrcVT = cast(N.getOperand(1))->getVT(); - else - SrcVT = N.getOperand(0).getValueType(); - - if (!IsLoadStore && SrcVT == MVT::i8) - return ARM64_AM::SXTB; - else if (!IsLoadStore && SrcVT == MVT::i16) - return ARM64_AM::SXTH; - else if (SrcVT == MVT::i32) - return ARM64_AM::SXTW; - else if (SrcVT == MVT::i64) - return ARM64_AM::SXTX; - - return ARM64_AM::InvalidShiftExtend; - } else if (N.getOpcode() == ISD::ZERO_EXTEND || - N.getOpcode() == ISD::ANY_EXTEND) { - EVT SrcVT = N.getOperand(0).getValueType(); - if (!IsLoadStore && SrcVT == MVT::i8) - return ARM64_AM::UXTB; - else if (!IsLoadStore && SrcVT == MVT::i16) - return ARM64_AM::UXTH; - else if (SrcVT == MVT::i32) - return ARM64_AM::UXTW; - else if (SrcVT == MVT::i64) - return ARM64_AM::UXTX; - - return ARM64_AM::InvalidShiftExtend; - } else if (N.getOpcode() == ISD::AND) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return ARM64_AM::InvalidShiftExtend; - uint64_t AndMask = CSD->getZExtValue(); - - switch (AndMask) { - default: - return ARM64_AM::InvalidShiftExtend; - case 0xFF: - return !IsLoadStore ? ARM64_AM::UXTB : ARM64_AM::InvalidShiftExtend; - case 0xFFFF: - return !IsLoadStore ? ARM64_AM::UXTH : ARM64_AM::InvalidShiftExtend; - case 0xFFFFFFFF: - return ARM64_AM::UXTW; - } - } - - return ARM64_AM::InvalidShiftExtend; -} - -// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. -static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { - if (DL->getOpcode() != ARM64ISD::DUPLANE16 && - DL->getOpcode() != ARM64ISD::DUPLANE32) - return false; - - SDValue SV = DL->getOperand(0); - if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) - return false; - - SDValue EV = SV.getOperand(1); - if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) - return false; - - ConstantSDNode *DLidx = cast(DL->getOperand(1).getNode()); - ConstantSDNode *EVidx = cast(EV.getOperand(1).getNode()); - LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); - LaneOp = EV.getOperand(0); - - return true; -} - -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a -// high lane extract. -static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, - SDValue &LaneOp, int &LaneIdx) { - - if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { - std::swap(Op0, Op1); - if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) - return false; - } - StdOp = Op1; - return true; -} - -/// SelectMLAV64LaneV128 - ARM64 supports vector MLAs where one multiplicand is -/// a lane in the upper half of a 128-bit vector. 
Recognize and select this so -/// that we don't emit unnecessary lane extracts. -SDNode *ARM64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. - SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. - int LaneIdx = -1; // Will hold the lane index. - - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) { - std::swap(Op0, Op1); - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) - return nullptr; - } - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; - - unsigned MLAOpc = ~0U; - - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized MLA."); - case MVT::v4i16: - MLAOpc = ARM64::MLAv4i16_indexed; - break; - case MVT::v8i16: - MLAOpc = ARM64::MLAv8i16_indexed; - break; - case MVT::v2i32: - MLAOpc = ARM64::MLAv2i32_indexed; - break; - case MVT::v4i32: - MLAOpc = ARM64::MLAv4i32_indexed; - break; - } - - return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { - SDValue SMULLOp0; - SDValue SMULLOp1; - int LaneIdx; - - if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, - LaneIdx)) - return nullptr; - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; - - unsigned SMULLOpc = ~0U; - - if (IntNo == Intrinsic::arm64_neon_smull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::SMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::SMULLv2i32_indexed; - break; - } - } else if (IntNo == Intrinsic::arm64_neon_umull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::UMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::UMULLv2i32_indexed; - break; - } - } else - llvm_unreachable("Unrecognized intrinsic."); - - return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); -} - -/// SelectArithExtendedRegister - Select a "extended register" operand. This -/// operand folds in an extend followed by an optional left shift. -bool ARM64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, - SDValue &Shift) { - unsigned ShiftVal = 0; - ARM64_AM::ShiftExtendType Ext; - - if (N.getOpcode() == ISD::SHL) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return false; - ShiftVal = CSD->getZExtValue(); - if (ShiftVal > 4) - return false; - - Ext = getExtendTypeForNode(N.getOperand(0)); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Reg = N.getOperand(0).getOperand(0); - } else { - Ext = getExtendTypeForNode(N); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Reg = N.getOperand(0); - } - - // ARM64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, - // if we're folding a (sext i8), we need the RHS to be a GPR32, even though - // there might not be an actual 32-bit value in the program. We can - // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. 
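// Editorial aside -- not part of the patch. A small standalone sketch of the
// classification done by getExtendTypeForNode above, which this function
// (SelectArithExtendedRegister) relies on: the source width and signedness of
// an extend pick one of the UXTB..SXTX operand modifiers, and an AND with
// 0xFF/0xFFFF/0xFFFFFFFF acts as the corresponding zero-extend. The enum and
// helper names here are illustrative, not the backend's.
// (For load/store addressing the backend additionally rejects the 8- and
// 16-bit forms; that distinction is not modelled here.)
#include <cassert>
#include <cstdint>

enum Extend { UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX, Invalid };

// Extend kind implied by (sign|zero)-extending a value of SrcBits bits.
static Extend extendFromWidth(unsigned SrcBits, bool Signed) {
  switch (SrcBits) {
  case 8:  return Signed ? SXTB : UXTB;
  case 16: return Signed ? SXTH : UXTH;
  case 32: return Signed ? SXTW : UXTW;
  case 64: return Signed ? SXTX : UXTX;
  default: return Invalid;
  }
}

// Extend kind implied by masking with an AND immediate (zero-extension only).
static Extend extendFromAndMask(uint64_t Mask) {
  if (Mask == 0xFF)       return UXTB;
  if (Mask == 0xFFFF)     return UXTH;
  if (Mask == 0xFFFFFFFF) return UXTW;
  return Invalid;
}

int main() {
  assert(extendFromWidth(8, true) == SXTB);  // e.g. add x0, x1, w2, sxtb
  assert(extendFromAndMask(0xFFFF) == UXTH); // (and x, 0xffff) folds as uxth
  assert(extendFromAndMask(0xFF00) == Invalid);
  return 0;
}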
- if (Reg.getValueType() == MVT::i64 && Ext != ARM64_AM::UXTX && - Ext != ARM64_AM::SXTX) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, Reg, SubReg); - Reg = SDValue(Node, 0); - } - - Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); - return isWorthFolding(N); -} - -/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit -/// immediate" address. The "Size" argument is the size in bytes of the memory -/// reference, which determines the scale. -bool ARM64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - const TargetLowering *TLI = getTargetLowering(); - if (N.getOpcode() == ISD::FrameIndex) { - int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; - } - - if (N.getOpcode() == ARM64ISD::ADDlow) { - GlobalAddressSDNode *GAN = - dyn_cast(N.getOperand(1).getNode()); - Base = N.getOperand(0); - OffImm = N.getOperand(1); - if (!GAN) - return true; - - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - const DataLayout *DL = TLI->getDataLayout(); - if (Alignment == 0 && !Subtarget->isTargetDarwin()) - Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); - - if (Alignment >= Size) - return true; - } - - if (CurDAG->isBaseWithConstantOffset(N)) { - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - unsigned Scale = Log2_32(Size); - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); - return true; - } - } - } - - // Before falling back to our general case, check if the unscaled - // instructions can handle this. If so, that's preferable. - if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) - return false; - - // Base only. The address will be materialized into a register before - // the memory is accessed. - // add x0, Xbase, #offset - // ldr x0, [x0] - Base = N; - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; -} - -/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit -/// immediate" address. This should only match when there is an offset that -/// is not valid for a scaled immediate addressing mode. The "Size" argument -/// is the size in bytes of the memory reference, which is needed here to know -/// what is valid for a scaled immediate. -bool ARM64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = RHS->getSExtValue(); - // If the offset is valid as a scaled immediate, don't match here. 
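// Editorial aside -- not part of the patch. A standalone sketch of the offset
// classification used by SelectAddrModeIndexed above and by the check that
// follows in SelectAddrModeUnscaled: a "scaled" LDR/STR takes an unsigned
// 12-bit offset measured in units of the access size, while the "unscaled"
// LDUR/STUR form takes any signed 9-bit byte offset. The helper names below
// are illustrative only.
#include <cassert>
#include <cstdint>

// True if Offset can use the scaled uimm12 form for an access of Size bytes.
static bool fitsScaled(int64_t Offset, unsigned Size /* power of two */) {
  unsigned Scale = 0;
  for (unsigned S = Size; S > 1; S >>= 1) ++Scale;      // Scale = log2(Size)
  return (Offset & (Size - 1)) == 0 && Offset >= 0 &&
         Offset < (int64_t(0x1000) << Scale);
}

// True if Offset can use the unscaled simm9 form instead.
static bool fitsUnscaled(int64_t Offset) {
  return Offset >= -256 && Offset < 256;
}

int main() {
  assert(fitsScaled(32, 8));        // ldr x0, [x1, #32]
  assert(!fitsScaled(-16, 8));      // negative: not scaled...
  assert(fitsUnscaled(-16));        // ...but fine for ldur
  assert(!fitsScaled(12, 8));       // not a multiple of the access size
  assert(fitsUnscaled(12));
  assert(!fitsScaled(8 * 4096, 8)); // one past the uimm12 range for 8-byte accesses
  return 0;
}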
- if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && - RHSC < (0x1000 << Log2_32(Size))) - return false; - if (RHSC >= -256 && RHSC < 256) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - const TargetLowering *TLI = getTargetLowering(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); - return true; - } - } - return false; -} - -static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - SDValue ImpDef = SDValue( - CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), - 0); - MachineSDNode *Node = CurDAG->getMachineNode( - TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); - return SDValue(Node, 0); -} - -static SDValue WidenIfNeeded(SelectionDAG *CurDAG, SDValue N) { - if (N.getValueType() == MVT::i32) { - return Widen(CurDAG, N); - } - - return N; -} - -/// \brief Check if the given SHL node (\p N), can be used to form an -/// extended register for an addressing mode. -bool ARM64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, - SDValue &Offset, SDValue &Imm) { - assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (CSD && (CSD->getZExtValue() & 0x7) == CSD->getZExtValue()) { - - ARM64_AM::ShiftExtendType Ext = getExtendTypeForNode(N.getOperand(0), true); - if (Ext == ARM64_AM::InvalidShiftExtend) { - Ext = ARM64_AM::UXTX; - Offset = WidenIfNeeded(CurDAG, N.getOperand(0)); - } else { - Offset = WidenIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); - } - - unsigned LegalShiftVal = Log2_32(Size); - unsigned ShiftVal = CSD->getZExtValue(); - - if (ShiftVal != 0 && ShiftVal != LegalShiftVal) - return false; - - Imm = CurDAG->getTargetConstant( - ARM64_AM::getMemExtendImm(Ext, ShiftVal != 0), MVT::i32); - if (isWorthFolding(N)) - return true; - } - return false; -} - -bool ARM64DAGToDAGISel::SelectAddrModeRO(SDValue N, unsigned Size, - SDValue &Base, SDValue &Offset, - SDValue &Imm) { - if (N.getOpcode() != ISD::ADD) - return false; - SDValue LHS = N.getOperand(0); - SDValue RHS = N.getOperand(1); - - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. - if (isa(LHS) || isa(RHS)) - return false; - - // Check if this particular node is reused in any non-memory related - // operation. If yes, do not try to fold this node into the address - // computation, since the computation will be kept. - const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { - if (!isa(*UI)) - return false; - } - - // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); - - // Try to match a shifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(RHS, Size, Offset, Imm)) { - Base = LHS; - return true; - } - - // Try to match a shifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(LHS, Size, Offset, Imm)) { - Base = RHS; - return true; - } - - ARM64_AM::ShiftExtendType Ext = ARM64_AM::UXTX; - // Try to match an unshifted extend on the LHS. 
- if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(LHS, true)) != ARM64_AM::InvalidShiftExtend) { - Base = RHS; - Offset = WidenIfNeeded(CurDAG, LHS.getOperand(0)); - Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false), - MVT::i32); - if (isWorthFolding(LHS)) - return true; - } - - // Try to match an unshifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(RHS, true)) != ARM64_AM::InvalidShiftExtend) { - Base = LHS; - Offset = WidenIfNeeded(CurDAG, RHS.getOperand(0)); - Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false), - MVT::i32); - if (isWorthFolding(RHS)) - return true; - } - - // Match any non-shifted, non-extend, non-immediate add expression. - Base = LHS; - Offset = WidenIfNeeded(CurDAG, RHS); - Ext = ARM64_AM::UXTX; - Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false), - MVT::i32); - // Reg1 + Reg2 is free: no check needed. - return true; -} - -SDValue ARM64DAGToDAGISel::createDTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::DDRegClassID, ARM64::DDDRegClassID, - ARM64::DDDDRegClassID }; - static unsigned SubRegs[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createQTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::QQRegClassID, ARM64::QQQRegClassID, - ARM64::QQQQRegClassID }; - static unsigned SubRegs[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createTuple(ArrayRef Regs, - unsigned RegClassIDs[], - unsigned SubRegs[]) { - // There's no special register-class for a vector-list of 1 element: it's just - // a vector. - if (Regs.size() == 1) - return Regs[0]; - - assert(Regs.size() >= 2 && Regs.size() <= 4); - - SDLoc DL(Regs[0].getNode()); - - SmallVector Ops; - - // First operand of REG_SEQUENCE is the desired RegClass. - Ops.push_back( - CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); - - // Then we get pairs of source & subregister-position for the components. - for (unsigned i = 0; i < Regs.size(); ++i) { - Ops.push_back(Regs[i]); - Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); - } - - SDNode *N = - CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); - return SDValue(N, 0); -} - -SDNode *ARM64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - - unsigned ExtOff = isExt; - - // Form a REG_SEQUENCE to force register allocation. - unsigned Vec0Off = ExtOff + 1; - SmallVector Regs(N->op_begin() + Vec0Off, - N->op_begin() + Vec0Off + NumVecs); - SDValue RegSeq = createQTuple(Regs); - - SmallVector Ops; - if (isExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { - LoadSDNode *LD = cast(N); - if (LD->isUnindexed()) - return nullptr; - EVT VT = LD->getMemoryVT(); - EVT DstVT = N->getValueType(0); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; - - // We're not doing validity checking here. That was done when checking - // if we should mark the load as indexed or not. We're just selecting - // the right instruction. 
- unsigned Opcode = 0; - - ISD::LoadExtType ExtType = LD->getExtensionType(); - bool InsertTo64 = false; - if (VT == MVT::i64) - Opcode = IsPre ? ARM64::LDRXpre_isel : ARM64::LDRXpost_isel; - else if (VT == MVT::i32) { - if (ExtType == ISD::NON_EXTLOAD) - Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel; - else if (ExtType == ISD::SEXTLOAD) - Opcode = IsPre ? ARM64::LDRSWpre_isel : ARM64::LDRSWpost_isel; - else { - Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel; - InsertTo64 = true; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i16) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSHXpre_isel : ARM64::LDRSHXpost_isel; - else - Opcode = IsPre ? ARM64::LDRSHWpre_isel : ARM64::LDRSHWpost_isel; - } else { - Opcode = IsPre ? ARM64::LDRHHpre_isel : ARM64::LDRHHpost_isel; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i8) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSBXpre_isel : ARM64::LDRSBXpost_isel; - else - Opcode = IsPre ? ARM64::LDRSBWpre_isel : ARM64::LDRSBWpost_isel; - } else { - Opcode = IsPre ? ARM64::LDRBBpre_isel : ARM64::LDRBBpost_isel; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::f32) { - Opcode = IsPre ? ARM64::LDRSpre_isel : ARM64::LDRSpost_isel; - } else if (VT == MVT::f64 || VT.is64BitVector()) { - Opcode = IsPre ? ARM64::LDRDpre_isel : ARM64::LDRDpost_isel; - } else if (VT.is128BitVector()) { - Opcode = IsPre ? ARM64::LDRQpre_isel : ARM64::LDRQpost_isel; - } else - return nullptr; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - ConstantSDNode *OffsetOp = cast(LD->getOffset()); - int OffsetVal = (int)OffsetOp->getZExtValue(); - SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); - SDValue Ops[] = { Base, Offset, Chain }; - SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), DstVT, MVT::i64, - MVT::Other, Ops); - // Either way, we're replacing the node, so tell the caller that. 
- Done = true; - if (InsertTo64) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - SDNode *Sub = CurDAG->getMachineNode( - ARM64::SUBREG_TO_REG, SDLoc(N), MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), SDValue(Res, 0), SubReg); - ReplaceUses(SDValue(N, 0), SDValue(Sub, 0)); - ReplaceUses(SDValue(N, 1), SDValue(Res, 1)); - ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - return nullptr; - } - return Res; -} - -SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(2)); // Mem operand; - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - for (unsigned i = 0; i < NumVecs; ++i) - ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(1)); // Mem operand - Ops.push_back(N->getOperand(2)); // Incremental - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Update uses of write back register - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); - - // Update uses of vector list - SDValue SuperReg = SDValue(Ld, 1); - if (NumVecs == 1) - ReplaceUses(SDValue(N, 0), SuperReg); - else - for (unsigned i = 0; i < NumVecs; ++i) - ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); - - // Update the chain - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 2)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - - return St; -} - -SDNode *ARM64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - SmallVector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); // Type for the Chain - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - SDValue RegSeq = Is128Bit ? 
createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 1)); // base register - Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental - Ops.push_back(N->getOperand(0)); // Chain - SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - return St; -} - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -class WidenVector { - SelectionDAG &DAG; - -public: - WidenVector(SelectionDAG &DAG) : DAG(DAG) {} - - SDValue operator()(SDValue V64Reg) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - SDValue Undef = - SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); - return DAG.getTargetInsertSubreg(ARM64::dsub, DL, WideTy, Undef, V64Reg); - } -}; - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - - return DAG.getTargetExtractSubreg(ARM64::dsub, SDLoc(V128Reg), NarrowTy, - V128Reg); -} - -SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - - EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, - ARM64::qsub3 }; - for (unsigned i = 0; i < NumVecs; ++i) { - SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); - if (Narrow) - NV = NarrowVector(NV, *CurDAG); - ReplaceUses(SDValue(N, i), NV); - } - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return Ld; -} - -SDNode *ARM64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. 
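// Editorial aside -- not part of the patch. The REG_SEQUENCE formed here via
// createQTuple (see createTuple earlier in this file) lists the register class
// of the whole tuple first, then each vector register paired with the
// sub-register slot it occupies. A toy sketch of that operand layout, using
// strings in place of SDValues; every name below is illustrative only.
#include <cassert>
#include <string>
#include <vector>

static std::vector<std::string>
tupleOperands(const std::vector<std::string> &Regs) {
  // Register classes for 2-, 3- and 4-element Q tuples, indexed by size - 2.
  static const char *RegClassIDs[] = {"QQ", "QQQ", "QQQQ"};
  static const char *SubRegs[] = {"qsub0", "qsub1", "qsub2", "qsub3"};
  assert(Regs.size() >= 2 && Regs.size() <= 4);

  std::vector<std::string> Ops;
  Ops.push_back(RegClassIDs[Regs.size() - 2]); // desired register class first
  for (size_t i = 0; i < Regs.size(); ++i) {   // then (value, position) pairs
    Ops.push_back(Regs[i]);
    Ops.push_back(SubRegs[i]);
  }
  return Ops;
}

int main() {
  std::vector<std::string> Ops = tupleOperands({"q0", "q1", "q2"});
  // Models: REG_SEQUENCE QQQ, q0, qsub0, q1, qsub1, q2, qsub2
  assert(Ops.size() == 7 && Ops[0] == "QQQ" && Ops[4] == "qsub1");
  return 0;
}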
- SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - std::vector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number - Ops.push_back(N->getOperand(NumVecs + 2)); // Base register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Update uses of the write back register - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); - - // Update uses of the vector list - SDValue SuperReg = SDValue(Ld, 1); - if (NumVecs == 1) { - ReplaceUses(SDValue(N, 0), - Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); - } else { - EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, - ARM64::qsub3 }; - for (unsigned i = 0; i < NumVecs; ++i) { - SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, - SuperReg); - if (Narrow) - NV = NarrowVector(NV, *CurDAG); - ReplaceUses(SDValue(N, i), NV); - } - } - - // Update the Chain - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - - return Ld; -} - -SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; -} - -SDNode *ARM64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - SmallVector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Transfer memoperands. 
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; -} - -static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, - unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, - unsigned NumberOfIgnoredLowBits, - bool BiggerPattern) { - assert(N->getOpcode() == ISD::AND && - "N must be a AND operation to call this function"); - - EVT VT = N->getValueType(0); - - // Here we can test the type of VT and return false when the type does not - // match, but since it is done prior to that call in the current context - // we turned that into an assert to avoid redundant code. - assert((VT == MVT::i32 || VT == MVT::i64) && - "Type checking must have been done before calling this function"); - - // FIXME: simplify-demanded-bits in DAGCombine will probably have - // changed the AND node to a 32-bit mask operation. We'll have to - // undo that as part of the transform here if we want to catch all - // the opportunities. - // Currently the NumberOfIgnoredLowBits argument helps to recover - // form these situations when matching bigger pattern (bitfield insert). - - // For unsigned extracts, check for a shift right and mask - uint64_t And_imm = 0; - if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) - return false; - - const SDNode *Op0 = N->getOperand(0).getNode(); - - // Because of simplify-demanded-bits in DAGCombine, the mask may have been - // simplified. Try to undo that - And_imm |= (1 << NumberOfIgnoredLowBits) - 1; - - // The immediate is a mask of the low bits iff imm & (imm+1) == 0 - if (And_imm & (And_imm + 1)) - return false; - - bool ClampMSB = false; - uint64_t Srl_imm = 0; - // Handle the SRL + ANY_EXTEND case. - if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { - // Extend the incoming operand of the SRL to 64-bit. - Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); - // Make sure to clamp the MSB so that we preserve the semantics of the - // original operations. - ClampMSB = true; - } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, - Srl_imm)) { - // If the shift result was truncated, we can still combine them. - Opd0 = Op0->getOperand(0).getOperand(0); - - // Use the type of SRL node. - VT = Opd0->getValueType(0); - } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { - Opd0 = Op0->getOperand(0); - } else if (BiggerPattern) { - // Let's pretend a 0 shift right has been performed. - // The resulting code will be at least as good as the original one - // plus it may expose more opportunities for bitfield insert pattern. - // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM - Opd0 = N->getOperand(0); - } else - return false; - - assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && - "bad amount in shift node!"); - - LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) - : CountTrailingOnes_64(And_imm)) - - 1; - if (ClampMSB) - // Since we're moving the extend before the right shift operation, we need - // to clamp the MSB to make sure we don't shift in undefined bits instead of - // the zeros which would get shifted in with the original right shift - // operation. - MSB = MSB > 31 ? 31 : MSB; - - Opc = VT == MVT::i32 ? 
ARM64::UBFMWri : ARM64::UBFMXri; - return true; -} - -static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB) { - // We are looking for the following pattern which basically extracts a single - // bit from the source value and places it in the LSB of the destination - // value, all other bits of the destination value or set to zero: - // - // Value2 = AND Value, MaskImm - // SRL Value2, ShiftImm - // - // with MaskImm >> ShiftImm == 1. - // - // This gets selected into a single UBFM: - // - // UBFM Value, ShiftImm, ShiftImm - // - - if (N->getOpcode() != ISD::SRL) - return false; - - uint64_t And_mask = 0; - if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) - return false; - - Opd0 = N->getOperand(0).getOperand(0); - - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) - return false; - - // Check whether we really have a one bit extract here. - if (And_mask >> Srl_imm == 0x1) { - if (N->getValueType(0) == MVT::i32) - Opc = ARM64::UBFMWri; - else - Opc = ARM64::UBFMXri; - - LSB = MSB = Srl_imm; - - return true; - } - - return false; -} - -static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, - bool BiggerPattern) { - assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && - "N must be a SHR/SRA operation to call this function"); - - EVT VT = N->getValueType(0); - - // Here we can test the type of VT and return false when the type does not - // match, but since it is done prior to that call in the current context - // we turned that into an assert to avoid redundant code. - assert((VT == MVT::i32 || VT == MVT::i64) && - "Type checking must have been done before calling this function"); - - // Check for AND + SRL doing a one bit extract. - if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) - return true; - - // we're looking for a shift of a shift - uint64_t Shl_imm = 0; - uint64_t Trunc_bits = 0; - if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { - Opd0 = N->getOperand(0).getOperand(0); - } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && - N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { - // We are looking for a shift of truncate. Truncate from i64 to i32 could - // be considered as setting high 32 bits as zero. Our strategy here is to - // always generate 64bit UBFM. This consistency will help the CSE pass - // later find more redundancy. - Opd0 = N->getOperand(0).getOperand(0); - Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); - VT = Opd0->getValueType(0); - assert(VT == MVT::i64 && "the promoted type should be i64"); - } else if (BiggerPattern) { - // Let's pretend a 0 shift left has been performed. - // FIXME: Currently we limit this to the bigger pattern case, - // because some optimizations expect AND and not UBFM - Opd0 = N->getOperand(0); - } else - return false; - - assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) - return false; - - assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && - "bad amount in shift node!"); - // Note: The width operand is encoded as width-1. - unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; - int sLSB = Srl_imm - Shl_imm; - if (sLSB < 0) - return false; - LSB = sLSB; - MSB = LSB + Width; - // SRA requires a signed extraction - if (VT == MVT::i32) - Opc = N->getOpcode() == ISD::SRA ? 
ARM64::SBFMWri : ARM64::UBFMWri; - else - Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMXri : ARM64::UBFMXri; - return true; -} - -static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, - SDValue &Opd0, unsigned &LSB, unsigned &MSB, - unsigned NumberOfIgnoredLowBits = 0, - bool BiggerPattern = false) { - if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) - return false; - - switch (N->getOpcode()) { - default: - if (!N->isMachineOpcode()) - return false; - break; - case ISD::AND: - return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, - NumberOfIgnoredLowBits, BiggerPattern); - case ISD::SRL: - case ISD::SRA: - return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); - } - - unsigned NOpc = N->getMachineOpcode(); - switch (NOpc) { - default: - return false; - case ARM64::SBFMWri: - case ARM64::UBFMWri: - case ARM64::SBFMXri: - case ARM64::UBFMXri: - Opc = NOpc; - Opd0 = N->getOperand(0); - LSB = cast(N->getOperand(1).getNode())->getZExtValue(); - MSB = cast(N->getOperand(2).getNode())->getZExtValue(); - return true; - } - // Unreachable - return false; -} - -SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { - unsigned Opc, LSB, MSB; - SDValue Opd0; - if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) - return nullptr; - - EVT VT = N->getValueType(0); - - // If the bit extract operation is 64bit but the original type is 32bit, we - // need to add one EXTRACT_SUBREG. - if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) { - SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), - CurDAG->getTargetConstant(MSB, MVT::i64)}; - - SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = - CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, - SDValue(BFM, 0), SubReg); - return Node; - } - - SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), - CurDAG->getTargetConstant(MSB, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); -} - -/// Does DstMask form a complementary pair with the mask provided by -/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, -/// this asks whether DstMask zeroes precisely those bits that will be set by -/// the other half. -static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, - unsigned NumberOfIgnoredHighBits, EVT VT) { - assert((VT == MVT::i32 || VT == MVT::i64) && - "i32 or i64 mask type expected!"); - unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; - - APInt SignificantDstMask = APInt(BitWidth, DstMask); - APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); - - return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && - (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); -} - -// Look for bits that will be useful for later uses. -// A bit is consider useless as soon as it is dropped and never used -// before it as been dropped. -// E.g., looking for useful bit of x -// 1. y = x & 0x7 -// 2. z = y >> 2 -// After #1, x useful bits are 0x7, then the useful bits of x, live through -// y. -// After #2, the useful bits of x are 0x4. -// However, if x is used on an unpredicatable instruction, then all its bits -// are useful. -// E.g. -// 1. y = x & 0x7 -// 2. z = y >> 2 -// 3. 
str x, [@x] -static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); - -static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(1).getNode())->getZExtValue(); - Imm = ARM64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); - UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); - getUsefulBits(Op, UsefulBits, Depth + 1); -} - -static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, - uint64_t Imm, uint64_t MSB, - unsigned Depth) { - // inherit the bitwidth value - APInt OpUsefulBits(UsefulBits); - OpUsefulBits = 1; - - if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); - --OpUsefulBits; - // The interesting part will be in the lower part of the result - getUsefulBits(Op, OpUsefulBits, Depth + 1); - // The interesting part was starting at Imm in the argument - OpUsefulBits = OpUsefulBits.shl(Imm); - } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); - --OpUsefulBits; - // The interesting part will be shifted in the result - OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); - getUsefulBits(Op, OpUsefulBits, Depth + 1); - // The interesting part was at zero in the argument - OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); - } - - UsefulBits &= OpUsefulBits; -} - -static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(1).getNode())->getZExtValue(); - uint64_t MSB = - cast(Op.getOperand(2).getNode())->getZExtValue(); - - getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); -} - -static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t ShiftTypeAndValue = - cast(Op.getOperand(2).getNode())->getZExtValue(); - APInt Mask(UsefulBits); - Mask.clearAllBits(); - Mask.flipAllBits(); - - if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSL) { - // Shift Left - uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.shl(ShiftAmt); - getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.lshr(ShiftAmt); - } else if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSR) { - // Shift Right - // We do not handle ARM64_AM::ASR, because the sign will change the - // number of useful bits - uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.lshr(ShiftAmt); - getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.shl(ShiftAmt); - } else - return; - - UsefulBits &= Mask; -} - -static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(2).getNode())->getZExtValue(); - uint64_t MSB = - cast(Op.getOperand(3).getNode())->getZExtValue(); - - if (Op.getOperand(1) == Orig) - return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); - - APInt OpUsefulBits(UsefulBits); - OpUsefulBits = 1; - - if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); - --OpUsefulBits; - UsefulBits &= ~OpUsefulBits; - getUsefulBits(Op, UsefulBits, Depth + 1); - } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); - --OpUsefulBits; - UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); - getUsefulBits(Op, UsefulBits, Depth + 1); - } -} - -static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, - SDValue Orig, unsigned Depth) { - - // Users of this node should have already been instruction selected - // FIXME: Can we turn that into an assert? 
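// Editorial aside -- not part of the patch. A standalone sketch of the
// backwards "useful bits" propagation described in the comment above: starting
// from the bits a user cares about, an AND keeps only the masked bits useful,
// and a logical right shift moves the useful window up. It reproduces the
// "y = x & 0x7; z = y >> 2" example from that comment; the helper names are
// illustrative only.
#include <cassert>
#include <cstdint>

// Bits of the AND input that can still influence users wanting UsefulBits.
static uint64_t usefulThroughAnd(uint64_t UsefulBits, uint64_t Mask) {
  return UsefulBits & Mask;
}

// Bits of the shift input that can still influence users wanting UsefulBits.
static uint64_t usefulThroughLshr(uint64_t UsefulBits, unsigned Amt) {
  return UsefulBits << Amt;
}

int main() {
  uint64_t Useful = ~uint64_t(0);         // every bit of z is (potentially) used
  Useful = usefulThroughLshr(Useful, 2);  // z = y >> 2  -> only bits >= 2 of y matter
  Useful = usefulThroughAnd(Useful, 0x7); // y = x & 0x7 -> only bit 2 of x matters
  assert(Useful == 0x4);                  // matches the comment: useful bits of x are 0x4
  return 0;
}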
- if (!UserNode->isMachineOpcode()) - return; - - switch (UserNode->getMachineOpcode()) { - default: - return; - case ARM64::ANDSWri: - case ARM64::ANDSXri: - case ARM64::ANDWri: - case ARM64::ANDXri: - // We increment Depth only when we call the getUsefulBits - return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, - Depth); - case ARM64::UBFMWri: - case ARM64::UBFMXri: - return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); - - case ARM64::ORRWrs: - case ARM64::ORRXrs: - if (UserNode->getOperand(1) != Orig) - return; - return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, - Depth); - case ARM64::BFMWri: - case ARM64::BFMXri: - return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); - } -} - -static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { - if (Depth >= 6) - return; - // Initialize UsefulBits - if (!Depth) { - unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); - // At the beginning, assume every produced bits is useful - UsefulBits = APInt(Bitwidth, 0); - UsefulBits.flipAllBits(); - } - APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); - - for (SDNode *Node : Op.getNode()->uses()) { - // A use cannot produce useful bits - APInt UsefulBitsForUse = APInt(UsefulBits); - getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); - UsersUsefulBits |= UsefulBitsForUse; - } - // UsefulBits contains the produced bits that are meaningful for the - // current definition, thus a user cannot make a bit meaningful at - // this point - UsefulBits &= UsersUsefulBits; -} - -/// Create a machine node performing a notional SHL of Op by ShlAmount. If -/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is -/// 0, return Op unchanged. -static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { - if (ShlAmount == 0) - return Op; - - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri; - - SDNode *ShiftNode; - if (ShlAmount > 0) { - // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt - ShiftNode = CurDAG->getMachineNode( - UBFMOpc, SDLoc(Op), VT, Op, - CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), - CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); - } else { - // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 - assert(ShlAmount < 0 && "expected right shift"); - int ShrAmount = -ShlAmount; - ShiftNode = CurDAG->getMachineNode( - UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), - CurDAG->getTargetConstant(BitWidth - 1, VT)); - } - - return SDValue(ShiftNode, 0); -} - -/// Does this tree qualify as an attempt to move a bitfield into position, -/// essentially "(and (shl VAL, N), Mask)". -static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, - SDValue &Src, int &ShiftAmount, - int &MaskWidth) { - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - (void)BitWidth; - assert(BitWidth == 32 || BitWidth == 64); - - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(Op, KnownZero, KnownOne); - - // Non-zero in the sense that they're not provably zero, which is the key - // point if we want to use this value - uint64_t NonZeroBits = (~KnownZero).getZExtValue(); - - // Discard a constant AND mask if present. It's safe because the node will - // already have been factored into the computeKnownBits calculation above. 
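// Editorial aside -- not part of the patch. A standalone sketch of the
// shifted-mask decomposition performed just below in isBitfieldPositioningOp:
// a value of the form (val << N) & Mask is a "move into position" candidate
// only if its not-provably-zero bits form one contiguous run, whose start and
// length give the shift amount and field width. Helper names are illustrative
// stand-ins for the LLVM bit-counting utilities used in the real code.
#include <cassert>
#include <cstdint>

static unsigned countTrailingZeros64(uint64_t V) {
  unsigned N = 0;
  for (; V && !(V & 1); V >>= 1) ++N;
  return N;
}

static unsigned countTrailingOnes64(uint64_t V) {
  unsigned N = 0;
  for (; V & 1; V >>= 1) ++N;
  return N;
}

// A shifted mask is a single contiguous run of ones, e.g. 0x0ff0 but not 0x0f0f.
static bool isShiftedMask64(uint64_t V) {
  if (V == 0) return false;
  uint64_t Run = V >> countTrailingZeros64(V); // drop the trailing zeros
  return (Run & (Run + 1)) == 0;               // remaining bits form one solid run
}

int main() {
  uint64_t NonZeroBits = 0x0ff0; // only bits 4..11 may be set
  assert(isShiftedMask64(NonZeroBits));
  unsigned ShiftAmount = countTrailingZeros64(NonZeroBits);
  unsigned MaskWidth = countTrailingOnes64(NonZeroBits >> ShiftAmount);
  assert(ShiftAmount == 4 && MaskWidth == 8);
  assert(!isShiftedMask64(0x0f0f)); // two runs: not a single bitfield
  return 0;
}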
- uint64_t AndImm; - if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { - assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); - Op = Op.getOperand(0); - } - - uint64_t ShlImm; - if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) - return false; - Op = Op.getOperand(0); - - if (!isShiftedMask_64(NonZeroBits)) - return false; - - ShiftAmount = countTrailingZeros(NonZeroBits); - MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); - - // BFI encompasses sufficiently many nodes that it's worth inserting an extra - // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. - Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); - - return true; -} - -// Given a OR operation, check if we have the following pattern -// ubfm c, b, imm, imm2 (or something that does the same jobs, see -// isBitfieldExtractOp) -// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and -// countTrailingZeros(mask2) == imm2 - imm + 1 -// f = d | c -// if yes, given reference arguments will be update so that one can replace -// the OR instruction with: -// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, - SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { - assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); - - // Set Opc - EVT VT = N->getValueType(0); - if (VT == MVT::i32) - Opc = ARM64::BFMWri; - else if (VT == MVT::i64) - Opc = ARM64::BFMXri; - else - return false; - - // Because of simplify-demanded-bits in DAGCombine, involved masks may not - // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); - - unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); - unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { - unsigned BFXOpc; - int DstLSB, Width; - if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { - // Check that the returned opcode is compatible with the pattern, - // i.e., same type and zero extended (U and not S) - if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) || - (BFXOpc != ARM64::UBFMWri && VT == MVT::i32)) - continue; - - // Compute the width of the bitfield insertion - DstLSB = 0; - Width = ImmS - ImmR + 1; - // FIXME: This constraint is to catch bitfield insertion we may - // want to widen the pattern if we want to grab general bitfied - // move case - if (Width <= 0) - continue; - - // If the mask on the insertee is correct, we have a BFXIL operation. We - // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { - ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); - ImmS = Width - 1; - } else - continue; - - // Check the second part of the pattern - EVT VT = OrOpd1->getValueType(0); - assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); - - // Compute the Known Zero for the candidate of the first operand. - // This allows to catch more general case than just looking for - // AND with imm. 
Indeed, simplify-demanded-bits may have removed - // the AND instruction because it proves it was useless. - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); - - // Check if there is enough room for the second operand to appear - // in the first one - APInt BitsToBeInserted = - APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); - - if ((BitsToBeInserted & ~KnownZero) != 0) - continue; - - // Set the first operand - uint64_t Imm; - if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && - isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) - // In that case, we can eliminate the AND - Dst = OrOpd1->getOperand(0); - else - // Maybe the AND has been removed by simplify-demanded-bits - // or is useful because it discards more bits - Dst = OrOpd1Val; - - // both parts match - return true; - } - - return false; -} - -SDNode *ARM64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { - if (N->getOpcode() != ISD::OR) - return nullptr; - - unsigned Opc; - unsigned LSB, MSB; - SDValue Opd0, Opd1; - - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) - return nullptr; - - EVT VT = N->getValueType(0); - SDValue Ops[] = { Opd0, - Opd1, - CurDAG->getTargetConstant(LSB, VT), - CurDAG->getTargetConstant(MSB, VT) }; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectLIBM(SDNode *N) { - EVT VT = N->getValueType(0); - unsigned Variant; - unsigned Opc; - unsigned FRINTXOpcs[] = { ARM64::FRINTXSr, ARM64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { ARM64::FRINTPSr, ARM64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { ARM64::FRINTMSr, ARM64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { ARM64::FRINTZSr, ARM64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { ARM64::FRINTASr, ARM64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } - - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector Ops; - Ops.push_back(In); - - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } - - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -bool -ARM64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth) { - APFloat FVal(0.0); - if (ConstantFPSDNode *CN = dyn_cast(N)) - FVal = CN->getValueAPF(); - else if (LoadSDNode *LN = dyn_cast(N)) { - // Some otherwise illegal constants are allowed in this case. - if (LN->getOperand(1).getOpcode() != ARM64ISD::ADDlow || - !isa(LN->getOperand(1)->getOperand(1))) - return false; - - ConstantPoolSDNode *CN = - dyn_cast(LN->getOperand(1)->getOperand(1)); - FVal = cast(CN->getConstVal())->getValueAPF(); - } else - return false; - - // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits - // is between 1 and 32 for a destination w-register, or 1 and 64 for an - // x-register. 
- // - // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we - // want THIS_NODE to be 2^fbits. This is much easier to deal with using - // integers. - bool IsExact; - - // fbits is between 1 and 64 in the worst-case, which means the fmul - // could have 2^64 as an actual operand. Need 65 bits of precision. - APSInt IntVal(65, true); - FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); - - // N.b. isPowerOf2 also checks for > 0. - if (!IsExact || !IntVal.isPowerOf2()) return false; - unsigned FBits = IntVal.logBase2(); - - // Checks above should have guaranteed that we haven't lost information in - // finding FBits, but it must still be in range. - if (FBits == 0 || FBits > RegWidth) return false; - - FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32); - return true; -} - -SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) { - // Dump information about the Node being selected - DEBUG(errs() << "Selecting: "); - DEBUG(Node->dump(CurDAG)); - DEBUG(errs() << "\n"); - - // If we have a custom node, we already have selected! - if (Node->isMachineOpcode()) { - DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); - Node->setNodeId(-1); - return nullptr; - } - - // Few custom selection stuff. - SDNode *ResNode = nullptr; - EVT VT = Node->getValueType(0); - - switch (Node->getOpcode()) { - default: - break; - - case ISD::ADD: - if (SDNode *I = SelectMLAV64LaneV128(Node)) - return I; - break; - - case ISD::LOAD: { - // Try to select as an indexed load. Fall through to normal processing - // if we can't. - bool Done = false; - SDNode *I = SelectIndexedLoad(Node, Done); - if (Done) - return I; - break; - } - - case ISD::SRL: - case ISD::AND: - case ISD::SRA: - if (SDNode *I = SelectBitfieldExtractOp(Node)) - return I; - break; - - case ISD::OR: - if (SDNode *I = SelectBitfieldInsertOp(Node)) - return I; - break; - - case ISD::EXTRACT_VECTOR_ELT: { - // Extracting lane zero is a special case where we can just use a plain - // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for - // the rest of the compiler, especially the register allocator and copyi - // propagation, to reason about, so is preferred when it's possible to - // use it. - ConstantSDNode *LaneNode = cast(Node->getOperand(1)); - // Bail and use the default Select() for non-zero lanes. - if (LaneNode->getZExtValue() != 0) - break; - // If the element type is not the same as the result type, likewise - // bail and use the default Select(), as there's more to do than just - // a cross-class COPY. This catches extracts of i8 and i16 elements - // since they will need an explicit zext. - if (VT != Node->getOperand(0).getValueType().getVectorElementType()) - break; - unsigned SubReg; - switch (Node->getOperand(0) - .getValueType() - .getVectorElementType() - .getSizeInBits()) { - default: - assert(0 && "Unexpected vector element type!"); - case 64: - SubReg = ARM64::dsub; - break; - case 32: - SubReg = ARM64::ssub; - break; - case 16: // FALLTHROUGH - case 8: - llvm_unreachable("unexpected zext-requiring extract element!"); - } - SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, - Node->getOperand(0)); - DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); - DEBUG(Extract->dumpr(CurDAG)); - DEBUG(dbgs() << "\n"); - return Extract.getNode(); - } - case ISD::Constant: { - // Materialize zero constants as copies from WZR/XZR. This allows - // the coalescer to propagate these into other instructions. 
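// Sketch (editor's illustration, not part of the patch): stepping back to
// SelectCVTFixedPosOperand above, the fmul operand is accepted only when it
// is exactly 2^fbits with 1 <= fbits <= RegWidth, so that
// (fp_to_[su]int (fmul Val, 2^fbits)) can use the fixed-point FCVT forms.
// A standalone restatement of that acceptance test, using std::frexp in
// place of APFloat/APSInt; the helper is the editor's.
#include <cmath>
#include <optional>

static std::optional<unsigned> fixedPointScaleBits(double Scale,
                                                   unsigned RegWidth) {
  int Exp = 0;
  double Mant = std::frexp(Scale, &Exp); // Scale == Mant * 2^Exp, |Mant| in [0.5, 1)
  if (Mant != 0.5)                       // not a positive power of two
    return std::nullopt;
  int FBits = Exp - 1;                   // Scale == 2^FBits
  if (FBits < 1 || static_cast<unsigned>(FBits) > RegWidth)
    return std::nullopt;                 // outside the 1..RegWidth range
  return static_cast<unsigned>(FBits);   // e.g. (65536.0, 32) yields 16
}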
- ConstantSDNode *ConstNode = cast(Node); - if (ConstNode->isNullValue()) { - if (VT == MVT::i32) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - ARM64::WZR, MVT::i32).getNode(); - else if (VT == MVT::i64) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - ARM64::XZR, MVT::i64).getNode(); - } - break; - } - - case ISD::FrameIndex: { - // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. - int FI = cast(Node)->getIndex(); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - const TargetLowering *TLI = getTargetLowering(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), - CurDAG->getTargetConstant(Shifter, MVT::i32) }; - return CurDAG->SelectNodeTo(Node, ARM64::ADDXri, MVT::i64, Ops); - } - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_ldaxp: - case Intrinsic::arm64_ldxp: { - unsigned Op = - IntNo == Intrinsic::arm64_ldaxp ? ARM64::LDAXPX : ARM64::LDXPX; - SDValue MemAddr = Node->getOperand(2); - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - - SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, - MVT::Other, MemAddr, Chain); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - cast(Ld)->setMemRefs(MemOp, MemOp + 1); - return Ld; - } - case Intrinsic::arm64_stlxp: - case Intrinsic::arm64_stxp: { - unsigned Op = - IntNo == Intrinsic::arm64_stlxp ? ARM64::STLXPX : ARM64::STXPX; - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - SDValue ValLo = Node->getOperand(2); - SDValue ValHi = Node->getOperand(3); - SDValue MemAddr = Node->getOperand(4); - - // Place arguments in the right order. - SmallVector Ops; - Ops.push_back(ValLo); - Ops.push_back(ValHi); - Ops.push_back(MemAddr); - Ops.push_back(Chain); - - SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); - // Transfer memoperands. 
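// Sketch (editor's illustration, not part of the patch): the exclusive-pair
// and structure load/store selections in this hunk repeat the same three
// lines to copy the memory operand onto the freshly created MachineSDNode.
// A small helper factoring that out, written against the same API the
// deleted code already uses (allocateMemRefsArray / setMemRefs of this LLVM
// vintage); the helper name is the editor's, and it assumes the surrounding
// file's includes and 'using namespace llvm'.
static void transferSingleMemOperand(MachineFunction *MF, SDNode *MemNode,
                                     SDNode *NewNode) {
  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
  MemOp[0] = cast<MemIntrinsicSDNode>(MemNode)->getMemOperand();
  cast<MachineSDNode>(NewNode)->setMemRefs(MemOp, MemOp + 1);
}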
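// Note (editor's sketch, not part of the patch): the long if/else chains
// that follow (ld1x2/ld2/.../st4 plus their lane and post-indexed variants)
// all do the same job: map the vector value type onto one opcode per
// register shape (8b, 16b, 4h, 8h, 2s, 4s, 1d, 2d), with the integer and
// float types of a shape sharing an opcode and the lane forms keyed only on
// the element size. A hedged sketch of that dispatch written as a table,
// reusing the LD2 opcodes the deleted code names; the helper would live
// inside this file and is illustrative only.
namespace {
struct ShapeOpcode {
  MVT::SimpleValueType IntVT, FPVT; // the two MVTs that share one shape
  unsigned Opc;
};
} // namespace

static unsigned pickLD2Opcode(EVT VT) {
  static const ShapeOpcode Map[] = {
      {MVT::v8i8,  MVT::v8i8,  ARM64::LD2Twov8b},
      {MVT::v16i8, MVT::v16i8, ARM64::LD2Twov16b},
      {MVT::v4i16, MVT::v4i16, ARM64::LD2Twov4h},
      {MVT::v8i16, MVT::v8i16, ARM64::LD2Twov8h},
      {MVT::v2i32, MVT::v2f32, ARM64::LD2Twov2s},
      {MVT::v4i32, MVT::v4f32, ARM64::LD2Twov4s},
      {MVT::v1i64, MVT::v1f64, ARM64::LD1Twov1d}, // single d-reg: LD1 form
      {MVT::v2i64, MVT::v2f64, ARM64::LD2Twov2d},
  };
  for (const ShapeOpcode &E : Map)
    if (VT == E.IntVT || VT == E.FPVT)
      return E.Opc;
  return 0; // unknown shape: let the default selector handle it
}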
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; - } - case Intrinsic::arm64_neon_ld1x2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD1Twov8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD1Twov16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD1Twov4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD1Twov8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD1Twov2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD1Twov4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, ARM64::LD1Twov2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld1x3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD1Threev8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD1Threev16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD1Threev4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD1Threev8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD1Threev2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD1Threev4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD1Threev2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld1x4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD1Fourv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD1Fourv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD1Fourv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD1Fourv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD1Fourv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD1Fourv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD2Twov8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD2Twov16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD2Twov4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD2Twov8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD2Twov2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD2Twov4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, 
ARM64::LD2Twov2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD3Threev8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD3Threev16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD3Threev4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD3Threev8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD3Threev2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD3Threev4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD3Threev2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD4Fourv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD4Fourv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD4Fourv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD4Fourv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD4Fourv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD4Fourv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD4Fourv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD2Rv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD2Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD2Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD2Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD2Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD2Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD2Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, ARM64::LD2Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld3r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD3Rv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD3Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD3Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD3Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD3Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD3Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD3Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD3Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld4r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD4Rv8b, ARM64::dsub0); - else if (VT == 
MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD4Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD4Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD4Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD4Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD4Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD4Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD4Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 2, ARM64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 2, ARM64::LD2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 2, ARM64::LD2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 2, ARM64::LD2i64); - break; - case Intrinsic::arm64_neon_ld3lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 3, ARM64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 3, ARM64::LD3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 3, ARM64::LD3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 3, ARM64::LD3i64); - break; - case Intrinsic::arm64_neon_ld4lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 4, ARM64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 4, ARM64::LD4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 4, ARM64::LD4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 4, ARM64::LD4i64); - break; - } - } break; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_tbl2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBLv8i8Two - : ARM64::TBLv16i8Two, - false); - case Intrinsic::arm64_neon_tbl3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBLv8i8Three - : ARM64::TBLv16i8Three, - false); - case Intrinsic::arm64_neon_tbl4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBLv8i8Four - : ARM64::TBLv16i8Four, - false); - case Intrinsic::arm64_neon_tbx2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBXv8i8Two - : ARM64::TBXv16i8Two, - true); - case Intrinsic::arm64_neon_tbx3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBXv8i8Three - : ARM64::TBXv16i8Three, - true); - case Intrinsic::arm64_neon_tbx4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? 
ARM64::TBXv8i8Four - : ARM64::TBXv16i8Four, - true); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) - return N; - break; - } - break; - } - case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); - if (Node->getNumOperands() >= 3) - VT = Node->getOperand(2)->getValueType(0); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_st1x2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, ARM64::ST1Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, ARM64::ST1Twov16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 2, ARM64::ST1Twov4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 2, ARM64::ST1Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, ARM64::ST1Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, ARM64::ST1Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, ARM64::ST1Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, ARM64::ST1Twov1d); - break; - } - case Intrinsic::arm64_neon_st1x3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, ARM64::ST1Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, ARM64::ST1Threev16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 3, ARM64::ST1Threev4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 3, ARM64::ST1Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, ARM64::ST1Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, ARM64::ST1Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, ARM64::ST1Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, ARM64::ST1Threev1d); - break; - } - case Intrinsic::arm64_neon_st1x4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, ARM64::ST1Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, ARM64::ST1Fourv16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 4, ARM64::ST1Fourv4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 4, ARM64::ST1Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, ARM64::ST1Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, ARM64::ST1Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, ARM64::ST1Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, ARM64::ST1Fourv1d); - break; - } - case Intrinsic::arm64_neon_st2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, ARM64::ST2Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, ARM64::ST2Twov16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 2, ARM64::ST2Twov4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 2, ARM64::ST2Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, ARM64::ST2Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, ARM64::ST2Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, ARM64::ST2Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, ARM64::ST1Twov1d); - break; - } - case Intrinsic::arm64_neon_st3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, 
ARM64::ST3Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, ARM64::ST3Threev16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 3, ARM64::ST3Threev4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 3, ARM64::ST3Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, ARM64::ST3Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, ARM64::ST3Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, ARM64::ST3Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, ARM64::ST1Threev1d); - break; - } - case Intrinsic::arm64_neon_st4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, ARM64::ST4Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, ARM64::ST4Fourv16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 4, ARM64::ST4Fourv4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 4, ARM64::ST4Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, ARM64::ST4Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, ARM64::ST4Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, ARM64::ST4Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, ARM64::ST1Fourv1d); - break; - } - case Intrinsic::arm64_neon_st2lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 2, ARM64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 2, ARM64::ST2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 2, ARM64::ST2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 2, ARM64::ST2i64); - break; - } - case Intrinsic::arm64_neon_st3lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 3, ARM64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 3, ARM64::ST3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 3, ARM64::ST3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 3, ARM64::ST3i64); - break; - } - case Intrinsic::arm64_neon_st4lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 4, ARM64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 4, ARM64::ST4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 4, ARM64::ST4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 4, ARM64::ST4i64); - break; - } - } - } - case ARM64ISD::LD2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD2Twov8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD2Twov16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD2Twov4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD2Twov8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD2Twov2s_POST, 
ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD2Twov4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD2Twov2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD3Threev8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD3Threev16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 3, ARM64::LD3Threev4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, ARM64::LD3Threev8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD3Threev2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD3Threev4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD3Threev2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD1Twov8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD1Twov16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD1Twov4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD1Twov8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD1Twov2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD1Twov4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD1Threev8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD1Threev16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 3, ARM64::LD1Threev4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, 
ARM64::LD1Threev8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD1Threev2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD1Threev4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 1, ARM64::LD1Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 1, ARM64::LD1Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 1, ARM64::LD1Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 1, ARM64::LD1Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 1, ARM64::LD1Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 1, ARM64::LD1Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 1, ARM64::LD1Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 1, ARM64::LD1Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD2DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD2Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD2Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD2Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD2Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD2Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD2Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD2Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD2Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD3DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD3Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD3Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 
3, ARM64::LD3Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, ARM64::LD3Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD3Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD3Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD3Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD3Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD4DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD4Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD4Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD4Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD4Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD4Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD4Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD4Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD4Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 1, ARM64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 1, ARM64::LD1i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 1, ARM64::LD1i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 1, ARM64::LD1i64_POST); - break; - } - case ARM64ISD::LD2LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 2, ARM64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 2, ARM64::LD2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 2, ARM64::LD2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 2, ARM64::LD2i64_POST); - break; - } - case ARM64ISD::LD3LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 3, ARM64::LD3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 3, ARM64::LD3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 3, ARM64::LD3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 3, ARM64::LD3i64_POST); - break; - } - case ARM64ISD::LD4LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 4, ARM64::LD4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 4, ARM64::LD4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 4, 
ARM64::LD4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 4, ARM64::LD4i64_POST); - break; - } - case ARM64ISD::ST2post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, ARM64::ST2Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, ARM64::ST2Twov16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 2, ARM64::ST2Twov4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 2, ARM64::ST2Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, ARM64::ST2Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, ARM64::ST2Twov4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, ARM64::ST2Twov2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); - break; - } - case ARM64ISD::ST3post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, ARM64::ST3Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, ARM64::ST3Threev16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 3, ARM64::ST3Threev4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 3, ARM64::ST3Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, ARM64::ST3Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, ARM64::ST3Threev4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, ARM64::ST3Threev2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); - break; - } - case ARM64ISD::ST4post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, ARM64::ST4Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, ARM64::ST4Fourv16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 4, ARM64::ST4Fourv4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 4, ARM64::ST4Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, ARM64::ST4Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, ARM64::ST4Fourv4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, ARM64::ST4Fourv2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); - break; - } - case ARM64ISD::ST1x2post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, ARM64::ST1Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, ARM64::ST1Twov16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 2, ARM64::ST1Twov4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 2, ARM64::ST1Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, ARM64::ST1Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, ARM64::ST1Twov4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); - else if (VT == MVT::v2i64 || VT == 
MVT::v2f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov2d_POST); - break; - } - case ARM64ISD::ST1x3post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, ARM64::ST1Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, ARM64::ST1Threev16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 3, ARM64::ST1Threev4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 3, ARM64::ST1Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, ARM64::ST1Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, ARM64::ST1Threev4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev2d_POST); - break; - } - case ARM64ISD::ST1x4post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, ARM64::ST1Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, ARM64::ST1Fourv16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 4, ARM64::ST1Fourv4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 4, ARM64::ST1Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, ARM64::ST1Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, ARM64::ST1Fourv4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv2d_POST); - break; - } - case ARM64ISD::ST2LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 2, ARM64::ST2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 2, ARM64::ST2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 2, ARM64::ST2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 2, ARM64::ST2i64_POST); - break; - } - case ARM64ISD::ST3LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 3, ARM64::ST3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 3, ARM64::ST3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 3, ARM64::ST3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 3, ARM64::ST3i64_POST); - break; - } - case ARM64ISD::ST4LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 4, ARM64::ST4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 4, ARM64::ST4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 4, ARM64::ST4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return 
SelectPostStoreLane(Node, 4, ARM64::ST4i64_POST); - break; - } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; - } - - // Select the default instruction - ResNode = SelectCode(Node); - - DEBUG(errs() << "=> "); - if (ResNode == nullptr || ResNode == Node) - DEBUG(Node->dump(CurDAG)); - else - DEBUG(ResNode->dump(CurDAG)); - DEBUG(errs() << "\n"); - - return ResNode; -} - -/// createARM64ISelDag - This pass converts a legalized DAG into a -/// ARM64-specific DAG, ready for instruction scheduling. -FunctionPass *llvm::createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new ARM64DAGToDAGISel(TM, OptLevel); -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp deleted file mode 100644 index 538360cf39dc..000000000000 --- a/lib/Target/ARM64/ARM64ISelLowering.cpp +++ /dev/null @@ -1,7891 +0,0 @@ -//===-- ARM64ISelLowering.cpp - ARM64 DAG Lowering Implementation --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64TargetLowering class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64ISelLowering.h" -#include "ARM64PerfectShuffle.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" -#include "ARM64TargetObjectFile.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-lower" - -STATISTIC(NumTailCalls, "Number of tail calls"); -STATISTIC(NumShiftInserts, "Number of vector shift inserts"); - -enum AlignMode { - StrictAlign, - NoStrictAlign -}; - -static cl::opt -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "arm64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "arm64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - -// Place holder until extr generation is tested fully. -static cl::opt -EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden, - cl::desc("Allow ARM64 (or (shift)(shift))->extract"), - cl::init(true)); - -static cl::opt -EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden, - cl::desc("Allow ARM64 SLI/SRI formation"), - cl::init(false)); - -//===----------------------------------------------------------------------===// -// ARM64 Lowering public interface. 
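// Note (editor's reconstruction, not verbatim patch text): the angle-bracketed
// template arguments appear to have been dropped from this capture of the
// diff (e.g. "static cl::opt", "SmallVector Ops;", "cast(Node)",
// "TM.getSubtarget()"). For the three command-line options just above, the
// declarations were presumably of the following shape:
static cl::opt<AlignMode>
Align(cl::desc("Load/store alignment support"),
      cl::Hidden, cl::init(NoStrictAlign),
      cl::values(
          clEnumValN(StrictAlign, "arm64-strict-align",
                     "Disallow all unaligned memory accesses"),
          clEnumValN(NoStrictAlign, "arm64-no-strict-align",
                     "Allow unaligned memory accesses"),
          clEnumValEnd));

static cl::opt<bool>
EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden,
                          cl::desc("Allow ARM64 (or (shift)(shift))->extract"),
                          cl::init(true));

static cl::opt<bool>
EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden,
                         cl::desc("Allow ARM64 SLI/SRI formation"),
                         cl::init(false));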
-//===----------------------------------------------------------------------===// -static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { - if (TM.getSubtarget().isTargetDarwin()) - return new ARM64_MachoTargetObjectFile(); - - return new ARM64_ELFTargetObjectFile(); -} - -ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { - Subtarget = &TM.getSubtarget(); - - // ARM64 doesn't have comparisons which set GPRs or setcc instructions, so - // we have to make something up. Arbitrarily, choose ZeroOrOne. - setBooleanContents(ZeroOrOneBooleanContent); - // When comparing vectors the result sets the different elements in the - // vector to all-one or all-zero. - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - // Set up the register classes. - addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass); - addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass); - - if (Subtarget->hasFPARMv8()) { - addRegisterClass(MVT::f16, &ARM64::FPR16RegClass); - addRegisterClass(MVT::f32, &ARM64::FPR32RegClass); - addRegisterClass(MVT::f64, &ARM64::FPR64RegClass); - addRegisterClass(MVT::f128, &ARM64::FPR128RegClass); - } - - if (Subtarget->hasNEON()) { - addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass); - addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass); - // Someone set us up the NEON. - addDRTypeForNEON(MVT::v2f32); - addDRTypeForNEON(MVT::v8i8); - addDRTypeForNEON(MVT::v4i16); - addDRTypeForNEON(MVT::v2i32); - addDRTypeForNEON(MVT::v1i64); - addDRTypeForNEON(MVT::v1f64); - - addQRTypeForNEON(MVT::v4f32); - addQRTypeForNEON(MVT::v2f64); - addQRTypeForNEON(MVT::v16i8); - addQRTypeForNEON(MVT::v8i16); - addQRTypeForNEON(MVT::v4i32); - addQRTypeForNEON(MVT::v2i64); - } - - // Compute derived properties from the register classes - computeRegisterProperties(); - - // Provide all sorts of operation actions - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::f64, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Custom); - setOperationAction(ISD::BR_CC, MVT::i64, Custom); - setOperationAction(ISD::BR_CC, MVT::f32, Custom); - setOperationAction(ISD::BR_CC, MVT::f64, Custom); - setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT, MVT::i64, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::JumpTable, MVT::i64, Custom); - - setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); - - setOperationAction(ISD::FREM, MVT::f32, Expand); - setOperationAction(ISD::FREM, MVT::f64, Expand); - setOperationAction(ISD::FREM, MVT::f80, Expand); - - // Custom lowering hooks are needed for XOR - // to fold it into CSINC/CSINV. 
- setOperationAction(ISD::XOR, MVT::i32, Custom); - setOperationAction(ISD::XOR, MVT::i64, Custom); - - // Virtually no operation on f128 is legal, but LLVM can't expand them when - // there's a valid register class, so we need custom operations in most cases. - setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); - setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, Custom); - setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, Custom); - setOperationAction(ISD::FNEG, MVT::f128, Expand); - setOperationAction(ISD::FPOW, MVT::f128, Expand); - setOperationAction(ISD::FREM, MVT::f128, Expand); - setOperationAction(ISD::FRINT, MVT::f128, Expand); - setOperationAction(ISD::FSIN, MVT::f128, Expand); - setOperationAction(ISD::FSINCOS, MVT::f128, Expand); - setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Custom); - setOperationAction(ISD::FTRUNC, MVT::f128, Expand); - setOperationAction(ISD::SETCC, MVT::f128, Custom); - setOperationAction(ISD::BR_CC, MVT::f128, Custom); - setOperationAction(ISD::SELECT, MVT::f128, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); - - // Lowering for many of the conversions is actually specified by the non-f128 - // type. The LowerXXX function will be trivial when f128 isn't involved. - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - - // Variable arguments. - setOperationAction(ISD::VASTART, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::Other, Custom); - setOperationAction(ISD::VACOPY, MVT::Other, Custom); - setOperationAction(ISD::VAEND, MVT::Other, Expand); - - // Variable-sized objects. - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? - setExceptionPointerRegister(ARM64::X0); - setExceptionSelectorRegister(ARM64::X1); - - // Constant pool entries - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); - - // BlockAddress - setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - - // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 
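// Example (editor's illustration, not part of the patch): the ADDC/ADDE and
// SUBC/SUBE actions on the next lines chain the NZCV flags between the
// halves of a multi-word addition, while the S/UADDO-style custom lowerings
// further down back the overflow-checked builtins. Two hedged C-level
// snippets that reach these paths; the exact ADDS/ADCS/CSET-style sequences
// are up to the lowering and are not promised here.
#include <stdbool.h>

unsigned __int128 add_u128(unsigned __int128 a, unsigned __int128 b) {
  return a + b;   // 64-bit halves chained through the carry flag
}

bool add_overflows(int a, int b, int *sum) {
  // Lowered via the signed add-with-overflow intrinsic (ISD::SADDO for i32).
  return __builtin_sadd_overflow(a, b, sum);
}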
- setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); - setOperationAction(ISD::ADDC, MVT::i64, Custom); - setOperationAction(ISD::ADDE, MVT::i64, Custom); - setOperationAction(ISD::SUBC, MVT::i64, Custom); - setOperationAction(ISD::SUBE, MVT::i64, Custom); - - // ARM64 lacks both left-rotate and popcount instructions. - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - - // ARM64 doesn't have {U|S}MUL_LOHI. - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - - - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which ARM64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - - setOperationAction(ISD::CTPOP, MVT::i32, Custom); - setOperationAction(ISD::CTPOP, MVT::i64, Custom); - - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - // Custom lower Add/Sub/Mul with overflow. - setOperationAction(ISD::SADDO, MVT::i32, Custom); - setOperationAction(ISD::SADDO, MVT::i64, Custom); - setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::UADDO, MVT::i64, Custom); - setOperationAction(ISD::SSUBO, MVT::i32, Custom); - setOperationAction(ISD::SSUBO, MVT::i64, Custom); - setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i64, Custom); - setOperationAction(ISD::SMULO, MVT::i32, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); - setOperationAction(ISD::UMULO, MVT::i32, Custom); - setOperationAction(ISD::UMULO, MVT::i64, Custom); - - setOperationAction(ISD::FSIN, MVT::f32, Expand); - setOperationAction(ISD::FSIN, MVT::f64, Expand); - setOperationAction(ISD::FCOS, MVT::f32, Expand); - setOperationAction(ISD::FCOS, MVT::f64, Expand); - setOperationAction(ISD::FPOW, MVT::f32, Expand); - setOperationAction(ISD::FPOW, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); - - // ARM64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; - for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { - MVT Ty = RoundingTypes[I]; - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - } - - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); - - if (Subtarget->isTargetMachO()) { - // For iOS, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. 
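// Example (editor's illustration, not part of the patch): the shape the
// FSINCOS comment above is about, namely sin and cos of the same value,
// which the custom MachO lowering can merge into a single __sincos_stret
// call instead of two libcalls. Whether the merge fires depends on the
// usual errno/fast-math rules; this is only a hedged sketch of the source
// pattern.
#include <math.h>

void polar_to_xy(float r, float theta, float *x, float *y) {
  *x = r * cosf(theta);
  *y = r * sinf(theta);   // same argument as the cosf above
}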
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } - - // ARM64 does not have floating-point extending loads, i1 sign-extending load, - // floating-point truncating stores, or v2i32->v2i16 truncating store. - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f128, MVT::f80, Expand); - setTruncStoreAction(MVT::f128, MVT::f64, Expand); - setTruncStoreAction(MVT::f128, MVT::f32, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); - // Indexed loads and stores are supported. - for (unsigned im = (unsigned)ISD::PRE_INC; - im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, MVT::i8, Legal); - setIndexedLoadAction(im, MVT::i16, Legal); - setIndexedLoadAction(im, MVT::i32, Legal); - setIndexedLoadAction(im, MVT::i64, Legal); - setIndexedLoadAction(im, MVT::f64, Legal); - setIndexedLoadAction(im, MVT::f32, Legal); - setIndexedStoreAction(im, MVT::i8, Legal); - setIndexedStoreAction(im, MVT::i16, Legal); - setIndexedStoreAction(im, MVT::i32, Legal); - setIndexedStoreAction(im, MVT::i64, Legal); - setIndexedStoreAction(im, MVT::f64, Legal); - setIndexedStoreAction(im, MVT::f32, Legal); - } - - // Trap. - setOperationAction(ISD::TRAP, MVT::Other, Legal); - - // We combine OR nodes for bitfield operations. - setTargetDAGCombine(ISD::OR); - - // Vector add and sub nodes may conceal a high-half opportunity. - // Also, try to fold ADD into CSINC/CSINV.. 
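// Example (editor's illustration, not part of the patch): the
// setIndexedLoadAction/setIndexedStoreAction calls above mark pre/post-
// indexed addressing as Legal, which lets the generic combines fold a
// "load, then bump the pointer" idiom (and the *post structure load/store
// nodes selected earlier in this patch) into a single write-back access.
// A hedged C-level illustration of the idiom:
#include <stdint.h>

int64_t sum_i64(const int64_t *p, int n) {
  int64_t s = 0;
  for (int i = 0; i < n; ++i)
    s += *p++;   // candidate for a post-indexed LDR Xt, [Xn], #8
  return s;
}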
- setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); - - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::MUL); - - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::VSELECT); - - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - - MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; - MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; - MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; - - setStackPointerRegisterToSaveRestore(ARM64::SP); - - setSchedulingPreference(Sched::Hybrid); - - // Enable TBZ/TBNZ - MaskAndBranchFoldingIsLegal = true; - - setMinFunctionAlignment(2); - - RequireStrictAlign = (Align == StrictAlign); - - setHasExtractBitsInsn(true); - - if (Subtarget->hasNEON()) { - // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to - // silliness like this: - setOperationAction(ISD::FABS, MVT::v1f64, Expand); - setOperationAction(ISD::FADD, MVT::v1f64, Expand); - setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); - setOperationAction(ISD::FCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FDIV, MVT::v1f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); - setOperationAction(ISD::FMA, MVT::v1f64, Expand); - setOperationAction(ISD::FMUL, MVT::v1f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); - setOperationAction(ISD::FNEG, MVT::v1f64, Expand); - setOperationAction(ISD::FPOW, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FROUND, MVT::v1f64, Expand); - setOperationAction(ISD::FRINT, MVT::v1f64, Expand); - setOperationAction(ISD::FSIN, MVT::v1f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); - setOperationAction(ISD::FSUB, MVT::v1f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); - setOperationAction(ISD::SETCC, MVT::v1f64, Expand); - setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - - setOperationAction(ISD::MUL, MVT::v1i64, Expand); - - // ARM64 doesn't have a direct vector ->f32 conversion instructions for - // elements smaller than i32, so promote the input to i32 first. - setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); - // Similarly, there is no direct i32 -> f64 vector conversion instruction. 
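// Example (editor's illustration, not part of the patch): "promote the input
// to i32 first" above means there is no direct small-integer-to-float vector
// convert, so a v4i16 source is widened to v4i32 before the SCVTF/UCVTF-style
// conversion. A hedged illustration with the corresponding ACLE intrinsics
// spelling out both steps:
#include <arm_neon.h>

float32x4_t u16_lanes_to_f32(uint16x4_t v) {
  uint32x4_t wide = vmovl_u16(v);   // the promotion the comment refers to
  return vcvtq_f32_u32(wide);       // the actual int to float conversion
}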
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); - - // ARM64 doesn't have MUL.2d: - setOperationAction(ISD::MUL, MVT::v2i64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); - // Likewise, narrowing and extending vector loads/stores aren't handled - // directly. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Expand); - - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); - } - - // ARM64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; - for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { - MVT Ty = RoundingVecTypes[I]; - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - } - } -} - -void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { - if (VT == MVT::v2f32) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); - - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); - } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); - - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); - } - - // Mark vector float intrinsics as expand. 
- if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); - setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); - - // CNT supports only B element sizes. - if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); - - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); - - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); - - if (Subtarget->isLittleEndian()) { - for (unsigned im = (unsigned)ISD::PRE_INC; - im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); - } - } -} - -void ARM64TargetLowering::addDRTypeForNEON(MVT VT) { - addRegisterClass(VT, &ARM64::FPR64RegClass); - addTypeForNEON(VT, MVT::v2i32); -} - -void ARM64TargetLowering::addQRTypeForNEON(MVT VT) { - addRegisterClass(VT, &ARM64::FPR128RegClass); - addTypeForNEON(VT, MVT::v4i32); -} - -EVT ARM64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) - return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} - -/// computeKnownBitsForTargetNode - Determine which of the bits specified in -/// Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. 
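// Worked example (standalone sketch, not DAG code) of the known-bits rule used
// for CSEL below: whatever is known about both possible results is known about
// the select, so the two KnownZero/KnownOne sets are intersected.
#include <cassert>
#include <cstdint>

int main() {
  // Both candidate values have their top 56 bits clear (say, two zero-extended
  // i8 loads), so the selected value does too, whichever way the condition goes.
  uint64_t a = 0xAB, b = 0x42;
  uint64_t known_zero_a = ~0xFFULL, known_zero_b = ~0xFFULL;
  uint64_t known_zero_sel = known_zero_a & known_zero_b; // intersection rule
  for (int cond = 0; cond < 2; ++cond)
    assert(((cond ? a : b) & known_zero_sel) == 0); // no known-zero bit is set
  return 0;
}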
-void ARM64TargetLowering::computeKnownBitsForTargetNode(
-    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
-    const SelectionDAG &DAG, unsigned Depth) const {
-  switch (Op.getOpcode()) {
-  default:
-    break;
-  case ARM64ISD::CSEL: {
-    APInt KnownZero2, KnownOne2;
-    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
-    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
-    KnownZero &= KnownZero2;
-    KnownOne &= KnownOne2;
-    break;
-  }
-  case ISD::INTRINSIC_W_CHAIN: {
-    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
-    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
-    switch (IntID) {
-    default: return;
-    case Intrinsic::arm64_ldaxr:
-    case Intrinsic::arm64_ldxr: {
-      unsigned BitWidth = KnownOne.getBitWidth();
-      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
-      unsigned MemBits = VT.getScalarType().getSizeInBits();
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
-      return;
-    }
-    }
-    break;
-  }
-  case ISD::INTRINSIC_WO_CHAIN:
-  case ISD::INTRINSIC_VOID: {
-    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_umaxv:
-    case Intrinsic::arm64_neon_uminv: {
-      // Figure out the datatype of the vector operand. The UMINV instruction
-      // will zero extend the result, so we can mark as known zero all the
-      // bits larger than the element datatype. 32-bit or larger doesn't need
-      // this as those are legal types and will be handled by isel directly.
-      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
-      unsigned BitWidth = KnownZero.getBitWidth();
-      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
-        assert(BitWidth >= 8 && "Unexpected width!");
-        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
-        KnownZero |= Mask;
-      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
-        assert(BitWidth >= 16 && "Unexpected width!");
-        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
-        KnownZero |= Mask;
-      }
-      break;
-    } break;
-    }
-  }
-  }
-}
-
-MVT ARM64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
-  return MVT::i64;
-}
-
-unsigned ARM64TargetLowering::getMaximalGlobalOffset() const {
-  // FIXME: On ARM64, this depends on the type.
-  // Basically, the addressable offsets are 0 to 4095 * Ty.getSizeInBytes(),
-  // and the offset has to be a multiple of the related size in bytes.
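// Worked example (sketch of the arithmetic described above): the scaled 12-bit
// unsigned offset form of LDR/STR reaches 0 .. 4095 * sizeof(element), in
// steps of sizeof(element).
#include <cstdint>

constexpr uint64_t maxByteOffset(uint64_t elemSize) { return 4095 * elemSize; }

static_assert(maxByteOffset(1) == 4095, "byte accesses: offsets 0..4095");
static_assert(maxByteOffset(8) == 32760, "8-byte accesses: 0..32760, in 8s");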
- return 4095; -} - -FastISel * -ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const { - return ARM64::createFastISel(funcInfo, libInfo); -} - -const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - return nullptr; - case ARM64ISD::CALL: return "ARM64ISD::CALL"; - case ARM64ISD::ADRP: return "ARM64ISD::ADRP"; - case ARM64ISD::ADDlow: return "ARM64ISD::ADDlow"; - case ARM64ISD::LOADgot: return "ARM64ISD::LOADgot"; - case ARM64ISD::RET_FLAG: return "ARM64ISD::RET_FLAG"; - case ARM64ISD::BRCOND: return "ARM64ISD::BRCOND"; - case ARM64ISD::CSEL: return "ARM64ISD::CSEL"; - case ARM64ISD::FCSEL: return "ARM64ISD::FCSEL"; - case ARM64ISD::CSINV: return "ARM64ISD::CSINV"; - case ARM64ISD::CSNEG: return "ARM64ISD::CSNEG"; - case ARM64ISD::CSINC: return "ARM64ISD::CSINC"; - case ARM64ISD::THREAD_POINTER: return "ARM64ISD::THREAD_POINTER"; - case ARM64ISD::TLSDESC_CALL: return "ARM64ISD::TLSDESC_CALL"; - case ARM64ISD::ADC: return "ARM64ISD::ADC"; - case ARM64ISD::SBC: return "ARM64ISD::SBC"; - case ARM64ISD::ADDS: return "ARM64ISD::ADDS"; - case ARM64ISD::SUBS: return "ARM64ISD::SUBS"; - case ARM64ISD::ADCS: return "ARM64ISD::ADCS"; - case ARM64ISD::SBCS: return "ARM64ISD::SBCS"; - case ARM64ISD::ANDS: return "ARM64ISD::ANDS"; - case ARM64ISD::FCMP: return "ARM64ISD::FCMP"; - case ARM64ISD::FMIN: return "ARM64ISD::FMIN"; - case ARM64ISD::FMAX: return "ARM64ISD::FMAX"; - case ARM64ISD::DUP: return "ARM64ISD::DUP"; - case ARM64ISD::DUPLANE8: return "ARM64ISD::DUPLANE8"; - case ARM64ISD::DUPLANE16: return "ARM64ISD::DUPLANE16"; - case ARM64ISD::DUPLANE32: return "ARM64ISD::DUPLANE32"; - case ARM64ISD::DUPLANE64: return "ARM64ISD::DUPLANE64"; - case ARM64ISD::MOVI: return "ARM64ISD::MOVI"; - case ARM64ISD::MOVIshift: return "ARM64ISD::MOVIshift"; - case ARM64ISD::MOVIedit: return "ARM64ISD::MOVIedit"; - case ARM64ISD::MOVImsl: return "ARM64ISD::MOVImsl"; - case ARM64ISD::FMOV: return "ARM64ISD::FMOV"; - case ARM64ISD::MVNIshift: return "ARM64ISD::MVNIshift"; - case ARM64ISD::MVNImsl: return "ARM64ISD::MVNImsl"; - case ARM64ISD::BICi: return "ARM64ISD::BICi"; - case ARM64ISD::ORRi: return "ARM64ISD::ORRi"; - case ARM64ISD::BSL: return "ARM64ISD::BSL"; - case ARM64ISD::NEG: return "ARM64ISD::NEG"; - case ARM64ISD::EXTR: return "ARM64ISD::EXTR"; - case ARM64ISD::ZIP1: return "ARM64ISD::ZIP1"; - case ARM64ISD::ZIP2: return "ARM64ISD::ZIP2"; - case ARM64ISD::UZP1: return "ARM64ISD::UZP1"; - case ARM64ISD::UZP2: return "ARM64ISD::UZP2"; - case ARM64ISD::TRN1: return "ARM64ISD::TRN1"; - case ARM64ISD::TRN2: return "ARM64ISD::TRN2"; - case ARM64ISD::REV16: return "ARM64ISD::REV16"; - case ARM64ISD::REV32: return "ARM64ISD::REV32"; - case ARM64ISD::REV64: return "ARM64ISD::REV64"; - case ARM64ISD::EXT: return "ARM64ISD::EXT"; - case ARM64ISD::VSHL: return "ARM64ISD::VSHL"; - case ARM64ISD::VLSHR: return "ARM64ISD::VLSHR"; - case ARM64ISD::VASHR: return "ARM64ISD::VASHR"; - case ARM64ISD::CMEQ: return "ARM64ISD::CMEQ"; - case ARM64ISD::CMGE: return "ARM64ISD::CMGE"; - case ARM64ISD::CMGT: return "ARM64ISD::CMGT"; - case ARM64ISD::CMHI: return "ARM64ISD::CMHI"; - case ARM64ISD::CMHS: return "ARM64ISD::CMHS"; - case ARM64ISD::FCMEQ: return "ARM64ISD::FCMEQ"; - case ARM64ISD::FCMGE: return "ARM64ISD::FCMGE"; - case ARM64ISD::FCMGT: return "ARM64ISD::FCMGT"; - case ARM64ISD::CMEQz: return "ARM64ISD::CMEQz"; - case ARM64ISD::CMGEz: return "ARM64ISD::CMGEz"; - case ARM64ISD::CMGTz: return "ARM64ISD::CMGTz"; - 
case ARM64ISD::CMLEz: return "ARM64ISD::CMLEz"; - case ARM64ISD::CMLTz: return "ARM64ISD::CMLTz"; - case ARM64ISD::FCMEQz: return "ARM64ISD::FCMEQz"; - case ARM64ISD::FCMGEz: return "ARM64ISD::FCMGEz"; - case ARM64ISD::FCMGTz: return "ARM64ISD::FCMGTz"; - case ARM64ISD::FCMLEz: return "ARM64ISD::FCMLEz"; - case ARM64ISD::FCMLTz: return "ARM64ISD::FCMLTz"; - case ARM64ISD::NOT: return "ARM64ISD::NOT"; - case ARM64ISD::BIT: return "ARM64ISD::BIT"; - case ARM64ISD::CBZ: return "ARM64ISD::CBZ"; - case ARM64ISD::CBNZ: return "ARM64ISD::CBNZ"; - case ARM64ISD::TBZ: return "ARM64ISD::TBZ"; - case ARM64ISD::TBNZ: return "ARM64ISD::TBNZ"; - case ARM64ISD::TC_RETURN: return "ARM64ISD::TC_RETURN"; - case ARM64ISD::SITOF: return "ARM64ISD::SITOF"; - case ARM64ISD::UITOF: return "ARM64ISD::UITOF"; - case ARM64ISD::SQSHL_I: return "ARM64ISD::SQSHL_I"; - case ARM64ISD::UQSHL_I: return "ARM64ISD::UQSHL_I"; - case ARM64ISD::SRSHR_I: return "ARM64ISD::SRSHR_I"; - case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I"; - case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I"; - case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge"; - case ARM64ISD::LD2post: return "ARM64ISD::LD2post"; - case ARM64ISD::LD3post: return "ARM64ISD::LD3post"; - case ARM64ISD::LD4post: return "ARM64ISD::LD4post"; - case ARM64ISD::ST2post: return "ARM64ISD::ST2post"; - case ARM64ISD::ST3post: return "ARM64ISD::ST3post"; - case ARM64ISD::ST4post: return "ARM64ISD::ST4post"; - case ARM64ISD::LD1x2post: return "ARM64ISD::LD1x2post"; - case ARM64ISD::LD1x3post: return "ARM64ISD::LD1x3post"; - case ARM64ISD::LD1x4post: return "ARM64ISD::LD1x4post"; - case ARM64ISD::ST1x2post: return "ARM64ISD::ST1x2post"; - case ARM64ISD::ST1x3post: return "ARM64ISD::ST1x3post"; - case ARM64ISD::ST1x4post: return "ARM64ISD::ST1x4post"; - case ARM64ISD::LD1DUPpost: return "ARM64ISD::LD1DUPpost"; - case ARM64ISD::LD2DUPpost: return "ARM64ISD::LD2DUPpost"; - case ARM64ISD::LD3DUPpost: return "ARM64ISD::LD3DUPpost"; - case ARM64ISD::LD4DUPpost: return "ARM64ISD::LD4DUPpost"; - case ARM64ISD::LD1LANEpost: return "ARM64ISD::LD1LANEpost"; - case ARM64ISD::LD2LANEpost: return "ARM64ISD::LD2LANEpost"; - case ARM64ISD::LD3LANEpost: return "ARM64ISD::LD3LANEpost"; - case ARM64ISD::LD4LANEpost: return "ARM64ISD::LD4LANEpost"; - case ARM64ISD::ST2LANEpost: return "ARM64ISD::ST2LANEpost"; - case ARM64ISD::ST3LANEpost: return "ARM64ISD::ST3LANEpost"; - case ARM64ISD::ST4LANEpost: return "ARM64ISD::ST4LANEpost"; - } -} - -MachineBasicBlock * -ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *MBB) const { - // We materialise the F128CSEL pseudo-instruction as some control flow and a - // phi node: - - // OrigBB: - // [... previous instrs leading to comparison ...] 
- // b.ne TrueBB - // b EndBB - // TrueBB: - // ; Fallthrough - // EndBB: - // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; - - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); - - MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, TrueBB); - MF->insert(It, EndBB); - - // Transfer rest of current basic-block to EndBB - EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), - MBB->end()); - EndBB->transferSuccessorsAndUpdatePHIs(MBB); - - BuildMI(MBB, DL, TII->get(ARM64::Bcc)).addImm(CondCode).addMBB(TrueBB); - BuildMI(MBB, DL, TII->get(ARM64::B)).addMBB(EndBB); - MBB->addSuccessor(TrueBB); - MBB->addSuccessor(EndBB); - - // TrueBB falls through to the end. - TrueBB->addSuccessor(EndBB); - - if (!NZCVKilled) { - TrueBB->addLiveIn(ARM64::NZCV); - EndBB->addLiveIn(ARM64::NZCV); - } - - BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg) - .addReg(IfTrueReg) - .addMBB(TrueBB) - .addReg(IfFalseReg) - .addMBB(MBB); - - MI->eraseFromParent(); - return EndBB; -} - -MachineBasicBlock * -ARM64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { - default: -#ifndef NDEBUG - MI->dump(); -#endif - assert(0 && "Unexpected instruction for custom inserter!"); - break; - - case ARM64::F128CSEL: - return EmitF128CSEL(MI, BB); - - case TargetOpcode::STACKMAP: - case TargetOpcode::PATCHPOINT: - return emitPatchPoint(MI, BB); - } - llvm_unreachable("Unexpected instruction for custom inserter!"); -} - -//===----------------------------------------------------------------------===// -// ARM64 Lowering private implementation. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Lowering Code -//===----------------------------------------------------------------------===// - -/// changeIntCCToARM64CC - Convert a DAG integer condition code to an ARM64 CC -static ARM64CC::CondCode changeIntCCToARM64CC(ISD::CondCode CC) { - switch (CC) { - default: - llvm_unreachable("Unknown condition code!"); - case ISD::SETNE: - return ARM64CC::NE; - case ISD::SETEQ: - return ARM64CC::EQ; - case ISD::SETGT: - return ARM64CC::GT; - case ISD::SETGE: - return ARM64CC::GE; - case ISD::SETLT: - return ARM64CC::LT; - case ISD::SETLE: - return ARM64CC::LE; - case ISD::SETUGT: - return ARM64CC::HI; - case ISD::SETUGE: - return ARM64CC::HS; - case ISD::SETULT: - return ARM64CC::LO; - case ISD::SETULE: - return ARM64CC::LS; - } -} - -/// changeFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC. 
-static void changeFPCCToARM64CC(ISD::CondCode CC, ARM64CC::CondCode &CondCode, - ARM64CC::CondCode &CondCode2) { - CondCode2 = ARM64CC::AL; - switch (CC) { - default: - llvm_unreachable("Unknown FP condition!"); - case ISD::SETEQ: - case ISD::SETOEQ: - CondCode = ARM64CC::EQ; - break; - case ISD::SETGT: - case ISD::SETOGT: - CondCode = ARM64CC::GT; - break; - case ISD::SETGE: - case ISD::SETOGE: - CondCode = ARM64CC::GE; - break; - case ISD::SETOLT: - CondCode = ARM64CC::MI; - break; - case ISD::SETOLE: - CondCode = ARM64CC::LS; - break; - case ISD::SETONE: - CondCode = ARM64CC::MI; - CondCode2 = ARM64CC::GT; - break; - case ISD::SETO: - CondCode = ARM64CC::VC; - break; - case ISD::SETUO: - CondCode = ARM64CC::VS; - break; - case ISD::SETUEQ: - CondCode = ARM64CC::EQ; - CondCode2 = ARM64CC::VS; - break; - case ISD::SETUGT: - CondCode = ARM64CC::HI; - break; - case ISD::SETUGE: - CondCode = ARM64CC::PL; - break; - case ISD::SETLT: - case ISD::SETULT: - CondCode = ARM64CC::LT; - break; - case ISD::SETLE: - case ISD::SETULE: - CondCode = ARM64CC::LE; - break; - case ISD::SETNE: - case ISD::SETUNE: - CondCode = ARM64CC::NE; - break; - } -} - -/// changeVectorFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC -/// usable with the vector instructions. Fewer operations are available without -/// a real NZCV register, so we have to use less efficient combinations to get -/// the same effect. -static void changeVectorFPCCToARM64CC(ISD::CondCode CC, - ARM64CC::CondCode &CondCode, - ARM64CC::CondCode &CondCode2, - bool &Invert) { - Invert = false; - switch (CC) { - default: - // Mostly the scalar mappings work fine. - changeFPCCToARM64CC(CC, CondCode, CondCode2); - break; - case ISD::SETUO: - Invert = true; // Fallthrough - case ISD::SETO: - CondCode = ARM64CC::MI; - CondCode2 = ARM64CC::GE; - break; - case ISD::SETUEQ: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETUGT: - case ISD::SETUGE: - // All of the compare-mask comparisons are ordered, but we can switch - // between the two by a double inversion. E.g. ULE == !OGT. - Invert = true; - changeFPCCToARM64CC(getSetCCInverse(CC, false), CondCode, CondCode2); - break; - } -} - -static bool isLegalArithImmed(uint64_t C) { - // Matches ARM64DAGToDAGISel::SelectArithImmed(). - return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); -} - -static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { - EVT VT = LHS.getValueType(); - - if (VT.isFloatingPoint()) - return DAG.getNode(ARM64ISD::FCMP, dl, VT, LHS, RHS); - - // The CMP instruction is just an alias for SUBS, and representing it as - // SUBS means that it's possible to get CSE with subtract operations. - // A later phase can perform the optimization of setting the destination - // register to WZR/XZR if it ends up being unused. - unsigned Opcode = ARM64ISD::SUBS; - - if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && - cast(RHS.getOperand(0))->getZExtValue() == 0 && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on - // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags - // can be set differently by this operation. It comes down to whether - // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then - // everything is fine. If not then the optimization is wrong. Thus general - // comparisons are only valid if op2 != 0. 
- - // So, finally, the only LLVM-native comparisons that don't mention C and V - // are SETEQ and SETNE. They're the only ones we can safely use CMN for in - // the absence of information about op2. - Opcode = ARM64ISD::ADDS; - RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && - cast(RHS)->getZExtValue() == 0 && - !isUnsignedIntSetCC(CC)) { - // Similarly, (CMP (and X, Y), 0) can be implemented with a TST - // (a.k.a. ANDS) except that the flags are only guaranteed to work for one - // of the signed comparisons. - Opcode = ARM64ISD::ANDS; - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); - } - - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) - .getValue(1); -} - -static SDValue getARM64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARM64cc, SelectionDAG &DAG, SDLoc dl) { - if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { - EVT VT = RHS.getValueType(); - uint64_t C = RHSC->getZExtValue(); - if (!isLegalArithImmed(C)) { - // Constant does not fit, try adjusting it by one? - switch (CC) { - default: - break; - case ISD::SETLT: - case ISD::SETGE: - if ((VT == MVT::i32 && C != 0x80000000 && - isLegalArithImmed((uint32_t)(C - 1))) || - (VT == MVT::i64 && C != 0x80000000ULL && - isLegalArithImmed(C - 1ULL))) { - CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; - C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETULT: - case ISD::SETUGE: - if ((VT == MVT::i32 && C != 0 && - isLegalArithImmed((uint32_t)(C - 1))) || - (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETLE: - case ISD::SETGT: - if ((VT == MVT::i32 && C != 0x7fffffff && - isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0x7ffffffffffffffULL && - isLegalArithImmed(C + 1ULL))) { - CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; - C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETULE: - case ISD::SETUGT: - if ((VT == MVT::i32 && C != 0xffffffff && - isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0xfffffffffffffffULL && - isLegalArithImmed(C + 1ULL))) { - CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; - C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); - } - break; - } - } - } - - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC); - ARM64cc = DAG.getConstant(ARM64CC, MVT::i32); - return Cmp; -} - -static std::pair -getARM64XALUOOp(ARM64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { - assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && - "Unsupported value type"); - SDValue Value, Overflow; - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - unsigned Opc = 0; - switch (Op.getOpcode()) { - default: - llvm_unreachable("Unknown overflow instruction!"); - case ISD::SADDO: - Opc = ARM64ISD::ADDS; - CC = ARM64CC::VS; - break; - case ISD::UADDO: - Opc = ARM64ISD::ADDS; - CC = ARM64CC::HS; - break; - case ISD::SSUBO: - Opc = ARM64ISD::SUBS; - CC = ARM64CC::VS; - break; - case ISD::USUBO: - Opc = ARM64ISD::SUBS; - CC = ARM64CC::LO; - break; - // Multiply needs a little bit extra work. 
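// Standalone model (assumed helper name, not LLVM API) of the signed 32-bit
// multiply-overflow check implemented in the SMULO case below: multiply in 64
// bits, then overflow occurred exactly when the high 32 bits differ from the
// sign-extension of bit 31 of the low 32 bits.
#include <cassert>
#include <cstdint>

bool smulo32(int32_t a, int32_t b, int32_t &res) {
  int64_t wide = (int64_t)a * (int64_t)b; // SMULL-style widening multiply
  res = (int32_t)wide;                    // low 32 bits of the product
  int32_t upper = (int32_t)(wide >> 32);  // high 32 bits of the product
  int32_t lower = res >> 31;              // 0 or -1: sign of the 32-bit result
  return upper != lower;                  // mirrors the SUBS + NE check
}

int main() {
  int32_t r;
  assert(!smulo32(46341, 46340, r) && r == 2147441940); // still fits in i32
  assert(smulo32(46341, 46341, r));                     // just past INT32_MAX
  assert(smulo32(-46341, 46341, r));                    // just past INT32_MIN
  return 0;
}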
- case ISD::SMULO: - case ISD::UMULO: { - CC = ARM64CC::NE; - bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; - if (Op.getValueType() == MVT::i32) { - unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - // For a 32 bit multiply with overflow check we want the instruction - // selector to generate a widening multiply (SMADDL/UMADDL). For that we - // need to generate the following pattern: - // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) - LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); - RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); - SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, - DAG.getConstant(0, MVT::i64)); - // On ARM64 the upper 32 bits are always zero extended for a 32 bit - // operation. We need to clear out the upper 32 bits, because we used a - // widening multiply that wrote all 64 bits. In the end this should be a - // noop. - Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); - if (IsSigned) { - // The signed overflow check requires more than just a simple check for - // any bit set in the upper 32 bits of the result. These bits could be - // just the sign bits of a negative number. To perform the overflow - // check we have to arithmetic shift right the 32nd bit of the result by - // 31 bits. Then we compare the result to the upper 32 bits. - SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, - DAG.getConstant(32, MVT::i64)); - UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); - SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, - DAG.getConstant(31, MVT::i64)); - // It is important that LowerBits is last, otherwise the arithmetic - // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); - Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits) - .getValue(1); - } else { - // The overflow check for unsigned multiply is easy. We only need to - // check if any of the upper 32 bits are set. This can be done with a - // CMP (shifted register). For that we need to generate the following - // pattern: - // (i64 ARM64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) - SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, - DAG.getConstant(32, MVT::i64)); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = - DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), - UpperBits).getValue(1); - } - break; - } - assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); - // For the 64 bit multiply - Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); - if (IsSigned) { - SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); - SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, - DAG.getConstant(63, MVT::i64)); - // It is important that LowerBits is last, otherwise the arithmetic - // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits) - .getValue(1); - } else { - SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = - DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), - UpperBits).getValue(1); - } - break; - } - } // switch (...) - - if (Opc) { - SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - - // Emit the ARM64 operation with overflow check. 
- Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); - Overflow = Value.getValue(1); - } - return std::make_pair(Value, Overflow); -} - -SDValue ARM64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - SmallVector Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; -} - -static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { - SDValue Sel = Op.getOperand(0); - SDValue Other = Op.getOperand(1); - - // If neither operand is a SELECT_CC, give up. - if (Sel.getOpcode() != ISD::SELECT_CC) - std::swap(Sel, Other); - if (Sel.getOpcode() != ISD::SELECT_CC) - return Op; - - // The folding we want to perform is: - // (xor x, (select_cc a, b, cc, 0, -1) ) - // --> - // (csel x, (xor x, -1), cc ...) - // - // The latter will get matched to a CSINV instruction. - - ISD::CondCode CC = cast(Sel.getOperand(4))->get(); - SDValue LHS = Sel.getOperand(0); - SDValue RHS = Sel.getOperand(1); - SDValue TVal = Sel.getOperand(2); - SDValue FVal = Sel.getOperand(3); - SDLoc dl(Sel); - - // FIXME: This could be generalized to non-integer comparisons. - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return Op; - - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - // The the values aren't constants, this isn't the pattern we're looking for. - if (!CFVal || !CTVal) - return Op; - - // We can commute the SELECT_CC by inverting the condition. This - // might be needed to make this fit into a CSINV pattern. - if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - // If the constants line up, perform the transform! - if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - FVal = Other; - TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, - DAG.getConstant(-1ULL, Other.getValueType())); - - return DAG.getNode(ARM64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, - CCVal, Cmp); - } - - return Op; -} - -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: - assert(0 && "Invalid code"); - case ISD::ADDC: - Opc = ARM64ISD::ADDS; - break; - case ISD::SUBC: - Opc = ARM64ISD::SUBS; - break; - case ISD::ADDE: - Opc = ARM64ISD::ADCS; - ExtraOp = true; - break; - case ISD::SUBE: - Opc = ARM64ISD::SBCS; - ExtraOp = true; - break; - } - - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), - Op.getOperand(2)); -} - -static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) - return SDValue(); - - ARM64CC::CondCode CC; - // The actual operation that sets the overflow or carry flag. - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(CC, Op, DAG); - - // We use 0 and 1 as false and true values. 
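// Tiny model (sketch) of the 0/1 materialisation used just below: CSINC with
// both sources WZR returns cond ? 0 : 1, so passing the *inverted* overflow
// condition yields overflow ? 1 : 0 in a single instruction.
#include <cassert>

unsigned csinc_wzr_wzr(bool cond) { return cond ? 0u : 1u; } // Wn=0, Wm+1=1

int main() {
  for (int overflow = 0; overflow < 2; ++overflow)
    assert(csinc_wzr_wzr(/*inverted cond=*/!overflow) == (unsigned)overflow);
  return 0;
}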
-  SDValue TVal = DAG.getConstant(1, MVT::i32);
-  SDValue FVal = DAG.getConstant(0, MVT::i32);
-
-  // We use an inverted condition, because the conditional select is inverted
-  // too. This will allow it to be selected to a single instruction:
-  // CSINC Wd, WZR, WZR, invert(cond).
-  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
-  Overflow = DAG.getNode(ARM64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal,
-                         Overflow);
-
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
-}
-
-// Prefetch operands are:
-// 1: Address to prefetch
-// 2: bool isWrite
-// 3: int locality (0 = no locality ... 3 = extreme locality)
-// 4: bool isDataCache
-static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
-  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-  // The data thing is not used.
-  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
-
-  bool IsStream = !Locality;
-  // When the locality number is set
-  if (Locality) {
-    // The front-end should have filtered out the out-of-range values
-    assert(Locality <= 3 && "Prefetch locality out-of-range");
-    // The locality degree is the opposite of the cache speed.
-    // Put the number the other way around.
-    // The encoding starts at 0 for level 1
-    Locality = 3 - Locality;
-  }
-
-  // Build the mask value encoding the expected behavior.
-  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
-                   (Locality << 1) |    // Cache level bits
-                   (unsigned)IsStream;  // Stream bit
-  return DAG.getNode(ARM64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
-}
-
-SDValue ARM64TargetLowering::LowerFP_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
-}
-
-SDValue ARM64TargetLowering::LowerFP_ROUND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
-  }
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  // FP_ROUND node has a second operand indicating whether it is known to be
-  // precise. That doesn't take part in the LibCall so we can't directly use
-  // LowerF128Call.
-  SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, SDLoc(Op)).first;
-}
-
-static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
-  // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
-  // Any additional optimization in this function should be recorded
-  // in the cost tables.
-  EVT InVT = Op.getOperand(0).getValueType();
-  EVT VT = Op.getValueType();
-
-  // FP_TO_XINT conversions from the same type are legal.
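// Reference model (sketch, assumed helper name) of the PRFM immediate built in
// LowerPREFETCH above: bit 4 selects a store prefetch, bits 2:1 hold the cache
// level (locality reversed, so locality 3 maps to level 0 == L1), and bit 0
// marks the streaming/non-temporal hint.
#include <cassert>

unsigned prfmImm(bool isWrite, unsigned locality /*0..3*/) {
  bool isStream = (locality == 0);
  unsigned level = locality ? 3 - locality : 0;
  return (isWrite << 4) | (level << 1) | (unsigned)isStream;
}

int main() {
  assert(prfmImm(false, 3) == 0x00); // PLDL1KEEP: read, keep in L1
  assert(prfmImm(true, 0) == 0x11);  // PSTL1STRM: write, non-temporal
  return 0;
}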
- if (VT.getSizeInBits() == InVT.getSizeInBits()) - return Op; - - if (InVT == MVT::v2f64 || InVT == MVT::v4f32) { - SDLoc dl(Op); - SDValue Cv = - DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), - Op.getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); - } else if (InVT == MVT::v2f32) { - SDLoc dl(Op); - SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, VT, Ext); - } - - // Type changing conversions are illegal. - return SDValue(); -} - -SDValue ARM64TargetLowering::LowerFP_TO_INT(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType().isVector()) - return LowerVectorFP_TO_INT(Op, DAG); - - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - - SmallVector Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; -} - -static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { - // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp. - // Any additional optimization in this function should be recorded - // in the cost tables. - EVT VT = Op.getValueType(); - SDLoc dl(Op); - SDValue In = Op.getOperand(0); - EVT InVT = In.getValueType(); - - // v2i32 to v2f32 is legal. - if (VT == MVT::v2f32 && InVT == MVT::v2i32) - return Op; - - // This function only handles v2f64 outputs. - if (VT == MVT::v2f64) { - // Extend the input argument to a v2i64 that we can feed into the - // floating point conversion. Zero or sign extend based on whether - // we're doing a signed or unsigned float conversion. - unsigned Opc = - Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); - SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); - } - - // Scalarize v2i64 to v2f32 conversions. - std::vector BuildVectorOps; - for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { - SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, - DAG.getConstant(i, MVT::i64)); - Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); - BuildVectorOps.push_back(Sclr); - } - - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); -} - -SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) - return LowerVectorINT_TO_FP(Op, DAG); - - // i128 conversions are libcalls. - if (Op.getOperand(0).getValueType() == MVT::i128) - return SDValue(); - - // Other conversions are legal, unless it's to the completely software-based - // fp128. 
- if (Op.getValueType() != MVT::f128) - return Op; - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); -} - -SDValue ARM64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; - Args.push_back(Entry); - - const char *LibcallName = - (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); - - std::pair CallResult = LowerCallTo(CLI); - return CallResult.first; -} - -SDValue ARM64TargetLowering::LowerOperation(SDValue Op, - SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: - llvm_unreachable("unimplemented operand"); - return SDValue(); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: - return LowerGlobalTLSAddress(Op, DAG); - case ISD::SETCC: - return LowerSETCC(Op, DAG); - case ISD::BR_CC: - return LowerBR_CC(Op, DAG); - case ISD::SELECT: - return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: - return LowerSELECT_CC(Op, DAG); - case ISD::JumpTable: - return LowerJumpTable(Op, DAG); - case ISD::ConstantPool: - return LowerConstantPool(Op, DAG); - case ISD::BlockAddress: - return LowerBlockAddress(Op, DAG); - case ISD::VASTART: - return LowerVASTART(Op, DAG); - case ISD::VACOPY: - return LowerVACOPY(Op, DAG); - case ISD::VAARG: - return LowerVAARG(Op, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: - case ISD::SMULO: - case ISD::UMULO: - return LowerXALUO(Op, DAG); - case ISD::FADD: - return LowerF128Call(Op, DAG, RTLIB::ADD_F128); - case ISD::FSUB: - return LowerF128Call(Op, DAG, RTLIB::SUB_F128); - case ISD::FMUL: - return LowerF128Call(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: - return LowerF128Call(Op, DAG, RTLIB::DIV_F128); - case ISD::FP_ROUND: - return LowerFP_ROUND(Op, DAG); - case ISD::FP_EXTEND: - return LowerFP_EXTEND(Op, DAG); - case ISD::FRAMEADDR: - return LowerFRAMEADDR(Op, DAG); - case ISD::RETURNADDR: - return LowerRETURNADDR(Op, DAG); - case ISD::INSERT_VECTOR_ELT: - return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: - return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG); - case ISD::VECTOR_SHUFFLE: - return LowerVECTOR_SHUFFLE(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: - return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::SRA: - case ISD::SRL: - case ISD::SHL: - return LowerVectorSRA_SRL_SHL(Op, DAG); - case ISD::SHL_PARTS: - return LowerShiftLeftParts(Op, DAG); - case ISD::SRL_PARTS: - case ISD::SRA_PARTS: - return LowerShiftRightParts(Op, DAG); - case ISD::CTPOP: - return LowerCTPOP(Op, DAG); 
- case ISD::FCOPYSIGN: - return LowerFCOPYSIGN(Op, DAG); - case ISD::AND: - return LowerVectorAND(Op, DAG); - case ISD::OR: - return LowerVectorOR(Op, DAG); - case ISD::XOR: - return LowerXOR(Op, DAG); - case ISD::PREFETCH: - return LowerPREFETCH(Op, DAG); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return LowerINT_TO_FP(Op, DAG); - case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: - return LowerFP_TO_INT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); - } -} - -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned ARM64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - -//===----------------------------------------------------------------------===// -// Calling Convention Implementation -//===----------------------------------------------------------------------===// - -#include "ARM64GenCallingConv.inc" - -/// Selects the correct CCAssignFn for a the given CallingConvention -/// value. -CCAssignFn *ARM64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, - bool IsVarArg) const { - switch (CC) { - default: - llvm_unreachable("Unsupported calling convention."); - case CallingConv::WebKit_JS: - return CC_ARM64_WebKit_JS; - case CallingConv::C: - case CallingConv::Fast: - if (!Subtarget->isTargetDarwin()) - return CC_ARM64_AAPCS; - return IsVarArg ? CC_ARM64_DarwinPCS_VarArg : CC_ARM64_DarwinPCS; - } -} - -SDValue ARM64TargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Assign locations to all of the incoming arguments. - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - // At this point, Ins[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. - // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeFormalArguments to pass in ValVT and - // LocVT. - unsigned NumArgs = Ins.size(); - Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); - unsigned CurArgIdx = 0; - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Ins[i].VT; - std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[i].OrigArgIndex; - - // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = - AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - assert(ArgLocs.size() == Ins.size()); - SmallVector ArgValues; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - - if (Ins[i].Flags.isByVal()) { - // Byval is used for HFAs in the PCS, but the system should work in a - // non-compliant manner for larger structs. 
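// Example (sketch, hypothetical types) of the Homogeneous Floating-point
// Aggregates referred to above: one to four members of a single floating-point
// type, passed in consecutive FP/SIMD registers when enough are free.
struct Quaternion { float x, y, z, w; };  // HFA: four floats -> s0..s3
struct NotAnHFA   { float x; double y; }; // mixed element types: not an HFA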
- EVT PtrTy = getPointerTy(); - int Size = Ins[i].Flags.getByValSize(); - unsigned NumRegs = (Size + 7) / 8; - - // FIXME: This works on big-endian for composite byvals, which are the common - // case. It should also work for fundamental types too. - unsigned FrameIdx = - MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); - InVals.push_back(FrameIdxN); - - continue; - } if (VA.isRegLoc()) { - // Arguments stored in registers. - EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; - const TargetRegisterClass *RC; - - if (RegVT == MVT::i32) - RC = &ARM64::GPR32RegClass; - else if (RegVT == MVT::i64) - RC = &ARM64::GPR64RegClass; - else if (RegVT == MVT::f32) - RC = &ARM64::FPR32RegClass; - else if (RegVT == MVT::f64 || RegVT.is64BitVector()) - RC = &ARM64::FPR64RegClass; - else if (RegVT == MVT::f128 || RegVT.is128BitVector()) - RC = &ARM64::FPR128RegClass; - else - llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); - - // Transform the arguments in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); - - // If this is an 8, 16 or 32-bit value, it is really passed promoted - // to 64 bits. Insert an assert[sz]ext to capture this, then - // truncate to the right size. - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::SExt: - ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::ZExt: - ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - } - - InVals.push_back(ArgValue); - - } else { // VA.isRegLoc() - assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); - unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - - uint32_t BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) - BEAlign = 8 - ArgSize; - - int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); - - // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - SDValue ArgValue; - - // If the loc type and val type are not the same, create an anyext load. - if (VA.getLocVT().getSizeInBits() != VA.getValVT().getSizeInBits()) { - // We should only get here if this is a pure integer. - assert(!VA.getValVT().isVector() && VA.getValVT().isInteger() && - "Only integer extension supported!"); - ArgValue = DAG.getExtLoad(ISD::EXTLOAD, DL, VA.getValVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - VA.getLocVT(), - false, false, false, 0); - } else { - ArgValue = DAG.getLoad(VA.getValVT(), DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), false, - false, false, 0); - } - - InVals.push_back(ArgValue); - } - } - - // varargs - if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { - // The AAPCS variadic function ABI is identical to the non-variadic - // one. As a result there may be more arguments in registers and we should - // save them for future reference. 
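// Worked example (sketch) of the register save area sizes computed by
// saveVarArgRegisters below: 8 x 8-byte GPRs (x0..x7) and 8 x 16-byte SIMD
// registers (q0..q7), minus whatever the named arguments already consumed.
#include <cassert>

int main() {
  // For a printf-like callee, x0 carries the format string, so 7 GPRs remain.
  unsigned firstVariadicGPR = 1, firstVariadicFPR = 0;
  assert(8 * (8 - firstVariadicGPR) == 56);   // GPR save area in bytes
  assert(16 * (8 - firstVariadicFPR) == 128); // FPR save area in bytes
  return 0;
}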
- saveVarArgRegisters(CCInfo, DAG, DL, Chain); - } - - ARM64FunctionInfo *AFI = MF.getInfo(); - // This will point to the next argument passed via stack. - unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); - } - - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - unsigned StackArgSize = CCInfo.getNextStackOffset(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { - // This is a non-standard ABI so by fiat I say we're allowed to make full - // use of the stack area to be popped, which must be aligned to 16 bytes in - // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); - - // If we're expected to restore the stack (e.g. fastcc) then we'll be adding - // a multiple of 16. - FuncInfo->setArgumentStackToRestore(StackArgSize); - - // This realignment carries over to the available bytes below. Our own - // callers will guarantee the space is free by giving an aligned value to - // CALLSEQ_START. - } - // Even if we're not expected to free up the space, it's useful to know how - // much is there while considering tail calls (because we can reuse it). - FuncInfo->setBytesInStackArgArea(StackArgSize); - - return Chain; -} - -void ARM64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - SmallVector MemOps; - - static const MCPhysReg GPRArgRegs[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); - - unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); - int GPRIdx = 0; - if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &ARM64::GPR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); - } - } - FuncInfo->setVarArgsGPRIndex(GPRIdx); - FuncInfo->setVarArgsGPRSize(GPRSaveSize); - - if (Subtarget->hasFPARMv8()) { - static const MCPhysReg FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2, - ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7 }; - static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); - - unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); - int FPRIdx = 0; - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, 
FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); - } - } - FuncInfo->setVarArgsFPRIndex(FPRIdx); - FuncInfo->setVarArgsFPRSize(FPRSaveSize); - } - - if (!MemOps.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); - } -} - -/// LowerCallResult - Lower the result values of a call into the -/// appropriate copies out of appropriate physical registers. -SDValue ARM64TargetLowering::LowerCallResult( - SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - // Assign locations to each value returned by this call. - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC); - - // Copy all of the result registers out of their specified physreg. - for (unsigned i = 0; i != RVLocs.size(); ++i) { - CCValAssign VA = RVLocs[i]; - - // Pass 'this' value directly from the argument to return value, to avoid - // reg unit interference - if (i == 0 && isThisReturn) { - assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && - "unexpected return calling convention register assignment"); - InVals.push_back(ThisVal); - continue; - } - - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); - break; - } - - InVals.push_back(Val); - } - - return Chain; -} - -bool ARM64TargetLowering::isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const { - // For CallingConv::C this function knows whether the ABI needs - // changing. That's not true for other conventions so they will have to opt in - // manually. - if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) - return false; - - const MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = MF.getFunction(); - CallingConv::ID CallerCC = CallerF->getCallingConv(); - bool CCMatch = CallerCC == CalleeCC; - - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible (see - // X86) but less efficient and uglier in LowerCall. - for (Function::const_arg_iterator i = CallerF->arg_begin(), - e = CallerF->arg_end(); - i != e; ++i) - if (i->hasByValAttr()) - return false; - - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; - } - - // Now we search for cases where we can use a tail call without changing the - // ABI. Sibcall is used in some places (particularly gcc) to refer to this - // concept. - - // I want anyone implementing a new calling convention to think long and hard - // about this assert. 
- assert((!isVarArg || CalleeCC == CallingConv::C) && - "Unexpected variadic calling convention"); - - if (isVarArg && !Outs.empty()) { - // At least two cases here: if caller is fastcc then we can't have any - // memory arguments (we'd be expected to clean up the stack afterwards). If - // caller is C then we could potentially use its argument area. - - // FIXME: for now we take the most conservative of these in both cases: - // disallow all variadic memory operands. - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) - return false; - } - - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. - if (!CCMatch) { - SmallVector RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) - return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } - } - - // Nothing more to check if the callee is taking no arguments - if (Outs.empty()) - return true; - - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); - - const ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); -} - -SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo *MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. 
- ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), - UE = DAG.getEntryNode().getNode()->use_end(); - U != UE; ++U) - if (LoadSDNode *L = dyn_cast(*U)) - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - -bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, - bool TailCallOpt) const { - return CallCC == CallingConv::Fast && TailCallOpt; -} - -bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; -} - -/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, -/// and add input and output parameter nodes. -SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &DL = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; - bool IsVarArg = CLI.IsVarArg; - - MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool IsThisReturn = false; - - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - bool IsSibCall = false; - - if (IsTailCall) { - // Check if it's really possible to do a tail call. - IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); - - // A sibling call is one where we're under the usual C ABI and not planning - // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall) - IsSibCall = true; - - if (IsTailCall) - ++NumTailCalls; - } - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - if (IsVarArg) { - // Handle fixed and variable vector arguments differently. - // Variable vector arguments always go into memory. - unsigned NumArgs = Outs.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ArgVT = Outs[i].VT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, - /*IsVarArg=*/ !Outs[i].IsFixed); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } else { - // At this point, Outs[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. 
- // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeCallOperands to pass in ValVT and - // LocVT. - unsigned NumArgs = Outs.size(); - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Outs[i].VT; - // Get type of the original argument. - EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, - /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } - - // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); - - if (IsSibCall) { - // Since we're not changing the ABI to make this a tail call, the memory - // operands are already available in the caller's incoming argument space. - NumBytes = 0; - } - - // FPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int FPDiff = 0; - - if (IsTailCall && !IsSibCall) { - unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); - - // Since callee will pop argument stack as a tail call, we must keep the - // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); - - // FPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - FPDiff = NumReusableBytes - NumBytes; - - // The stack pointer must be 16-byte aligned at all times it's used for a - // memory operation, which in practice means at *all* times and in - // particular across call boundaries. Therefore our own arguments started at - // a 16-byte aligned SP and the delta applied for the tail call should - // satisfy the same constraint. - assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); - } - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) - Chain = - DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); - - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ARM64::SP, getPointerTy()); - - SmallVector, 8> RegsToPass; - SmallVector MemOpChains; - - // Walk the register/memloc assignments, inserting copies/loads. - for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; - ++i, ++realArgIdx) { - CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[realArgIdx]; - ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - - // Promote the value if needed. 
- switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::AExt: - Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::FPExt: - Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); - break; - } - - if (VA.isRegLoc()) { - if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { - assert(VA.getLocVT() == MVT::i64 && - "unexpected calling convention register assignment"); - assert(!Ins.empty() && Ins[0].VT == MVT::i64 && - "unexpected use of 'returned'"); - IsThisReturn = true; - } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { - assert(VA.isMemLoc()); - - SDValue DstAddr; - MachinePointerInfo DstInfo; - - // FIXME: This works on big-endian for composite byvals, which are the - // common case. It should also work for fundamental types too. - uint32_t BEAlign = 0; - unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 - : VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; - if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { - if (OpSize < 8) - BEAlign = 8 - OpSize; - } - unsigned LocMemOffset = VA.getLocMemOffset(); - int32_t Offset = LocMemOffset + BEAlign; - SDValue PtrOff = DAG.getIntPtrConstant(Offset); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - - if (IsTailCall) { - Offset = Offset + FPDiff; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); - DstInfo = MachinePointerInfo::getFixedStack(FI); - - // Make sure any stack arguments overlapping with where we're storing - // are loaded before this eventual operation. Otherwise they'll be - // clobbered. - Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); - } else { - SDValue PtrOff = DAG.getIntPtrConstant(Offset); - - DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); - } - - if (Outs[i].Flags.isByVal()) { - SDValue SizeNode = - DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); - SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVolatile = */ false, - /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); - - MemOpChains.push_back(Cpy); - } else { - // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already - // promoted to a legal register type i32, we should truncate Arg back to - // i1/i8/i16. - if (Arg.getValueType().isSimple() && - Arg.getValueType().getSimpleVT() == MVT::i32 && - (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || - VA.getLocVT() == MVT::i16)) - Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); - - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); - MemOpChains.push_back(Store); - } - } - } - - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); - - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into the appropriate regs. 
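As an illustration of the register/memory walk above (assuming AAPCS64, where the first eight integer arguments travel in x0-x7), a ninth integer argument ends up in the outgoing stack area via the store chain being built here:

    long sink(long, long, long, long, long, long, long, long, long);

    long demo(void) {
      // Arguments 1-8 become copy-to-reg nodes; "9" becomes a store relative
      // to SP collected in MemOpChains (sketch only).
      return sink(1, 2, 3, 4, 5, 6, 7, 8, 9);
    }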
- SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every - // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol - // node so that legalize doesn't hack it. - if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); - } - - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, DL); - InFlag = Chain.getValue(1); - } - - std::vector Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); - - if (IsTailCall) { - // Each tail call may have to adjust the stack by a different amount, so - // this information must travel along with the operation for eventual - // consumption by emitEpilogue. - Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); - } - - // Add argument registers to the end of the list so that they are known live - // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - - // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask; - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - if (IsThisReturn) { - // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); - if (!Mask) { - IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); - } - } else - Mask = ARI->getCallPreservedMask(CallConv); - - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - - if (InFlag.getNode()) - Ops.push_back(InFlag); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - - // If we're doing a tall call, use a TC_RETURN here rather than an - // actual call instruction. - if (IsTailCall) - return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, Ops); - - // Returns a chain and a flag for retval copy to use. 
- Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops); - InFlag = Chain.getValue(1); - - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; - - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag, DL); - if (!Ins.empty()) - InFlag = Chain.getValue(1); - - // Handle result values, copying them out of physregs into vregs that we - // return. - return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, - InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); -} - -bool ARM64TargetLowering::CanLowerReturn( - CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl &Outs, LLVMContext &Context) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC); -} - -SDValue -ARM64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Copy the result values into the output registers. - SDValue Flag; - SmallVector RetOps(1, Chain); - for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); - ++i, ++realRVLocIdx) { - CCValAssign &VA = RVLocs[i]; - assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[realRVLocIdx]; - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - } - - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); - } - - RetOps[0] = Chain; // Update chain. - - // Add the flag if we have it. - if (Flag.getNode()) - RetOps.push_back(Flag); - - return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, RetOps); -} - -//===----------------------------------------------------------------------===// -// Other Lowering Code -//===----------------------------------------------------------------------===// - -SDValue ARM64TargetLowering::LowerGlobalAddress(SDValue Op, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = cast(Op)->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - - assert(cast(Op)->getOffset() == 0 && - "unexpected offset in global node"); - - // This also catched the large code model case for Darwin. - if ((OpFlags & ARM64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. 
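A rough sketch of the two paths of LowerGlobalAddress (hedged; the exact choice depends on code model and PIC level): GOT-flagged references take the LOADgot path here, while locally defined globals use the ADRP/ADD pair built just below.

    int counter;                  // typically ADRP + ADD :lo12: in the small model
    extern int imported_counter;  // may be flagged MO_GOT and loaded via ADRP + LDR

    int *local_addr(void)  { return &counter; }
    int *extern_addr(void) { return &imported_counter; }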
- return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr); - } - - if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | ARM64II::MO_PAGE); - unsigned char LoFlags = OpFlags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -/// \brief Convert a TLS address reference into the correct sequence of loads -/// and calls to compute the variable's address (for Darwin, currently) and -/// return an SDValue containing the final node. - -/// Darwin only has one TLS scheme which must be capable of dealing with the -/// fully general situation, in the worst case. This means: -/// + "extern __thread" declaration. -/// + Defined in a possibly unknown dynamic library. -/// -/// The general system is that each __thread variable has a [3 x i64] descriptor -/// which contains information used by the runtime to calculate the address. The -/// only part of this the compiler needs to know about is the first xword, which -/// contains a function pointer that must be called with the address of the -/// entire descriptor in "x0". -/// -/// Since this descriptor may be in a different unit, in general even the -/// descriptor must be accessed via an indirect load. The "ideal" code sequence -/// is: -/// adrp x0, _var@TLVPPAGE -/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor -/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, -/// ; the function pointer -/// blr x1 ; Uses descriptor address in x0 -/// ; Address of _var is now in x0. -/// -/// If the address of _var's descriptor *is* known to the linker, then it can -/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for -/// a slight efficiency gain. -SDValue -ARM64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); - - SDLoc DL(Op); - MVT PtrVT = getPointerTy(); - const GlobalValue *GV = cast(Op)->getGlobal(); - - SDValue TLVPAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - SDValue DescAddr = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TLVPAddr); - - // The first entry in the descriptor is a function pointer that we must call - // to obtain the address of the variable. - SDValue Chain = DAG.getEntryNode(); - SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); - Chain = FuncTLVGet.getValue(1); - - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setAdjustsStack(true); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). 
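A source-level example of what reaches this Darwin TLS lowering (illustrative, using the GNU __thread extension): every access goes through the descriptor call sequence shown in the comment above.

    static __thread long tls_counter;

    long bump(void) {
      // adrp/ldr the descriptor, then blr through its first entry with the
      // descriptor address in x0, as documented above.
      return ++tls_counter;
    }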
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // Finally, we can make the call. This is just a degenerate version of a - // normal ARM64 call node: x0 takes the address of the descriptor, and returns - // the address of the variable in this thread. - Chain = DAG.getCopyToReg(Chain, DL, ARM64::X0, DescAddr, SDValue()); - Chain = DAG.getNode(ARM64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), - Chain, FuncTLVGet, DAG.getRegister(ARM64::X0, MVT::i64), - DAG.getRegisterMask(Mask), Chain.getValue(1)); - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Chain.getValue(1)); -} - -/// When accessing thread-local variables under either the general-dynamic or -/// local-dynamic system, we make a "TLS-descriptor" call. The variable will -/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry -/// is a function pointer to carry out the resolution. This function takes the -/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All -/// other registers (except LR, NZCV) are preserved. -/// -/// Thus, the ideal call sequence on AArch64 is: -/// -/// adrp x0, :tlsdesc:thread_var -/// ldr x8, [x0, :tlsdesc_lo12:thread_var] -/// add x0, x0, :tlsdesc_lo12:thread_var -/// .tlsdesccall thread_var -/// blr x8 -/// (TPIDR_EL0 offset now in x0). -/// -/// The ".tlsdesccall" directive instructs the assembler to insert a particular -/// relocation to help the linker relax this sequence if it turns out to be too -/// conservative. -/// -/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this -/// is harmless. -SDValue ARM64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, - SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - - // The function we need to call is simply the first entry in the GOT for this - // descriptor, load it in preparation. - SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // The function takes only one argument: the address of the descriptor itself - // in X0. 
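For reference, source constructs that exercise the ELF TLS models handled in LowerELFGlobalTLSAddress below (a sketch; which model is chosen depends on -fPIC and on whether the definition is known to be local):

    __thread long tls_local;            // local-exec / local-dynamic candidates
    extern __thread long tls_imported;  // initial-exec / general-dynamic candidates

    long read_tls(void) { return tls_local + tls_imported; }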
- SDValue Glue, Chain; - Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM64::X0, DescAddr, Glue); - Glue = Chain.getValue(1); - - // We're now ready to populate the argument list, as with a normal call: - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(Func); - Ops.push_back(SymAddr); - Ops.push_back(DAG.getRegister(ARM64::X0, PtrVT)); - Ops.push_back(DAG.getRegisterMask(Mask)); - Ops.push_back(Glue); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, Ops); - Glue = Chain.getValue(1); - - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue); -} - -SDValue ARM64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "ELF TLS only supported in small memory model"); - const GlobalAddressSDNode *GA = cast(Op); - - TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); - - SDValue TPOff; - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = GA->getGlobal(); - - SDValue ThreadBase = DAG.getNode(ARM64ISD::THREAD_POINTER, DL, PtrVT); - - if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - } else if (Model == TLSModel::InitialExec) { - TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - TPOff = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TPOff); - } else if (Model == TLSModel::LocalDynamic) { - // Local-dynamic accesses proceed in two phases. A general-dynamic TLS - // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate - // the beginning of the module's TLS region, followed by a DTPREL offset - // calculation. - - // These accesses will need deduplicating if there's more than one. - ARM64FunctionInfo *MFI = - DAG.getMachineFunction().getInfo(); - MFI->incNumLocalDynamicTLSAccesses(); - - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS); - - // Now we can calculate the offset from TPIDR_EL0 to this module's - // thread-local area. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - - // Now use :dtprel_whatever: operations to calculate this variable's offset - // in its thread-storage area. 
- SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - SDValue DTPOff = - SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - DTPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, DTPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - - TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - - // Finally we can make a call to calculate the offset from tpidr_el0. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - } else - llvm_unreachable("Unsupported ELF TLS access model"); - - return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); -} - -SDValue ARM64TargetLowering::LowerGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - if (Subtarget->isTargetDarwin()) - return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) - return LowerELFGlobalTLSAddress(Op, DAG); - - llvm_unreachable("Unexpected platform trying to use TLS"); -} -SDValue ARM64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - SDLoc dl(Op); - - // Handle f128 first, since lowering it will result in comparing the return - // value of a libcall against zero, which is just what the rest of LowerBR_CC - // is expecting to deal with. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch - // instruction. - unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa(RHS) && - cast(RHS)->isOne() && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Unexpected condition code."); - // Only lower legal XALUO ops. - if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) - return SDValue(); - - // The actual operation with overflow check. 
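An example of the overflow-into-branch pattern being matched here (assuming the clang/GCC __builtin_add_overflow builtin, which is lowered through the *.with.overflow intrinsics): the overflow bit feeds the branch directly, so it can be tested straight off the flags instead of being materialized first.

    long checked_add(long a, long b) {
      long r;
      if (__builtin_add_overflow(a, b, &r))  // overflow result used only by the branch
        return -1;
      return r;
    }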
- ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, LHS.getValue(0), DAG); - - if (CC == ISD::SETNE) - OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); - } - - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - // If the RHS of the comparison is zero, we can potentially fold this - // to a specialized branch. - const ConstantSDNode *RHSC = dyn_cast(RHS); - if (RHSC && RHSC->getZExtValue() == 0) { - if (CC == ISD::SETEQ) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); - } else if (CC == ISD::SETNE) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBNZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBNZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, - Cmp); - } - - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. 
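A sketch of the single-bit test folded above, plus the floating-point caveat just mentioned: an AND with a power of two compared against zero can branch with tbz/tbnz, while some FP predicates need the two conditional branches emitted below.

    void work(void);

    void bit_test(unsigned long x) {
      if (x & (1UL << 5))   // candidate for a single tbnz on bit 5
        work();
    }

    void fp_branch(double a, double b) {
      if (a < b)            // usually one b.<cc>; predicates such as
        work();             // "ordered and not equal" may need two branches
    }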
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue BR1 = - DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, - Cmp); - } - - return BR1; -} - -SDValue ARM64TargetLowering::LowerFCOPYSIGN(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue In1 = Op.getOperand(0); - SDValue In2 = Op.getOperand(1); - EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } - - EVT VecVT; - EVT EltVT; - SDValue EltMask, VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = MVT::v4i32; - EltMask = DAG.getConstant(0x80000000ULL, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; - VecVT = MVT::v2i64; - - // We want to materialize a mask with the the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = DAG.getConstant(0, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else { - llvm_unreachable("Invalid type for copysign!"); - } - - std::vector BuildVectorOps; - for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) - BuildVectorOps.push_back(EltMask); - - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); - - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. 
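A minimal source example served by this FCOPYSIGN lowering (assuming std::copysign lowers to the copysign intrinsic, which it typically does): the sign bit of the second operand is merged into the first through the vector bit-select built above.

    #include <cmath>

    double with_sign(double magnitude, double sign) {
      return std::copysign(magnitude, sign);  // BIT against a sign-bit mask
    }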
- if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); - } - - SDValue Sel = - DAG.getNode(ARM64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - - if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(ARM64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); -} - -SDValue ARM64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) - return SDValue(); - - // While there is no integer popcount instruction, it can - // be more efficiently lowered to the following sequence that uses - // AdvSIMD registers/instructions as long as the copies to/from - // the AdvSIMD registers are cheap. - // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd - // CNT V0.8B, V0.8B // 8xbyte pop-counts - // ADDV B0, V0.8B // sum 8xbyte pop-counts - // UMOV X0, V0.B[0] // copy byte result back to integer reg - SDValue Val = Op.getOperand(0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = - DAG.getTargetInsertSubreg(ARM64::ssub, DL, MVT::v8i8, ZeroVec, VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } - - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); - SDValue UaddLV = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - DAG.getConstant(Intrinsic::arm64_neon_uaddlv, MVT::i32), CtPop); - - if (VT == MVT::i64) - UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); - return UaddLV; -} - -SDValue ARM64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) - return LowerVSETCC(Op, DAG); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDLoc dl(Op); - - // We chose ZeroOrOneBooleanContents, so use zero and one. - EVT VT = Op.getValueType(); - SDValue TVal = DAG.getConstant(1, VT); - SDValue FVal = DAG.getConstant(0, VT); - - // Handle f128 first, since one possible outcome is a normal integer - // comparison which gets picked up by the next if statement. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, use it. - if (!RHS.getNode()) { - assert(LHS.getValueType() == Op.getValueType() && - "Unexpected setcc expansion!"); - return LHS; - } - } - - if (LHS.getValueType().isInteger()) { - SDValue CCVal; - SDValue Cmp = - getARM64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. 
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - if (CC2 == ARM64CC::AL) { - changeFPCCToARM64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); - } else { - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. As is in this case, - // we emit the first CSEL and then emit a second using the output of the - // first as the RHS. We're effectively OR'ing the two CC's together. - - // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } -} - -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return true; - - ConstantFPSDNode *CCmp = dyn_cast(Cmp); - ConstantFPSDNode *CResult = dyn_cast(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - -SDValue ARM64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue CC = Op->getOperand(0); - SDValue TVal = Op->getOperand(1); - SDValue FVal = Op->getOperand(2); - SDLoc DL(Op); - - unsigned Opc = CC.getOpcode(); - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select - // instruction. - if (CC.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - // Only lower legal XALUO ops. 
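A small example of the setcc-to-CSINC trick noted above (the condition is inverted and the 0/1 operands swapped so the whole compare collapses to cmp + cset):

    int less_than(long a, long b) {
      return a < b;   // cmp x0, x1 ; cset w0, lt  (cset is a CSINC alias)
    }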
- if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) - return SDValue(); - - ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, CC.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, - Overflow); - } - - if (CC.getOpcode() == ISD::SETCC) - return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, - cast(CC.getOperand(2))->get()); - else - return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, - FVal, ISD::SETNE); -} - -SDValue ARM64TargetLowering::LowerSELECT_CC(SDValue Op, - SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(4))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue TVal = Op.getOperand(2); - SDValue FVal = Op.getOperand(3); - SDLoc dl(Op); - - // Handle f128 first, because it will result in a comparison of some RTLIB - // call result against zero. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Handle integers first. - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - unsigned Opcode = ARM64ISD::CSEL; - - // If both the TVal and the FVal are constants, see if we can swap them in - // order to for a CSINV or CSINC out of them. - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (TVal.getOpcode() == ISD::XOR) { - // If TVal is a NOT we want to swap TVal and FVal so that we can match - // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (TVal.getOpcode() == ISD::SUB) { - // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so - // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (CTVal && CFVal) { - const int64_t TrueVal = CTVal->getSExtValue(); - const int64_t FalseVal = CFVal->getSExtValue(); - bool Swap = false; - - // If both TVal and FVal are constants, see if FVal is the - // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC - // instead of a CSEL in that case. - if (TrueVal == ~FalseVal) { - Opcode = ARM64ISD::CSINV; - } else if (TrueVal == -FalseVal) { - Opcode = ARM64ISD::CSNEG; - } else if (TVal.getValueType() == MVT::i32) { - // If our operands are only 32-bit wide, make sure we use 32-bit - // arithmetic for the check whether we can use CSINC. 
This ensures that - // the addition in the check will wrap around properly in case there is - // an overflow (which would not be the case if we do the check with - // 64-bit arithmetic). - const uint32_t TrueVal32 = CTVal->getZExtValue(); - const uint32_t FalseVal32 = CFVal->getZExtValue(); - - if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal32 > FalseVal32) { - Swap = true; - } - } - // 64-bit check whether we can use CSINC. - } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal > FalseVal) { - Swap = true; - } - } - - // Swap TVal and FVal if necessary. - if (Swap) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - if (Opcode != ARM64ISD::CSEL) { - // Drop FVal since we can get its value by simply inverting/negating - // TVal. - FVal = TVal; - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - EVT VT = Op.getValueType(); - return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - assert(LHS.getValueType() == RHS.getValueType()); - EVT VT = Op.getValueType(); - - // Try to match this select into a max/min operation, which have dedicated - // opcode in the instruction set. - // FIXME: This is not correct in the presence of NaNs, so we only enable this - // in no-NaNs mode. - if (getTargetMachine().Options.NoNaNsFPMath) { - SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(MinMaxLHS, MinMaxRHS); - } - - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { - switch (CC) { - default: - break; - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETOGT: - case ISD::SETOGE: - return DAG.getNode(ARM64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); - break; - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETOLT: - case ISD::SETOLE: - return DAG.getNode(ARM64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); - break; - } - } - } - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - // If we need a second CSEL, emit it, using the output of the first as the - // RHS. We're effectively OR'ing the two CC's together. - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } - - // Otherwise, return the output of the first CSEL. - return CS1; -} - -SDValue ARM64TargetLowering::LowerJumpTable(SDValue Op, - SelectionDAG &DAG) const { - // Jump table entries as PC relative offsets. No additional tweaking - // is necessary here. Just get the address of the jump table. 
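An illustrative switch that is dense enough to plausibly become a jump table (formation is heuristic); the table's address is then formed with the ADRP/ADDlow pair built just below:

    void a(void); void b(void); void c(void); void d(void); void e(void);

    void dispatch(int x) {
      switch (x) {   // sketch only
      case 0: a(); break;
      case 1: b(); break;
      case 2: c(); break;
      case 3: d(); break;
      case 4: e(); break;
      }
    }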
- JumpTableSDNode *JT = cast(Op); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - - if (getTargetMachine().getCodeModel() == CodeModel::Large && - !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G2 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G1 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G0 | MO_NC)); - } - - SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); -} - -SDValue ARM64TargetLowering::LowerConstantPool(SDValue Op, - SelectionDAG &DAG) const { - ConstantPoolSDNode *CP = cast(Op); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - - if (getTargetMachine().getCodeModel() == CodeModel::Large) { - // Use the GOT for the large code model on iOS. - if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - ARM64II::MO_GOT); - return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr); - } - - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G0 | MO_NC)); - } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. 
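A typical constant-pool customer for the lowering below (hedged; some literals are instead built with integer moves): an FP literal with no encodable fmov immediate is loaded from the pool through the same ADRP-based addressing.

    double pi(void) {
      return 3.141592653589793;  // usually adrp + ldr d0, [xN, :lo12:.LCPI...]
    }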
- SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerBlockAddress(SDValue Op, - SelectionDAG &DAG) const { - const BlockAddress *BA = cast(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - if (getTargetMachine().getCodeModel() == CodeModel::Large && - !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerDarwin_VASTART(SDValue Op, - SelectionDAG &DAG) const { - ARM64FunctionInfo *FuncInfo = - DAG.getMachineFunction().getInfo(); - - SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - const Value *SV = cast(Op.getOperand(2))->getValue(); - return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); -} - -SDValue ARM64TargetLowering::LowerAAPCS_VASTART(SDValue Op, - SelectionDAG &DAG) const { - // The layout of the va_list struct is specified in the AArch64 Procedure Call - // Standard, section B.3. 
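For reference, the 32-byte va_list this function populates, as laid out in AAPCS64 section B.3 (field names follow the ABI document; the offsets match the 0/8/16/24/28 stores below):

    typedef struct {
      void *__stack;    // offset 0:  next stacked (overflow) argument
      void *__gr_top;   // offset 8:  end of the saved general-purpose registers
      void *__vr_top;   // offset 16: end of the saved FP/SIMD registers
      int   __gr_offs;  // offset 24: negative offset to the next GP register slot
      int   __vr_offs;  // offset 28: negative offset to the next FP/SIMD slot
    } aapcs64_va_list;  // Darwin instead uses a single pointer (see LowerVACOPY)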
- MachineFunction &MF = DAG.getMachineFunction(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - SDLoc DL(Op); - - SDValue Chain = Op.getOperand(0); - SDValue VAList = Op.getOperand(1); - const Value *SV = cast(Op.getOperand(2))->getValue(); - SmallVector MemOps; - - // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); - - // void *__gr_top at offset 8 - int GPRSize = FuncInfo->getVarArgsGPRSize(); - if (GPRSize > 0) { - SDValue GRTop, GRTopAddr; - - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, getPointerTy())); - - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); - } - - // void *__vr_top at offset 16 - int FPRSize = FuncInfo->getVarArgsFPRSize(); - if (FPRSize > 0) { - SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, getPointerTy())); - - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); - } - - // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); - - // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); -} - -SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); -} - -SDValue ARM64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { - // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single - // pointer. - unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; - const Value *DestSV = cast(Op.getOperand(3))->getValue(); - const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), - Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), - 8, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); -} - -SDValue ARM64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && - "automatic va_arg instruction only works on Darwin"); - - const Value *V = cast(Op.getOperand(2))->getValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); - - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); - Chain = VAList.getValue(1); - - if (Align > 8) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, getPointerTy())); - } - - Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); - - // Scalar integer and FP values smaller than 64 bits are implicitly extended - // up to 64 bits. At the very least, we have to increase the striding of the - // vaargs list to match this, and for FP values we need to introduce - // FP_ROUND nodes as well. - if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; - bool NeedFPTrunc = false; - if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { - ArgSize = 8; - NeedFPTrunc = true; - } - - // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, getPointerTy())); - // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); - - // Load the actual argument out of the pointer VAList - if (NeedFPTrunc) { - // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); - // Round the value down to an f32. - SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), - DAG.getIntPtrConstant(1)); - SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; - // Merge the rounded value with the chain output of the load. - return DAG.getMergeValues(Ops, DL); - } - - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); -} - -SDValue ARM64TargetLowering::LowerFRAMEADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ARM64::FP, VT); - while (Depth--) - FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); - return FrameAddr; -} - -// FIXME? Maybe this could be a TableGen attribute on some registers and -// this table could be generated automatically from RegInfo. 
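Illustrative users of the frame- and return-address lowerings in this area (depth 0 reads FP/LR directly; deeper requests walk the saved-FP chain as in the loop above):

    void *current_frame(void)  { return __builtin_frame_address(0); }
    void *current_return(void) { return __builtin_return_address(0); }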
-unsigned ARM64TargetLowering::getRegisterByName(const char* RegName,
-                                                EVT VT) const {
-  unsigned Reg = StringSwitch<unsigned>(RegName)
-                     .Case("sp", ARM64::SP)
-                     .Default(0);
-  if (Reg)
-    return Reg;
-  report_fatal_error("Invalid register name global variable");
-}
-
-SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MFI->setReturnAddressIsTaken(true);
-
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  if (Depth) {
-    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    SDValue Offset = DAG.getConstant(8, getPointerTy());
-    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
-                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
-                       MachinePointerInfo(), false, false, false, 0);
-  }
-
-  // Return LR, which contains the return address. Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(ARM64::LR, &ARM64::GPR64RegClass);
-  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
-}
-
-/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue ARM64TargetLowering::LowerShiftRightParts(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
-  EVT VT = Op.getValueType();
-  unsigned VTBits = VT.getSizeInBits();
-  SDLoc dl(Op);
-  SDValue ShOpLo = Op.getOperand(0);
-  SDValue ShOpHi = Op.getOperand(1);
-  SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
-  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
-  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
-
-  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
-                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
-  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
-
-  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
-                               ISD::SETGE, dl, DAG);
-  SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
-
-  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-  SDValue Lo =
-      DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
-
-  // ARM64 shifts larger than the register width are wrapped rather than
-  // clamped, so we can't just emit "hi >> x".
-  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-  SDValue TrueValHi = Opc == ISD::SRA
-                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
-                                        DAG.getConstant(VTBits - 1, MVT::i64))
-                          : DAG.getConstant(0, VT);
-  SDValue Hi =
-      DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
-
-  SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, dl);
-}
-
-/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue ARM64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - - assert(Op.getOpcode() == ISD::SHL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); - - // ARM64 shifts of larger than register sizes are wrapped rather than clamped, - // so we can't just emit "lo << a" if a is too big. - SDValue TrueValLo = DAG.getConstant(0, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); -} - -bool -ARM64TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { - // The ARM64 target doesn't support folding offsets into global addresses. - return false; -} - -bool ARM64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. - // FIXME: We should be able to handle f128 as well with a clever lowering. - if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) - return true; - - if (VT == MVT::f64) - return ARM64_AM::getFP64Imm(Imm) != -1; - else if (VT == MVT::f32) - return ARM64_AM::getFP32Imm(Imm) != -1; - return false; -} - -//===----------------------------------------------------------------------===// -// ARM64 Optimization Hooks -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM64 Inline Assembly Support -//===----------------------------------------------------------------------===// - -// Table of Constraints -// TODO: This is the current set of constraints supported by ARM for the -// compiler, not all of them may make sense, e.g. S may be difficult to support. 
-// -// r - A general register -// w - An FP/SIMD register of some size in the range v0-v31 -// x - An FP/SIMD register of some size in the range v0-v15 -// I - Constant that can be used with an ADD instruction -// J - Constant that can be used with a SUB instruction -// K - Constant that can be used with a 32-bit logical instruction -// L - Constant that can be used with a 64-bit logical instruction -// M - Constant that can be used as a 32-bit MOV immediate -// N - Constant that can be used as a 64-bit MOV immediate -// Q - A memory reference with base register and no offset -// S - A symbolic address -// Y - Floating point constant zero -// Z - Integer constant zero -// -// Note that general register operands will be output using their 64-bit x -// register name, whatever the size of the variable, unless the asm operand -// is prefixed by the %w modifier. Floating-point and SIMD register operands -// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or -// %q modifier. - -/// getConstraintType - Given a constraint letter, return the type of -/// constraint it is for this target. -ARM64TargetLowering::ConstraintType -ARM64TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: - break; - case 'z': - return C_Other; - case 'x': - case 'w': - return C_RegisterClass; - // An address with a single base register. Due to the way we - // currently handle addresses it is the same as 'r'. - case 'Q': - return C_Memory; - } - } - return TargetLowering::getConstraintType(Constraint); -} - -/// Examine constraint type and operand type and determine a weight value. -/// This object must already have been set up with the operand type -/// and the current alternative constraint selected. -TargetLowering::ConstraintWeight -ARM64TargetLowering::getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const { - ConstraintWeight weight = CW_Invalid; - Value *CallOperandVal = info.CallOperandVal; - // If we don't have a value, we can't do a match, - // but allow it at the lowest weight. - if (!CallOperandVal) - return CW_Default; - Type *type = CallOperandVal->getType(); - // Look at the constraint type. - switch (*constraint) { - default: - weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break; - case 'x': - case 'w': - if (type->isFloatingPointTy() || type->isVectorTy()) - weight = CW_Register; - break; - case 'z': - weight = CW_Constant; - break; - } - return weight; -} - -std::pair -ARM64TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'r': - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::GPR64commonRegClass); - return std::make_pair(0U, &ARM64::GPR32commonRegClass); - case 'w': - if (VT == MVT::f32) - return std::make_pair(0U, &ARM64::FPR32RegClass); - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::FPR64RegClass); - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128RegClass); - break; - // The instructions that this constraint is designed for can - // only take 128-bit registers so just use that regclass. 
- case 'x': - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128_loRegClass); - break; - } - } - if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM64::NZCV), &ARM64::CCRRegClass); - - // Use the default implementation in TargetLowering to convert the register - // constraint into a member of a register class. - std::pair Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); - - // Not found as a standard register? - if (!Res.second) { - unsigned Size = Constraint.size(); - if ((Size == 4 || Size == 5) && Constraint[0] == '{' && - tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. - // By default we'll emit v0-v31 for this unless there's a modifier where - // we'll emit the correct register as well. - Res.first = ARM64::FPR128RegClass.getRegister(RegNo); - Res.second = &ARM64::FPR128RegClass; - } - } - } - - return Res; -} - -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. -void ARM64TargetLowering::LowerAsmOperandForConstraint( - SDValue Op, std::string &Constraint, std::vector &Ops, - SelectionDAG &DAG) const { - SDValue Result; - - // Currently only support length 1 constraints. - if (Constraint.length() != 1) - return; - - char ConstraintLetter = Constraint[0]; - switch (ConstraintLetter) { - default: - break; - - // This set of constraints deal with valid constants for various instructions. - // Validate and return a target constant for them if we can. - case 'z': { - // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast(Op); - if (!C || C->getZExtValue() != 0) - return; - - if (Op.getValueType() == MVT::i64) - Result = DAG.getRegister(ARM64::XZR, MVT::i64); - else - Result = DAG.getRegister(ARM64::WZR, MVT::i32); - break; - } - - case 'I': - case 'J': - case 'K': - case 'L': - case 'M': - case 'N': - ConstantSDNode *C = dyn_cast(Op); - if (!C) - return; - - // Grab the value and do some validation. - uint64_t CVal = C->getZExtValue(); - switch (ConstraintLetter) { - // The I constraint applies only to simple ADD or SUB immediate operands: - // i.e. 0 to 4095 with optional shift by 12 - // The J constraint applies only to ADD or SUB immediates that would be - // valid when negated, i.e. if [an add pattern] were to be output as a SUB - // instruction [or vice versa], in other words -1 to -4095 with optional - // left shift by 12. - case 'I': - if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) - break; - return; - case 'J': { - uint64_t NVal = -C->getSExtValue(); - if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) - break; - return; - } - // The K and L constraints apply *only* to logical immediates, including - // what used to be the MOVI alias for ORR (though the MOVI alias has now - // been removed and MOV should be used). So these constraints have to - // distinguish between bit patterns that are valid 32-bit or 64-bit - // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but - // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice - // versa. 
- case 'K': - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - return; - case 'L': - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - return; - // The M and N constraints are a superset of K and L respectively, for use - // with the MOV (immediate) alias. As well as the logical immediates they - // also match 32 or 64-bit immediates that can be loaded either using a - // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca - // (M) or 64-bit 0x1234000000000000 (N) etc. - // As a note some of this code is liberally stolen from the asm parser. - case 'M': { - if (!isUInt<32>(CVal)) - return; - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - if ((CVal & 0xFFFF) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - uint64_t NCVal = ~(uint32_t)CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - return; - } - case 'N': { - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - if ((CVal & 0xFFFFULL) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - if ((CVal & 0xFFFF00000000ULL) == CVal) - break; - if ((CVal & 0xFFFF000000000000ULL) == CVal) - break; - uint64_t NCVal = ~CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF00000000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF000000000000ULL) == NCVal) - break; - return; - } - default: - return; - } - - // All assembler immediates are 64-bit integers. - Result = DAG.getTargetConstant(CVal, MVT::i64); - break; - } - - if (Result.getNode()) { - Ops.push_back(Result); - return; - } - - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); -} - -//===----------------------------------------------------------------------===// -// ARM64 Advanced SIMD Support -//===----------------------------------------------------------------------===// - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), - V64Reg, DAG.getConstant(0, MVT::i32)); -} - -/// getExtFactor - Determine the adjustment factor for the position when -/// generating an "extract from vector registers" instruction. -static unsigned getExtFactor(SDValue &V) { - EVT EltType = V.getValueType().getVectorElementType(); - return EltType.getSizeInBits() / 8; -} - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - SDLoc DL(V128Reg); - - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, NarrowTy, V128Reg); -} - -// Gather data to see if the operation can be modelled as a -// shuffle in combination with VEXTs. 
-SDValue ARM64TargetLowering::ReconstructShuffle(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - SmallVector SourceVecs; - SmallVector MinElts; - SmallVector MaxElts; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { - // A shuffle can only come from building a vector from various - // elements of other vectors. - return SDValue(); - } - - // Record this extraction against the appropriate vector if possible... - SDValue SourceVec = V.getOperand(0); - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } - - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } - } - - // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) - return SDValue(); - - SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = { 0, 0 }; - - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; - continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // We can pad out the smaller vector for free, so if it's part of a - // shuffle... - ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], - DAG.getUNDEF(SourceVecs[i].getValueType())); - continue; - } - - // Don't attempt to extract subvectors from BUILD_VECTOR sources - // that expand or trunc the original value. - // TODO: We can try to bitcast and ANY_EXTEND the result but - // we need to consider the cost of vector ANY_EXTEND, and the - // legality of all the types. - if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) - return SDValue(); - - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... 
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && - "unexpected vector sizes in ReconstructShuffle"); - - if (MaxElts[i] - MinElts[i] >= NumElts) { - // Span too large for a VEXT to cope - return SDValue(); - } - - if (MinElts[i] >= NumElts) { - // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - } else if (MaxElts[i] < NumElts) { - // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - } else { - // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); - ShuffleSrcs[i] = DAG.getNode(ARM64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(Imm, MVT::i32)); - } - } - - SmallVector Mask; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); - continue; - } - - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = - cast(Op.getOperand(i).getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } - } - - // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); - - return SDValue(); -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are the same. -static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { - unsigned NumElts = VT.getVectorNumElements(); - - // Assume that the first shuffle index is not UNDEF. Fail if it is. - if (M[0] < 0) - return false; - - Imm = M[0]; - - // If this is a VEXT shuffle, the immediate value is the index of the first - // element. The other shuffle indices must be the successive elements after - // the first one. - unsigned ExpectedElt = Imm; - for (unsigned i = 1; i < NumElts; ++i) { - // Increment the expected index. If it wraps around, just follow it - // back to index zero and keep going. - ++ExpectedElt; - if (ExpectedElt == NumElts) - ExpectedElt = 0; - - if (M[i] < 0) - continue; // ignore UNDEF indices - if (ExpectedElt != static_cast(M[i])) - return false; - } - - return true; -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are different. -static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, - unsigned &Imm) { - // Look for the first non-undef element. - const int *FirstRealElt = std::find_if(M.begin(), M.end(), - [](int Elt) {return Elt >= 0;}); - - // Benefit form APInt to handle overflow when calculating expected element. - unsigned NumElts = VT.getVectorNumElements(); - unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); - // The following shuffle indices must be the successive elements after the - // first real element. 
- const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), - [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); - if (FirstWrongElt != M.end()) - return false; - - // The index of an EXT is the first element if it is not UNDEF. - // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. - // E.g. <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. - // <-1, -1, 0, 1, ...> is treated as . IDX is - // equal to the ExpectedElt. - Imm = (M[0] >= 0) ? static_cast(M[0]) : ExpectedElt.getZExtValue(); - - // If no beginning UNDEFs, do swap when M[0] >= NumElts. - if (M[0] >= 0 && Imm >= NumElts) { - ReverseEXT = true; - Imm -= NumElts; - } else if (M[0] < 0) { - // Only do swap when beginning UNDEFs more than the first real element, - if (*FirstRealElt < FirstRealElt - M.begin()) - ReverseEXT = true; - if (Imm >= NumElts) - Imm -= NumElts; - } - - return true; -} - -/// isREVMask - Check if a vector shuffle corresponds to a REV -/// instruction with the specified blocksize. (The order of the elements -/// within each block of the vector is reversed.) -static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); - - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); - if (EltSz == 64) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned BlockElts = M[0] + 1; - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSz; - - if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) - return false; - - for (unsigned i = 0; i < NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } - - return true; -} - -static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) - return false; - Idx += 1; - } - - return true; -} - -static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != 2 * i + WhichResult) - return false; - } - - return true; -} - -static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) - return false; - } - return true; -} - -/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. -static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) - return false; - Idx += 1; - } - - return true; -} - -/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, -static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned)MIdx != Idx) - return false; - Idx += 2; - } - } - - return true; -} - -/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. -static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) - return false; - } - return true; -} - -static bool isINSMask(ArrayRef M, int NumInputElements, - bool &DstIsLeft, int &Anomaly) { - if (M.size() != static_cast(NumInputElements)) - return false; - - int NumLHSMatch = 0, NumRHSMatch = 0; - int LastLHSMismatch = -1, LastRHSMismatch = -1; - - for (int i = 0; i < NumInputElements; ++i) { - if (M[i] == -1) { - ++NumLHSMatch; - ++NumRHSMatch; - continue; - } - - if (M[i] == i) - ++NumLHSMatch; - else - LastLHSMismatch = i; - - if (M[i] == i + NumInputElements) - ++NumRHSMatch; - else - LastRHSMismatch = i; - } - - if (NumLHSMatch == NumInputElements - 1) { - DstIsLeft = true; - Anomaly = LastLHSMismatch; - return true; - } else if (NumRHSMatch == NumInputElements - 1) { - DstIsLeft = false; - Anomaly = LastRHSMismatch; - return true; - } - - return false; -} - -static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { - if (VT.getSizeInBits() != 128) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - - for (int I = 0, E = NumElts / 2; I != E; I++) { - if (Mask[I] != I) - return false; - } - - int Offset = NumElts / 2; - for (int I = NumElts / 2, E = NumElts; I != E; I++) { - if (Mask[I] != I + SplitLHS * Offset) - return false; - } - - return true; -} - -static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue V0 = Op.getOperand(0); - SDValue V1 = Op.getOperand(1); - ArrayRef Mask = cast(Op)->getMask(); - - if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || - VT.getVectorElementType() != V1.getValueType().getVectorElementType()) - return SDValue(); - - bool SplitV0 = V0.getValueType().getSizeInBits() == 128; - - if (!isConcatMask(Mask, VT, SplitV0)) - return SDValue(); - - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - if (SplitV0) { - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, - DAG.getConstant(0, MVT::i64)); - } - if (V1.getValueType().getSizeInBits() == 128) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, - DAG.getConstant(0, MVT::i64)); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, 
V1); -} - -/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit -/// the specified operations to build the shuffle. -static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, - SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { - unsigned OpNum = (PFEntry >> 26) & 0x0F; - unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); - unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); - - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - - if (OpNum == OP_COPY) { - if (LHSID == (1 * 9 + 2) * 9 + 3) - return LHS; - assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); - return RHS; - } - - SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); - EVT VT = OpLHS.getValueType(); - - switch (OpNum) { - default: - llvm_unreachable("Unknown shuffle opcode!"); - case OP_VREV: - // VREV divides the vector in half and swaps within the half. - if (VT.getVectorElementType() == MVT::i32 || - VT.getVectorElementType() == MVT::f32) - return DAG.getNode(ARM64ISD::REV64, dl, VT, OpLHS); - // vrev <4 x i16> -> REV32 - if (VT.getVectorElementType() == MVT::i16) - return DAG.getNode(ARM64ISD::REV32, dl, VT, OpLHS); - // vrev <4 x i8> -> REV16 - assert(VT.getVectorElementType() == MVT::i8); - return DAG.getNode(ARM64ISD::REV16, dl, VT, OpLHS); - case OP_VDUP0: - case OP_VDUP1: - case OP_VDUP2: - case OP_VDUP3: { - EVT EltTy = VT.getVectorElementType(); - unsigned Opcode; - if (EltTy == MVT::i8) - Opcode = ARM64ISD::DUPLANE8; - else if (EltTy == MVT::i16) - Opcode = ARM64ISD::DUPLANE16; - else if (EltTy == MVT::i32 || EltTy == MVT::f32) - Opcode = ARM64ISD::DUPLANE32; - else if (EltTy == MVT::i64 || EltTy == MVT::f64) - Opcode = ARM64ISD::DUPLANE64; - else - llvm_unreachable("Invalid vector element type?"); - - if (VT.getSizeInBits() == 64) - OpLHS = WidenVector(OpLHS, DAG); - SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); - return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); - } - case OP_VEXT1: - case OP_VEXT2: - case OP_VEXT3: { - unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); - return DAG.getNode(ARM64ISD::EXT, dl, VT, OpLHS, OpRHS, - DAG.getConstant(Imm, MVT::i32)); - } - case OP_VUZPL: - return DAG.getNode(ARM64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VUZPR: - return DAG.getNode(ARM64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPL: - return DAG.getNode(ARM64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPR: - return DAG.getNode(ARM64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNL: - return DAG.getNode(ARM64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNR: - return DAG.getNode(ARM64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - } -} - -static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, - SelectionDAG &DAG) { - // Check to see if we can use the TBL instruction. 
- SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - SDLoc DL(Op); - - EVT EltVT = Op.getValueType().getVectorElementType(); - unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - - SmallVector TBLMask; - for (int Val : ShuffleMask) { - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); - } - } - - MVT IndexVT = MVT::v8i8; - unsigned IndexLen = 8; - if (Op.getValueType().getSizeInBits() == 128) { - IndexVT = MVT::v16i8; - IndexLen = 16; - } - - SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); - SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); - - SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { - if (IndexLen == 8) - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } else { - if (IndexLen == 8) { - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } else { - // FIXME: We cannot, for the moment, emit a TBL2 instruction because we - // cannot currently represent the register constraints on the input - // table registers. - // Shuffle = DAG.getNode(ARM64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } - } - return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); -} - -static unsigned getDUPLANEOp(EVT EltType) { - if (EltType == MVT::i8) - return ARM64ISD::DUPLANE8; - if (EltType == MVT::i16) - return ARM64ISD::DUPLANE16; - if (EltType == MVT::i32 || EltType == MVT::f32) - return ARM64ISD::DUPLANE32; - if (EltType == MVT::i64 || EltType == MVT::f64) - return ARM64ISD::DUPLANE64; - - llvm_unreachable("Invalid vector element type?"); -} - -SDValue ARM64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - ShuffleVectorSDNode *SVN = cast(Op.getNode()); - - // Convert shuffles that are directly supported on NEON to target-specific - // DAG nodes, instead of keeping them as shuffles and matching them again - // during code selection. This is more efficient and avoids the possibility - // of inconsistencies between legalization and selection. - ArrayRef ShuffleMask = SVN->getMask(); - - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) - Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return DAG.getNode(ARM64ISD::DUP, dl, V1.getValueType(), - V1.getOperand(0)); - // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- - // constant. If so, we can just reference the lane's definition directly. 
- if (V1.getOpcode() == ISD::BUILD_VECTOR && - !isa(V1.getOperand(Lane))) - return DAG.getNode(ARM64ISD::DUP, dl, VT, V1.getOperand(Lane)); - - // Otherwise, duplicate from the lane of the input vector. - unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - - // SelectionDAGBuilder may have "helpfully" already extracted or conatenated - // to make a vector of the same size as this SHUFFLE. We can ignore the - // extract entirely, and canonicalise the concat using WidenVector. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - Lane += cast(V1.getOperand(1))->getZExtValue(); - V1 = V1.getOperand(0); - } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { - unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; - Lane -= Idx * VT.getVectorNumElements() / 2; - V1 = WidenVector(V1.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) - V1 = WidenVector(V1, DAG); - - return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); - } - - if (isREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARM64ISD::REV64, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARM64ISD::REV32, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARM64ISD::REV16, dl, V1.getValueType(), V1, V2); - - bool ReverseEXT = false; - unsigned Imm; - if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { - if (ReverseEXT) - std::swap(V1, V2); - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V1, - DAG.getConstant(Imm, MVT::i32)); - } - - unsigned WhichResult; - if (isZIPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isUZPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isTRNMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - - if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) - return Concat; - - bool DstIsLeft; - int Anomaly; - int NumInputElements = V1.getValueType().getVectorNumElements(); - if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { - SDValue DstVec = DstIsLeft ? 
V1 : V2; - SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); - - SDValue SrcVec = V1; - int SrcLane = ShuffleMask[Anomaly]; - if (SrcLane >= NumInputElements) { - SrcVec = V2; - SrcLane -= VT.getVectorNumElements(); - } - SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); - - EVT ScalarVT = VT.getVectorElementType(); - if (ScalarVT.getSizeInBits() < 32) - ScalarVT = MVT::i32; - - return DAG.getNode( - ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), - DstLaneV); - } - - // If the shuffle is not directly supported and it has 4 elements, use - // the PerfectShuffle-generated table to synthesize it from other shuffles. - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts == 4) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (ShuffleMask[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = ShuffleMask[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); - } - - return GenerateTBL(Op, ShuffleMask, DAG); -} - -static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, - APInt &UndefBits) { - EVT VT = BVN->getValueType(0); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; - - for (unsigned i = 0; i < NumSplats; ++i) { - CnstBits <<= SplatBitSize; - UndefBits <<= SplatBitSize; - CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); - UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); - } - - return true; - } - - return false; -} - -SDValue ARM64TargetLowering::LowerVectorAND(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(1).getNode()); - SDValue LHS = Op.getOperand(0); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We only have BIC vector immediate instruction, which is and-not. - CnstBits = ~CnstBits; - - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = ~UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate AND. -FailedModImm: - return Op; -} - -// Specialized code to quickly find if PotentialBVec is a BuildVector that -// consists of only the same constant int value, returned in reference arg -// ConstVal -static bool isAllConstantBuildVector(const SDValue &PotentialBVec, - uint64_t &ConstVal) { - BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); - if (!Bvec) - return false; - ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); - if (!FirstElt) - return false; - EVT VT = Bvec->getValueType(0); - unsigned NumElts = VT.getVectorNumElements(); - for (unsigned i = 1; i < NumElts; ++i) - if (dyn_cast(Bvec->getOperand(i)) != FirstElt) - return false; - ConstVal = FirstElt->getZExtValue(); - return true; -} - -static unsigned getIntrinsicID(const SDNode *N) { - unsigned Opcode = N->getOpcode(); - switch (Opcode) { - default: - return Intrinsic::not_intrinsic; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); - if (IID < Intrinsic::num_intrinsics) - return IID; - return Intrinsic::not_intrinsic; - } - } -} - -// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), -// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. -static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - if (!VT.isVector()) - return SDValue(); - - SDLoc DL(N); - - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) - return SDValue(); - - // Is the second op an shl or lshr? 
- SDValue Shift = N->getOperand(1); - // This will have been turned into: ARM64ISD::VSHL vector, #shift - // or ARM64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != ARM64ISD::VSHL && ShiftOpc != ARM64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == ARM64ISD::VLSHR; - - // Is the shift amount constant? - ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); - if (!C2node) - return SDValue(); - - // Is the and mask vector all constant? - uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); - - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? - uint64_t C2 = C2node->getZExtValue(); - unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); - if (C2 > ElemSizeInBits) - return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) - return SDValue(); - - SDValue X = And.getOperand(0); - SDValue Y = Shift.getOperand(0); - - unsigned Intrin = - IsShiftRight ? Intrinsic::arm64_neon_vsri : Intrinsic::arm64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); - - DEBUG(dbgs() << "arm64-lower: transformed: \n"); - DEBUG(N->dump(&DAG)); - DEBUG(dbgs() << "into: \n"); - DEBUG(ResultSLI->dump(&DAG)); - - ++NumShiftInserts; - return ResultSLI; -} - -SDValue ARM64TargetLowering::LowerVectorOR(SDValue Op, - SelectionDAG &DAG) const { - // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableARM64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) - return Res; - } - - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(0).getNode()); - SDValue LHS = Op.getOperand(1); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - // OR commutes, so try swapping the operands. - if (!BVN) { - LHS = Op.getOperand(0); - BVN = dyn_cast(Op.getOperand(1).getNode()); - } - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate OR. -FailedModImm: - return Op; -} - -SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = cast(Op.getNode()); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - // Certain magic vector constants (used to express things like NOT - // and NEG) are passed through unmodified. This allows codegen patterns - // for these operations to match. Special-purpose patterns will lower - // these immediates to MOVIs if it proves necessary. - if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) - return Op; - - // The many faces of MOVI... - if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal); - if (VT.getSizeInBits() == 128) { - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // Support the V64 version via subregister insertion. - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType9(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType9(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; - SDValue Mov = DAG.getNode(ARM64ISD::MOVI, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // The few faces of FMOV... - if (ARM64_AM::isAdvSIMDModImmType11(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType11(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; - SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType12(CnstVal) && - VT.getSizeInBits() == 128) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType12(CnstVal); - SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MVT::v2f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // The many faces of MVNI... - CnstVal = ~CnstVal; - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } -FailedModImm: - - // Scan through the operands to find some interesting properties we can - // exploit: - // 1) If only one value is used, we can use a DUP, or - // 2) if only the low element is not undef, we can just insert that, or - // 3) if only one constant value is used (w/ some non-constant lanes), - // we can splat the constant value into the whole vector then fill - // in the non-constant lanes. - // 4) FIXME: If different constant values are used, but we can intelligently - // select the values we'll be overwriting for the non-constant - // lanes such that we can directly materialize the vector - // some other way (MOVI, e.g.), we can be sneaky. 
- unsigned NumElts = VT.getVectorNumElements(); - bool isOnlyLowElement = true; - bool usesOnlyOneValue = true; - bool usesOnlyOneConstantValue = true; - bool isConstant = true; - unsigned NumConstantLanes = 0; - SDValue Value; - SDValue ConstantValue; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - if (i > 0) - isOnlyLowElement = false; - if (!isa(V) && !isa(V)) - isConstant = false; - - if (isa(V) || isa(V)) { - ++NumConstantLanes; - if (!ConstantValue.getNode()) - ConstantValue = V; - else if (ConstantValue != V) - usesOnlyOneConstantValue = false; - } - - if (!Value.getNode()) - Value = V; - else if (V != Value) - usesOnlyOneValue = false; - } - - if (!Value.getNode()) - return DAG.getUNDEF(VT); - - if (isOnlyLowElement) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); - - // Use DUP for non-constant splats. For f32 constant splats, reduce to - // i32 and try again. - if (usesOnlyOneValue) { - if (!isConstant) { - if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Value.getValueType() != VT) - return DAG.getNode(ARM64ISD::DUP, dl, VT, Value); - - // This is actually a DUPLANExx operation, which keeps everything vectory. - - // DUPLANE works on 128-bit vectors, widen it if necessary. - SDValue Lane = Value.getOperand(1); - Value = Value.getOperand(0); - if (Value.getValueType().getSizeInBits() == 64) - Value = WidenVector(Value, DAG); - - unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); - return DAG.getNode(Opcode, dl, VT, Value, Lane); - } - - if (VT.getVectorElementType().isFloatingPoint()) { - SmallVector Ops; - MVT NewType = - (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); - Val = LowerBUILD_VECTOR(Val, DAG); - if (Val.getNode()) - return DAG.getNode(ISD::BITCAST, dl, VT, Val); - } - } - - // If there was only one constant value used and for more than one lane, - // start by splatting that value, then replace the non-constant lanes. This - // is better than the default, which will perform a separate initialization - // for each lane. - if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { - SDValue Val = DAG.getNode(ARM64ISD::DUP, dl, VT, ConstantValue); - // Now insert the non-constant lanes. - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - if (!isa(V) && !isa(V)) { - // Note that type legalization likely mucked about with the VT of the - // source operand, so we may have to convert it here before inserting. - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); - } - } - return Val; - } - - // If all elements are constants and the case above didn't get hit, fall back - // to the default expansion, which will generate a load from the constant - // pool. - if (isConstant) - return SDValue(); - - // Empirical tests suggest this is rarely worth it for vectors of length <= 2. - if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) - return shuffle; - } - - // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we - // know the default expansion would otherwise fall back on something even - // worse. 
For a vector with one or two non-undef values, that's - // scalar_to_vector for the elements followed by a shuffle (provided the - // shuffle is valid for the target) and materialization element by element - // on the stack followed by a load for everything else. - if (!isConstant && !usesOnlyOneValue) { - SDValue Vec = DAG.getUNDEF(VT); - SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); - unsigned i = 0; - // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to - // a) Avoid a RMW dependency on the full vector register, and - // b) Allow the register coalescer to fold away the copy if the - // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { - unsigned SubIdx = ElemSize == 32 ? ARM64::ssub : ARM64::dsub; - MachineSDNode *N = - DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, MVT::i32)); - Vec = SDValue(N, 0); - ++i; - } - for (; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); - } - return Vec; - } - - // Just use the default expansion. We failed to find a better alternative. - return SDValue(); -} - -SDValue ARM64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); - - // Check for non-constant lane. - if (!isa(Op.getOperand(2))) - return SDValue(); - - EVT VT = Op.getOperand(0).getValueType(); - - // Insertion/extraction are legal for V128 types. - if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) - return Op; - - if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) - return SDValue(); - - // For V64 types, we perform insertion by expanding the value - // to a V128 type and perform the insertion on that. - SDLoc DL(Op); - SDValue WideVec = WidenVector(Op.getOperand(0), DAG); - EVT WideTy = WideVec.getValueType(); - - SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, - Op.getOperand(1), Op.getOperand(2)); - // Re-narrow the resultant vector. - return NarrowVector(Node, DAG); -} - -SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); - - // Check for non-constant lane. - if (!isa(Op.getOperand(1))) - return SDValue(); - - EVT VT = Op.getOperand(0).getValueType(); - - // Insertion/extraction are legal for V128 types. - if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) - return Op; - - if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) - return SDValue(); - - // For V64 types, we perform extraction by expanding the value - // to a V128 type and perform the extraction on that. - SDLoc DL(Op); - SDValue WideVec = WidenVector(Op.getOperand(0), DAG); - EVT WideTy = WideVec.getValueType(); - - EVT ExtrTy = WideTy.getVectorElementType(); - if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) - ExtrTy = MVT::i32; - - // For extractions, we just return the result directly. 
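// [Editor's sketch] Both lowerings above use the same widen/operate/narrow
// shape for 64-bit ("V64") vectors: promote to the 128-bit type, perform the
// insert or extract there, then take the low half back. A hypothetical scalar
// stand-in using arrays instead of DAG nodes, assuming a 2 x i32 narrow type:
#include <cassert>
#include <cstdint>
static void insertIntoV64(uint32_t Vec64[2], uint32_t Elt, unsigned Lane) {
  assert(Lane < 2 && "lane out of range for the narrow type");
  uint32_t Wide[4] = {Vec64[0], Vec64[1], 0, 0}; // widen: low half holds the V64 value
  Wide[Lane] = Elt;                              // do the operation on the wide type
  Vec64[0] = Wide[0];                            // narrow back to the low half
  Vec64[1] = Wide[1];
}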
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, - Op.getOperand(1)); -} - -SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getOperand(0).getValueType(); - SDLoc dl(Op); - // Just in case... - if (!VT.isVector()) - return SDValue(); - - ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); - if (!Cst) - return SDValue(); - unsigned Val = Cst->getZExtValue(); - - unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(ARM64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(ARM64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(ARM64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(ARM64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } - // If this is extracting the upper 64-bits of a 128-bit vector, we match - // that directly. - if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) - return Op; - - return SDValue(); -} - -bool ARM64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, - EVT VT) const { - if (VT.getVectorNumElements() == 4 && - (VT.is128BitVector() || VT.is64BitVector())) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (M[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = M[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return true; - } - - bool DummyBool; - int DummyInt; - unsigned DummyUnsigned; - - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || - isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || - isEXTMask(M, VT, DummyBool, DummyUnsigned) || - // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. - isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || - isZIPMask(M, VT, DummyUnsigned) || - isTRN_v_undef_Mask(M, VT, DummyUnsigned) || - isUZP_v_undef_Mask(M, VT, DummyUnsigned) || - isZIP_v_undef_Mask(M, VT, DummyUnsigned) || - isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || - isConcatMask(M, VT, VT.getSizeInBits() == 128)); -} - -/// getVShiftImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift operation, where all the elements of the -/// build_vector must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} - -/// isVShiftLImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits for a left shift; or -/// 0 <= Value <= ElementBits for a long left shift. 
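// [Editor's sketch] isShuffleMaskLegal above probes a precomputed
// perfect-shuffle table for 4-lane masks. The index is a base-9 encoding of
// the mask, with 8 standing in for an undef lane; the top bits of the table
// entry carry a cost, and a cost of at most 4 means the shuffle is considered
// legal. A plain-C++ restatement of just the index computation:
static unsigned perfectShuffleTableIndex(const int M[4]) {
  unsigned Idx = 0;
  for (unsigned i = 0; i != 4; ++i) {
    unsigned Lane = (M[i] < 0) ? 8u : (unsigned)M[i]; // negative mask entry == undef
    Idx = Idx * 9 + Lane;  // == M[0]*9^3 + M[1]*9^2 + M[2]*9 + M[3]
  }
  return Idx;
}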
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); -} - -/// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - if (isIntrinsic) - Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); -} - -SDValue ARM64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - int64_t Cnt; - - if (!Op.getOperand(1).getValueType().isVector()) - return Op; - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - - switch (Op.getOpcode()) { - default: - llvm_unreachable("unexpected shift opcode"); - - case ISD::SHL: - if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) - return DAG.getNode(ARM64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::arm64_neon_ushl, MVT::i32), - Op.getOperand(0), Op.getOperand(1)); - case ISD::SRA: - case ISD::SRL: - // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { - unsigned Opc = - (Op.getOpcode() == ISD::SRA) ? ARM64ISD::VASHR : ARM64ISD::VLSHR; - return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); - } - - // Right shift register. Note, there is not a shift right register - // instruction, but the shift left register instruction takes a signed - // value, where negative numbers specify a right shift. - unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::arm64_neon_sshl - : Intrinsic::arm64_neon_ushl; - // negate the shift amount - SDValue NegShift = DAG.getNode(ARM64ISD::NEG, DL, VT, Op.getOperand(1)); - SDValue NegShiftLeft = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); - return NegShiftLeft; - } - - return SDValue(); -} - -static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, - ARM64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { - EVT SrcVT = LHS.getValueType(); - - BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); - bool IsZero = IsCnst && (CnstBits == 0); - - if (SrcVT.getVectorElementType().isFloatingPoint()) { - switch (CC) { - default: - return SDValue(); - case ARM64CC::NE: { - SDValue Fcmeq; - if (IsZero) - Fcmeq = DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS); - else - Fcmeq = DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS); - return DAG.getNode(ARM64ISD::NOT, dl, VT, Fcmeq); - } - case ARM64CC::EQ: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS); - case ARM64CC::GE: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMGEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGE, dl, VT, LHS, RHS); - case ARM64CC::GT: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMGTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGT, dl, VT, LHS, RHS); - case ARM64CC::LS: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMLEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGE, dl, VT, RHS, LHS); - case ARM64CC::LT: - if (!NoNans) - return SDValue(); - // If we ignore NaNs then we can use to the MI implementation. - // Fallthrough. 
- case ARM64CC::MI: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMLTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGT, dl, VT, RHS, LHS); - } - } - - switch (CC) { - default: - return SDValue(); - case ARM64CC::NE: { - SDValue Cmeq; - if (IsZero) - Cmeq = DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS); - else - Cmeq = DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS); - return DAG.getNode(ARM64ISD::NOT, dl, VT, Cmeq); - } - case ARM64CC::EQ: - if (IsZero) - return DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS); - case ARM64CC::GE: - if (IsZero) - return DAG.getNode(ARM64ISD::CMGEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGE, dl, VT, LHS, RHS); - case ARM64CC::GT: - if (IsZero) - return DAG.getNode(ARM64ISD::CMGTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGT, dl, VT, LHS, RHS); - case ARM64CC::LE: - if (IsZero) - return DAG.getNode(ARM64ISD::CMLEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS); - case ARM64CC::LS: - return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS); - case ARM64CC::LO: - return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS); - case ARM64CC::LT: - if (IsZero) - return DAG.getNode(ARM64ISD::CMLTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS); - case ARM64CC::HI: - return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS); - case ARM64CC::HS: - return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS); - } -} - -SDValue ARM64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDLoc dl(Op); - - if (LHS.getValueType().getVectorElementType().isInteger()) { - assert(LHS.getValueType() == RHS.getValueType()); - ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC); - return EmitVectorComparison(LHS, RHS, ARM64CC, false, Op.getValueType(), dl, - DAG); - } - - assert(LHS.getValueType().getVectorElementType() == MVT::f32 || - LHS.getValueType().getVectorElementType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. - ARM64CC::CondCode CC1, CC2; - bool ShouldInvert; - changeVectorFPCCToARM64CC(CC, CC1, CC2, ShouldInvert); - - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; - SDValue Cmp = - EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); - if (!Cmp.getNode()) - return SDValue(); - - if (CC2 != ARM64CC::AL) { - SDValue Cmp2 = - EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); - if (!Cmp2.getNode()) - return SDValue(); - - Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); - } - - if (ShouldInvert) - return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); - - return Cmp; -} - -/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as -/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment -/// specified in the intrinsic calls. 
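// [Editor's sketch] EmitVectorComparison above leans on the fact that AdvSIMD
// only provides the "greater/higher" compare flavours (plus compare-with-zero
// forms): LT/LE/LO/LS conditions are emitted by swapping the operands, and NE
// is the bitwise NOT of EQ. A per-lane scalar illustration, names hypothetical:
#include <cstdint>
static uint32_t laneCMGT(int32_t A, int32_t B) { return A > B ? ~0u : 0u; }
static uint32_t laneCMLT(int32_t A, int32_t B) { return laneCMGT(B, A); }  // swapped operands
static uint32_t laneCMEQ(int32_t A, int32_t B) { return A == B ? ~0u : 0u; }
static uint32_t laneCMNE(int32_t A, int32_t B) { return ~laneCMEQ(A, B); } // NOT of CMEQ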
-bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, - unsigned Intrinsic) const { - switch (Intrinsic) { - case Intrinsic::arm64_neon_ld2: - case Intrinsic::arm64_neon_ld3: - case Intrinsic::arm64_neon_ld4: - case Intrinsic::arm64_neon_ld1x2: - case Intrinsic::arm64_neon_ld1x3: - case Intrinsic::arm64_neon_ld1x4: - case Intrinsic::arm64_neon_ld2lane: - case Intrinsic::arm64_neon_ld3lane: - case Intrinsic::arm64_neon_ld4lane: - case Intrinsic::arm64_neon_ld2r: - case Intrinsic::arm64_neon_ld3r: - case Intrinsic::arm64_neon_ld4r: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); - Info.offset = 0; - Info.align = 0; - Info.vol = false; // volatile loads with NEON intrinsics not supported - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_neon_st2: - case Intrinsic::arm64_neon_st3: - case Intrinsic::arm64_neon_st4: - case Intrinsic::arm64_neon_st1x2: - case Intrinsic::arm64_neon_st1x3: - case Intrinsic::arm64_neon_st1x4: - case Intrinsic::arm64_neon_st2lane: - case Intrinsic::arm64_neon_st3lane: - case Intrinsic::arm64_neon_st4lane: { - Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. - unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { - Type *ArgTy = I.getArgOperand(ArgI)->getType(); - if (!ArgTy->isVectorTy()) - break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; - } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); - Info.offset = 0; - Info.align = 0; - Info.vol = false; // volatile stores with NEON intrinsics not supported - Info.readMem = false; - Info.writeMem = true; - return true; - } - case Intrinsic::arm64_ldaxr: - case Intrinsic::arm64_ldxr: { - PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_stlxr: - case Intrinsic::arm64_stxr: { - PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; - return true; - } - case Intrinsic::arm64_ldaxp: - case Intrinsic::arm64_ldxp: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i128; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = 16; - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_stlxp: - case Intrinsic::arm64_stxp: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i128; - Info.ptrVal = I.getArgOperand(2); - Info.offset = 0; - Info.align = 16; - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; - return true; - } - default: - break; - } - - return false; -} - -// 
Truncations from 64-bit GPR to 32-bit GPR is free. -bool ARM64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) - return false; - unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); - unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; -} -bool ARM64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) - return false; - unsigned NumBits1 = VT1.getSizeInBits(); - unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; -} - -// All 32-bit GPR operations implicitly zero the high-half of the corresponding -// 64-bit GPR. -bool ARM64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) - return false; - unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); - unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; -} -bool ARM64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) - return false; - unsigned NumBits1 = VT1.getSizeInBits(); - unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; -} - -bool ARM64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { - EVT VT1 = Val.getValueType(); - if (isZExtFree(VT1, VT2)) { - return true; - } - - if (Val.getOpcode() != ISD::LOAD) - return false; - - // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. - return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && - VT2.isInteger() && VT1.getSizeInBits() <= 32); -} - -bool ARM64TargetLowering::hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - -bool ARM64TargetLowering::hasPairedLoad(EVT LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType.isSimple() || - (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType.getSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); -} - -EVT ARM64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const { - // Don't use AdvSIMD to implement 16-byte memset. It would have taken one - // instruction to materialize the v2i64 zero and one store (with restrictive - // addressing mode). Just do two i64 store of zero-registers. - bool Fast; - const Function *F = MF.getFunction(); - if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && - (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) - return MVT::f128; - - return Size >= 8 ? MVT::i64 : MVT::i32; -} - -// 12-bit optionally shifted immediates are legal for adds. 
-bool ARM64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; -} - -// Integer comparisons are implemented with ADDS/SUBS, so the range of valid -// immediates is the same as for an add or a sub. -bool ARM64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; - return isLegalAddImmediate(Immed); -} - -/// isLegalAddressingMode - Return true if the addressing mode represented -/// by AM is legal for this target, for a load/store of the specified type. -bool ARM64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { - // ARM64 has five basic addressing modes: - // reg - // reg + 9-bit signed offset - // reg + SIZE_IN_BYTES * 12-bit unsigned offset - // reg1 + reg2 - // reg + SIZE_IN_BYTES * reg - - // No global is ever allowed as a base. - if (AM.BaseGV) - return false; - - // No reg+reg+imm addressing. - if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) - return false; - - // check reg + imm case: - // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 - uint64_t NumBytes = 0; - if (Ty->isSized()) { - uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); - NumBytes = NumBits / 8; - if (!isPowerOf2_64(NumBits)) - NumBytes = 0; - } - - if (!AM.Scale) { - int64_t Offset = AM.BaseOffs; - - // 9-bit signed offset - if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) - return true; - - // 12-bit unsigned offset - unsigned shift = Log2_64(NumBytes); - if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && - // Must be a multiple of NumBytes (NumBytes is a power of 2) - (Offset >> shift) << shift == Offset) - return true; - return false; - } - - // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; -} - -int ARM64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { - // Scaling factors are not free at all. - // Operands | Rt Latency - // ------------------------------------------- - // Rt, [Xn, Xm] | 4 - // ------------------------------------------- - // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 - // Rt, [Xn, Wm, #imm] | - if (isLegalAddressingMode(AM, Ty)) - // Scale represents reg2 * scale, thus account for 1 if - // it is not equal to 0 or 1. - return AM.Scale != 0 && AM.Scale != 1; - return -1; -} - -bool ARM64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; - - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - case MVT::f64: - return true; - default: - break; - } - - return false; -} - -const MCPhysReg * -ARM64TargetLowering::getScratchRegisters(CallingConv::ID) const { - // LR is a callee-save register, but we must treat it as clobbered by any call - // site. Hence we include LR in the scratch registers, which are in turn added - // as implicit-defs for stackmaps and patchpoints. - static const MCPhysReg ScratchRegs[] = { - ARM64::X16, ARM64::X17, ARM64::LR, 0 - }; - return ScratchRegs; -} - -bool ARM64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { - EVT VT = N->getValueType(0); - // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine - // it with shift to let it be lowered to UBFX. 
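// [Editor's sketch] The reg+imm case of isLegalAddressingMode above accepts
// either a 9-bit signed byte offset (the unscaled forms) or an unsigned 12-bit
// offset scaled by the access size. A simplified standalone check, assuming
// NumBytes is the power-of-two access size (0 when unknown):
#include <cstdint>
static bool isLegalRegImmOffset(int64_t Offset, uint64_t NumBytes) {
  if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
    return true;                                         // 9-bit signed, unscaled
  if (NumBytes == 0 || Offset <= 0 || Offset % (int64_t)NumBytes != 0)
    return false;
  return Offset / (int64_t)NumBytes <= (1LL << 12) - 1;  // scaled uimm12
}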
- if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && - isa(N->getOperand(1))) { - uint64_t TruncMask = N->getConstantOperandVal(1); - if (isMask_64(TruncMask) && - N->getOperand(0).getOpcode() == ISD::SRL && - isa(N->getOperand(0)->getOperand(1))) - return false; - } - return true; -} - -bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return false; - - int64_t Val = Imm.getSExtValue(); - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize)) - return true; - - if ((int64_t)Val < 0) - Val = ~Val; - if (BitSize == 32) - Val &= (1LL << 32) - 1; - - unsigned LZ = countLeadingZeros((uint64_t)Val); - unsigned Shift = (63 - LZ) / 16; - // MOVZ is free so return true for one or fewer MOVK. - return (Shift < 3) ? true : false; -} - -// Generate SUBS and CSEL for integer abs. -static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - - // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) - // and change it to SUB and CSEL. - if (VT.isInteger() && N->getOpcode() == ISD::XOR && - N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && - N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) - if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) - if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), - N0.getOperand(0)); - // Generate SUBS & CSEL. - SDValue Cmp = - DAG.getNode(ARM64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - N0.getOperand(0), DAG.getConstant(0, VT)); - return DAG.getNode(ARM64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, - DAG.getConstant(ARM64CC::PL, MVT::i32), - SDValue(Cmp.getNode(), 1)); - } - return SDValue(); -} - -// performXorCombine - Attempts to handle integer ABS. -static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - return performIntegerAbsCombine(N, DAG); -} - -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Multiplication of a power of two plus/minus one can be done more - // cheaply as as shift+add/sub. For now, this is true unilaterally. If - // future CPUs have a cheaper MADD instruction, this may need to be - // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and - // 64-bit is 5 cycles, so this is always a win. - if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); - EVT VT = N->getValueType(0); - APInt VP1 = Value + 1; - if (VP1.isPowerOf2()) { - // Multiplying by one less than a power of two, replace with a shift - // and a subtract. - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - APInt VM1 = Value - 1; - if (VM1.isPowerOf2()) { - // Multiplying by one more than a power of two, replace with a shift - // and an add. 
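// [Editor's sketch] performIntegerAbsCombine above recognizes the classic
// branchless abs, XOR(ADD(x, y), y) with y = x >> (width - 1), and rewrites it
// as SUBS + CSEL (select the negation when the value is negative). The scalar
// pattern being matched, assuming arithmetic right shift and ignoring INT_MIN:
#include <cstdint>
static int32_t absPattern(int32_t X) {
  int32_t Sign = X >> 31;        // 0 for non-negative, -1 for negative
  return (X + Sign) ^ Sign;      // equivalent to X < 0 ? -X : X, i.e. the CSEL form
}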
- SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - } - return SDValue(); -} - -static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - if (VT != MVT::f32 && VT != MVT::f64) - return SDValue(); - // Only optimize when the source and destination types have the same width. - if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) - return SDValue(); - - // If the result of an integer load is only used by an integer-to-float - // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. - SDValue N0 = N->getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - // Do not change the width of a volatile load. - !cast(N0)->isVolatile()) { - LoadSDNode *LN0 = cast(N0); - SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); - - // Make sure successors of the original load stay after it by updating them - // to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); - - unsigned Opcode = - (N->getOpcode() == ISD::SINT_TO_FP) ? ARM64ISD::SITOF : ARM64ISD::UITOF; - return DAG.getNode(Opcode, SDLoc(N), VT, Load); - } - - return SDValue(); -} - -/// An EXTR instruction is made up of two shifts, ORed together. This helper -/// searches for and classifies those shifts. -static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, - bool &FromHi) { - if (N.getOpcode() == ISD::SHL) - FromHi = false; - else if (N.getOpcode() == ISD::SRL) - FromHi = true; - else - return false; - - if (!isa(N.getOperand(1))) - return false; - - ShiftAmount = N->getConstantOperandVal(1); - Src = N->getOperand(0); - return true; -} - -/// EXTR instruction extracts a contiguous chunk of bits from two existing -/// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. -static SDValue tryCombineToEXTR(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - assert(N->getOpcode() == ISD::OR && "Unexpected root"); - - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - SDValue LHS; - uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) - return SDValue(); - - SDValue RHS; - uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) - return SDValue(); - - // If they're both trying to come from the high part of the register, they're - // not really an EXTR. 
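// [Editor's sketch] performMulCombine above strength-reduces a multiply by a
// constant that is one away from a power of two into a shift plus an add or
// subtract, which the comment notes is a win over MADD on the targeted core:
#include <cstdint>
static uint64_t mulByPow2Minus1(uint64_t X, unsigned K) {
  return (X << K) - X;           // X * (2^K - 1)
}
static uint64_t mulByPow2Plus1(uint64_t X, unsigned K) {
  return (X << K) + X;           // X * (2^K + 1)
}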
- if (LHSFromHi == RHSFromHi) - return SDValue(); - - if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) - return SDValue(); - - if (LHSFromHi) { - std::swap(LHS, RHS); - std::swap(ShiftLHS, ShiftRHS); - } - - return DAG.getNode(ARM64ISD::EXTR, DL, VT, LHS, RHS, - DAG.getConstant(ShiftRHS, MVT::i64)); -} - -static SDValue tryCombineToBSL(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - if (!VT.isVector()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getVectorElementType().getSizeInBits(); - uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - - if (FoundMatch) - return DAG.getNode(ARM64ISD::BSL, DL, VT, SDValue(BVN0, 0), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - -static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableARM64ExtrGeneration) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) - return Res; - - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) - return Res; - - return SDValue(); -} - -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. - // For example, - // (v4i16 (bitconvert - // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) - // becomes - // (extract_subvector ((v8i16 ...), (i64 4))) - - // Only interested in 64-bit vectors as the ultimate result. - EVT VT = N->getValueType(0); - if (!VT.isVector()) - return SDValue(); - if (VT.getSimpleVT().getSizeInBits() != 64) - return SDValue(); - // Is the operand an extract_subvector starting at the beginning or halfway - // point of the vector? A low half may also come through as an - // EXTRACT_SUBREG, so look for that, too. 
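// [Editor's sketch] Scalar models of the two patterns combined above. EXTR
// extracts a contiguous bit field across a high/low register pair, which is
// what (or (shl hi, #N), (srl lo, #Width-N)) computes; BSL is a bitwise
// select, which is what two ANDs with complementary constant masks, OR'd
// together, amount to.
#include <cassert>
#include <cstdint>
static uint32_t extrLike(uint32_t Hi, uint32_t Lo, unsigned N) {
  assert(N > 0 && N < 32);
  return (Hi << N) | (Lo >> (32 - N));
}
static uint64_t bslLike(uint64_t Mask, uint64_t A, uint64_t B) {
  return (A & Mask) | (B & ~Mask);   // take A's bits where Mask is set, else B's
}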
- SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && - !(Op0->isMachineOpcode() && - Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG)) - return SDValue(); - uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); - if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { - if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) - return SDValue(); - } else if (Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG) { - if (idx != ARM64::dsub) - return SDValue(); - // The dsub reference is equivalent to a lane zero subvector reference. - idx = 0; - } - // Look through the bitcast of the input to the extract. - if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue Source = Op0->getOperand(0)->getOperand(0); - // If the source type has twice the number of elements as our destination - // type, we know this is an extract of the high or low half of the vector. - EVT SVT = Source->getValueType(0); - if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) - return SDValue(); - - DEBUG(dbgs() << "arm64-lower: bitcast extract_subvector simplification\n"); - - // Create the simplified form to just extract the low or high half of the - // vector directly rather than bothering with the bitcasts. - SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(ARM64::dsub, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - -static SDValue performConcatVectorsCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDLoc dl(N); - EVT VT = N->getValueType(0); - - // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector - // splat. The indexed instructions are going to be expecting a DUPLANE64, so - // canonicalise to that. - if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { - assert(VT.getVectorElementType().getSizeInBits() == 64); - return DAG.getNode(ARM64ISD::DUPLANE64, dl, VT, - WidenVector(N->getOperand(0), DAG), - DAG.getConstant(0, MVT::i64)); - } - - // Canonicalise concat_vectors so that the right-hand vector has as few - // bit-casts as possible before its real operation. The primary matching - // destination for these operations will be the narrowing "2" instructions, - // which depend on the operation being performed on this right-hand vector. - // For example, - // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) - // becomes - // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - - SDValue Op1 = N->getOperand(1); - if (Op1->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue RHS = Op1->getOperand(0); - MVT RHSTy = RHS.getValueType().getSimpleVT(); - // If the RHS is not a vector, this is not the pattern we're looking for. 
- if (!RHSTy.isVector()) - return SDValue(); - - DEBUG(dbgs() << "arm64-lower: concat_vectors bitcast simplification\n"); - - MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), - RHSTy.getVectorNumElements() * 2); - return DAG.getNode( - ISD::BITCAST, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, - DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); -} - -static SDValue tryCombineFixedPointConvert(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - // Transform a scalar conversion of a value from a lane extract into a - // lane extract of a vector conversion. E.g., from foo1 to foo2: - // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } - // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } - // - // The second form interacts better with instruction selection and the - // register allocator to avoid cross-class register copies that aren't - // coalescable due to a lane reference. - - // Check the operand and see if it originates from a lane extract. - SDValue Op1 = N->getOperand(1); - if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - // Yep, no additional predication needed. Perform the transform. - SDValue IID = N->getOperand(0); - SDValue Shift = N->getOperand(2); - SDValue Vec = Op1.getOperand(0); - SDValue Lane = Op1.getOperand(1); - EVT ResTy = N->getValueType(0); - EVT VecResTy; - SDLoc DL(N); - - // The vector width should be 128 bits by the time we get here, even - // if it started as 64 bits (the extract_vector handling will have - // done so). - assert(Vec.getValueType().getSizeInBits() == 128 && - "unexpected vector size on extract_vector_elt!"); - if (Vec.getValueType() == MVT::v4i32) - VecResTy = MVT::v4f32; - else if (Vec.getValueType() == MVT::v2i64) - VecResTy = MVT::v2f64; - else - assert(0 && "unexpected vector type!"); - - SDValue Convert = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); - } - return SDValue(); -} - -// AArch64 high-vector "long" operations are formed by performing the non-high -// version on an extract_subvector of each operand which gets the high half: -// -// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) -// -// However, there are cases which don't have an extract_high explicitly, but -// have another operation that can be made compatible with one for free. For -// example: -// -// (dupv64 scalar) --> (extract_high (dup128 scalar)) -// -// This routine does the actual conversion of such DUPs, once outer routines -// have determined that everything else is in order. -static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { - // We can handle most types of duplicate, but the lane ones have an extra - // operand saying *which* lane, so we need to know. 
- bool IsDUPLANE; - switch (N.getOpcode()) { - case ARM64ISD::DUP: - IsDUPLANE = false; - break; - case ARM64ISD::DUPLANE8: - case ARM64ISD::DUPLANE16: - case ARM64ISD::DUPLANE32: - case ARM64ISD::DUPLANE64: - IsDUPLANE = true; - break; - default: - return SDValue(); - } - - MVT NarrowTy = N.getSimpleValueType(); - if (!NarrowTy.is64BitVector()) - return SDValue(); - - MVT ElementTy = NarrowTy.getVectorElementType(); - unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); - - SDValue NewDUP; - if (IsDUPLANE) - NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), - N.getOperand(1)); - else - NewDUP = DAG.getNode(ARM64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, - NewDUP, DAG.getConstant(NumElems, MVT::i64)); -} - -static bool isEssentiallyExtractSubvector(SDValue N) { - if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) - return true; - - return N.getOpcode() == ISD::BITCAST && - N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; -} - -/// \brief Helper structure to keep track of ISD::SET_CC operands. -struct GenericSetCCInfo { - const SDValue *Opnd0; - const SDValue *Opnd1; - ISD::CondCode CC; -}; - -/// \brief Helper structure to keep track of a SET_CC lowered into ARM64 code. -struct ARM64SetCCInfo { - const SDValue *Cmp; - ARM64CC::CondCode CC; -}; - -/// \brief Helper structure to keep track of SetCC information. -union SetCCInfo { - GenericSetCCInfo Generic; - ARM64SetCCInfo ARM64; -}; - -/// \brief Helper structure to be able to read SetCC information. -/// If set to true, IsARM64 field, Info is a ARM64SetCCInfo, otherwise Info is -/// a GenericSetCCInfo. -struct SetCCInfoAndKind { - SetCCInfo Info; - bool IsARM64; -}; - -/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or -/// an -/// ARM64 lowered one. -/// \p SetCCInfo is filled accordingly. -/// \post SetCCInfo is meanginfull only when this function returns true. -/// \return True when Op is a kind of SET_CC operation. -static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { - // If this is a setcc, this is straight forward. - if (Op.getOpcode() == ISD::SETCC) { - SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); - SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); - SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); - SetCCInfo.IsARM64 = false; - return true; - } - // Otherwise, check if this is a matching csel instruction. - // In other words: - // - csel 1, 0, cc - // - csel 0, 1, !cc - if (Op.getOpcode() != ARM64ISD::CSEL) - return false; - // Set the information about the operands. - // TODO: we want the operands of the Cmp not the csel - SetCCInfo.Info.ARM64.Cmp = &Op.getOperand(3); - SetCCInfo.IsARM64 = true; - SetCCInfo.Info.ARM64.CC = static_cast( - cast(Op.getOperand(2))->getZExtValue()); - - // Check that the operands matches the constraints: - // (1) Both operands must be constants. - // (2) One must be 1 and the other must be 0. - ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); - ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); - - // Check (1). - if (!TValue || !FValue) - return false; - - // Check (2). - if (!TValue->isOne()) { - // Update the comparison when we are interested in !cc. - std::swap(TValue, FValue); - SetCCInfo.Info.ARM64.CC = - ARM64CC::getInvertedCondCode(SetCCInfo.Info.ARM64.CC); - } - return TValue->isOne() && FValue->isNullValue(); -} - -// Returns true if Op is setcc or zext of setcc. 
-static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { - if (isSetCC(Op, Info)) - return true; - return ((Op.getOpcode() == ISD::ZERO_EXTEND) && - isSetCC(Op->getOperand(0), Info)); -} - -// The folding we want to perform is: -// (add x, [zext] (setcc cc ...) ) -// --> -// (csel x, (add x, 1), !cc ...) -// -// The latter will get matched to a CSINC instruction. -static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { - assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); - SDValue LHS = Op->getOperand(0); - SDValue RHS = Op->getOperand(1); - SetCCInfoAndKind InfoAndKind; - - // If neither operand is a SET_CC, give up. - if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { - std::swap(LHS, RHS); - if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) - return SDValue(); - } - - // FIXME: This could be generatized to work for FP comparisons. - EVT CmpVT = InfoAndKind.IsARM64 - ? InfoAndKind.Info.ARM64.Cmp->getOperand(0).getValueType() - : InfoAndKind.Info.Generic.Opnd0->getValueType(); - if (CmpVT != MVT::i32 && CmpVT != MVT::i64) - return SDValue(); - - SDValue CCVal; - SDValue Cmp; - SDLoc dl(Op); - if (InfoAndKind.IsARM64) { - CCVal = DAG.getConstant( - ARM64CC::getInvertedCondCode(InfoAndKind.Info.ARM64.CC), MVT::i32); - Cmp = *InfoAndKind.Info.ARM64.Cmp; - } else - Cmp = getARM64Cmp(*InfoAndKind.Info.Generic.Opnd0, - *InfoAndKind.Info.Generic.Opnd1, - ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), - CCVal, DAG, dl); - - EVT VT = Op->getValueType(0); - LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); -} - -// The basic add/sub long vector instructions have variants with "2" on the end -// which act on the high-half of their inputs. They are normally matched by -// patterns like: -// -// (add (zeroext (extract_high LHS)), -// (zeroext (extract_high RHS))) -// -> uaddl2 vD, vN, vM -// -// However, if one of the extracts is something like a duplicate, this -// instruction can still be used profitably. This function puts the DAG into a -// more appropriate form for those patterns to trigger. -static SDValue performAddSubLongCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - MVT VT = N->getSimpleValueType(0); - if (!VT.is128BitVector()) { - if (N->getOpcode() == ISD::ADD) - return performSetccAddFolding(N, DAG); - return SDValue(); - } - - // Make sure both branches are extended in the same way. - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() != ISD::ZERO_EXTEND && - LHS.getOpcode() != ISD::SIGN_EXTEND) || - LHS.getOpcode() != RHS.getOpcode()) - return SDValue(); - - unsigned ExtType = LHS.getOpcode(); - - // It's not worth doing if at least one of the inputs isn't already an - // extract, but we don't know which it'll be so we have to try both. 
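// [Editor's sketch] performSetccAddFolding above turns "x + (boolean from a
// compare)" into a conditional select of x and x+1, which the patterns then
// match to a single CSINC. The scalar shape of the rewrite:
#include <cstdint>
static uint64_t addSetcc(uint64_t X, bool Cond) {
  // before: X + (Cond ? 1 : 0)        after: Cond ? X + 1 : X   (one CSINC)
  return Cond ? X + 1 : X;
}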
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { - RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); - if (!RHS.getNode()) - return SDValue(); - - RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); - } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { - LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); - if (!LHS.getNode()) - return SDValue(); - - LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); - } - - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); -} - -// Massage DAGs which we can use the high-half "long" operations on into -// something isel will recognize better. E.g. -// -// (arm64_neon_umull (extract_high vec) (dupv64 scalar)) --> -// (arm64_neon_umull (extract_high (v2i64 vec))) -// (extract_high (v2i64 (dup128 scalar))))) -// -static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - assert(LHS.getValueType().is64BitVector() && - RHS.getValueType().is64BitVector() && - "unexpected shape for long operation"); - - // Either node could be a DUP, but it's not worth doing both of them (you'd - // just as well use the non-high version) so look for a corresponding extract - // operation on the other "wing". - if (isEssentiallyExtractSubvector(LHS)) { - RHS = tryExtendDUPToExtractHigh(RHS, DAG); - if (!RHS.getNode()) - return SDValue(); - } else if (isEssentiallyExtractSubvector(RHS)) { - LHS = tryExtendDUPToExtractHigh(LHS, DAG); - if (!LHS.getNode()) - return SDValue(); - } - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), - N->getOperand(0), LHS, RHS); -} - -static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { - MVT ElemTy = N->getSimpleValueType(0).getScalarType(); - unsigned ElemBits = ElemTy.getSizeInBits(); - - int64_t ShiftAmount; - if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs, ElemBits) || - SplatBitSize != ElemBits) - return SDValue(); - - ShiftAmount = SplatValue.getSExtValue(); - } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { - ShiftAmount = CVN->getSExtValue(); - } else - return SDValue(); - - unsigned Opcode; - bool IsRightShift; - switch (IID) { - default: - llvm_unreachable("Unknown shift intrinsic"); - case Intrinsic::arm64_neon_sqshl: - Opcode = ARM64ISD::SQSHL_I; - IsRightShift = false; - break; - case Intrinsic::arm64_neon_uqshl: - Opcode = ARM64ISD::UQSHL_I; - IsRightShift = false; - break; - case Intrinsic::arm64_neon_srshl: - Opcode = ARM64ISD::SRSHR_I; - IsRightShift = true; - break; - case Intrinsic::arm64_neon_urshl: - Opcode = ARM64ISD::URSHR_I; - IsRightShift = true; - break; - case Intrinsic::arm64_neon_sqshlu: - Opcode = ARM64ISD::SQSHLU_I; - IsRightShift = false; - break; - } - - if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(-ShiftAmount, MVT::i32)); - else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(ShiftAmount, MVT::i32)); - - return SDValue(); -} - -// The CRC32[BH] instructions ignore the high bits of their data operand. 
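// [Editor's sketch] tryCombineShiftImm above moves the saturating/rounding
// shift intrinsics to their immediate forms when the shift operand is a
// constant (or constant splat): non-negative amounts up to the element width
// select the left-shift-immediate nodes, while negative amounts encode right
// shifts and are emitted with the negated value as the immediate.
#include <cstdint>
static bool hasImmediateShiftForm(int64_t Amt, unsigned ElemBits, bool IsRightShift) {
  if (IsRightShift)
    return Amt <= -1 && Amt >= -(int64_t)ElemBits;
  return Amt >= 0 && Amt <= (int64_t)ElemBits;
}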
Since -// the intrinsics must be legal and take an i32, this means there's almost -// certainly going to be a zext in the DAG which we can eliminate. -static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { - SDValue AndN = N->getOperand(2); - if (AndN.getOpcode() != ISD::AND) - return SDValue(); - - ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); - if (!CMask || CMask->getZExtValue() != Mask) - return SDValue(); - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, - N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); -} - -static SDValue performIntrinsicCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - unsigned IID = getIntrinsicID(N); - switch (IID) { - default: - break; - case Intrinsic::arm64_neon_vcvtfxs2fp: - case Intrinsic::arm64_neon_vcvtfxu2fp: - return tryCombineFixedPointConvert(N, DCI, DAG); - break; - case Intrinsic::arm64_neon_fmax: - return DAG.getNode(ARM64ISD::FMAX, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_fmin: - return DAG.getNode(ARM64ISD::FMIN, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - case Intrinsic::arm64_neon_pmull: - case Intrinsic::arm64_neon_sqdmull: - return tryCombineLongOpWithDup(IID, N, DCI, DAG); - case Intrinsic::arm64_neon_sqshl: - case Intrinsic::arm64_neon_uqshl: - case Intrinsic::arm64_neon_sqshlu: - case Intrinsic::arm64_neon_srshl: - case Intrinsic::arm64_neon_urshl: - return tryCombineShiftImm(IID, N, DAG); - case Intrinsic::arm64_crc32b: - case Intrinsic::arm64_crc32cb: - return tryCombineCRC32(0xff, N, DAG); - case Intrinsic::arm64_crc32h: - case Intrinsic::arm64_crc32ch: - return tryCombineCRC32(0xffff, N, DAG); - } - return SDValue(); -} - -static SDValue performExtendCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then - // we can convert that DUP into another extract_high (of a bigger DUP), which - // helps the backend to decide that an sabdl2 would be useful, saving a real - // extract_high operation. - if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && - N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - SDNode *ABDNode = N->getOperand(0).getNode(); - unsigned IID = getIntrinsicID(ABDNode); - if (IID == Intrinsic::arm64_neon_sabd || - IID == Intrinsic::arm64_neon_uabd) { - SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); - if (!NewABD.getNode()) - return SDValue(); - - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), - NewABD); - } - } - - // This is effectively a custom type legalization for ARM64. - // - // Type legalization will split an extend of a small, legal, type to a larger - // illegal type by first splitting the destination type, often creating - // illegal source types, which then get legalized in isel-confusing ways, - // leading to really terrible codegen. E.g., - // %result = v8i32 sext v8i8 %value - // becomes - // %losrc = extract_subreg %value, ... - // %hisrc = extract_subreg %value, ... - // %lo = v4i32 sext v4i8 %losrc - // %hi = v4i32 sext v4i8 %hisrc - // Things go rapidly downhill from there. 
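// [Editor's sketch] tryCombineCRC32 above drops an AND (typically the zext
// left over from the i32-only intrinsic signature) because the byte and
// halfword CRC instructions only read the low bits of the data operand anyway:
#include <cstdint>
static uint32_t crc32bDataBits(uint32_t Data) { return Data & 0xffu; }    // what CRC32B reads
static uint32_t crc32hDataBits(uint32_t Data) { return Data & 0xffffu; }  // what CRC32H reads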
- // - // For ARM64, the [sz]ext vector instructions can only go up one element - // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 - // take two instructions. - // - // This implies that the most efficient way to do the extend from v8i8 - // to two v4i32 values is to first extend the v8i8 to v8i16, then do - // the normal splitting to happen for the v8i16->v8i32. - - // This is pre-legalization to catch some cases where the default - // type legalization will create ill-tempered code. - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - // We're only interested in cleaning things up for non-legal vector types - // here. If both the source and destination are legal, things will just - // work naturally without any fiddling. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) - return SDValue(); - // If the vector type isn't a simple VT, it's beyond the scope of what - // we're worried about here. Let legalization do its thing and hope for - // the best. - if (!ResVT.isSimple()) - return SDValue(); - - SDValue Src = N->getOperand(0); - MVT SrcVT = Src->getValueType(0).getSimpleVT(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) - return SDValue(); - - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); - SDLoc DL(N); - Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); - - // Now split the rest of the operation into two halves, each with a 64 - // bit source. - EVT LoVT, HiVT; - SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); - - EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); - - // Now combine the parts back together so we still have a single result - // like the combiner expects. - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} - -/// Replace a splat of a scalar to a vector store by scalar stores of the scalar -/// value. The load store optimizer pass will merge them to store pair stores. -/// This has better performance than a splat of the scalar followed by a split -/// vector store. Even if the stores are not merged it is four stores vs a dup, -/// followed by an ext.b and two stores. -static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { - SDValue StVal = St->getValue(); - EVT VT = StVal.getValueType(); - - // Don't replace floating point stores, they possibly won't be transformed to - // stp because of the store pair suppress pass. - if (VT.isFloatingPoint()) - return SDValue(); - - // Check for insert vector elements. - if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - // We can express a splat as store pair(s) for 2 or 4 elements. 
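// [Editorial sketch; not part of the original patch.] A standalone
// illustration of the arithmetic the scalar-store loop below relies on; the
// helper names are hypothetical. Each splatted element gets its own scalar
// store at a fixed 4- or 8-byte stride, and the per-store alignment is
// clamped to that stride so the later load/store optimizer can merge
// neighbouring stores into STP pairs.
static unsigned sketchSplatStoreStride(unsigned NumVecElts) {
  return NumVecElts == 4 ? 4 : 8; // four 4-byte elements vs. two 8-byte elements
}
static unsigned sketchSplatStoreAlign(unsigned OrigAlign, unsigned Stride) {
  return OrigAlign < Stride ? OrigAlign : Stride; // never promise more alignment than we have
}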
- unsigned NumVecElts = VT.getVectorNumElements(); - if (NumVecElts != 4 && NumVecElts != 2) - return SDValue(); - SDValue SplatVal = StVal.getOperand(1); - unsigned RemainInsertElts = NumVecElts - 1; - - // Check that this is a splat. - while (--RemainInsertElts) { - SDValue NextInsertElt = StVal.getOperand(0); - if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - if (NextInsertElt.getOperand(1) != SplatVal) - return SDValue(); - StVal = NextInsertElt; - } - unsigned OrigAlignment = St->getAlignment(); - unsigned EltOffset = NumVecElts == 4 ? 4 : 8; - unsigned Alignment = std::min(OrigAlignment, EltOffset); - - // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. - // Most of the time the three stores should be replaced by store pair - // instructions (stp). - SDLoc DL(St); - SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = - DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); - - unsigned Offset = EltOffset; - while (--NumVecElts) { - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, MVT::i64)); - NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); - Offset += EltOffset; - } - return NewST1; -} - -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const ARM64Subtarget *Subtarget) { - if (!DCI.isBeforeLegalize()) - return SDValue(); - - StoreSDNode *S = cast(N); - if (S->isVolatile()) - return SDValue(); - - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundries. We want to split such stores. - if (!Subtarget->isCyclone()) - return SDValue(); - - // Don't split at Oz. - MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); - if (IsMinSize) - return SDValue(); - - SDValue StVal = S->getValue(); - EVT VT = StVal.getValueType(); - - // Don't split v2i64 vectors. Memcpy lowering produces those and splitting - // those up regresses performance on micro-benchmarks and olden/bh. - if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) - return SDValue(); - - // Split unaligned 16B stores. They are terrible for performance. - // Don't split stores with alignment of 1 or 2. Code that uses clang vector - // extensions can use this to mark that it does not want splitting to happen - // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of - // eliminating alignment hazards is only 1 in 8 for alignment of 2. - if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || - S->getAlignment() <= 2) - return SDValue(); - - // If we get a splat of a scalar convert this vector store to a store of - // scalars. They will be merged into store pairs thereby removing two - // instructions. - SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) - return ReplacedSplat; - - SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; - // Split VT into two. 
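// [Editorial sketch; not part of the original patch.] A hypothetical
// standalone predicate restating the final size/alignment test above: only
// 128-bit vector stores with an alignment of 3..15 bytes are split into two
// halves below, i.e. stores that may straddle a cache line but were not
// deliberately marked align 1 or 2 by the frontend to suppress splitting.
static bool sketchShouldSplit16ByteStore(unsigned StoreSizeInBits,
                                         unsigned AlignInBytes) {
  return StoreSizeInBits == 128 && AlignInBytes > 2 && AlignInBytes < 16;
}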
- EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); - SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); - SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); - SDValue BasePtr = S->getBasePtr(); - SDValue NewST1 = - DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(8, MVT::i64)); - return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); -} - -/// Target-specific DAG combine function for post-increment LD1 (lane) and -/// post-increment LD1R. -static SDValue performPostLD1Combine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - bool IsLaneOp) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - unsigned LoadIdx = IsLaneOp ? 1 : 0; - SDNode *LD = N->getOperand(LoadIdx).getNode(); - // If it is not LOAD, can not do such combine. - if (LD->getOpcode() != ISD::LOAD) - return SDValue(); - - LoadSDNode *LoadSDN = cast(LD); - EVT MemVT = LoadSDN->getMemoryVT(); - // Check if memory operand is the same type as the vector element. - if (MemVT != VT.getVectorElementType()) - return SDValue(); - - // Check if there are other uses. If so, do not combine as it will introduce - // an extra load. - for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; - ++UI) { - if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. - continue; - if (*UI != N) - return SDValue(); - } - - SDValue Addr = LD->getOperand(1); - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = - Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD - || UI.getUse().getResNo() != Addr.getResNo()) - continue; - - // Check that the add is independent of the load. Otherwise, folding it - // would create a cycle. - if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) - continue; - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - unsigned NumBytes = VT.getScalarSizeInBits() / 8; - if (IncVal != NumBytes) - continue; - Inc = DAG.getRegister(ARM64::XZR, MVT::i64); - } - - SmallVector Ops; - Ops.push_back(LD->getOperand(0)); // Chain - if (IsLaneOp) { - Ops.push_back(N->getOperand(0)); // The vector to be inserted - Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector - } - Ops.push_back(Addr); - Ops.push_back(Inc); - - EVT Tys[3] = { VT, MVT::i64, MVT::Other }; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, 3)); - unsigned NewOp = IsLaneOp ? ARM64ISD::LD1LANEpost : ARM64ISD::LD1DUPpost; - SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, - MemVT, - LoadSDN->getMemOperand()); - - // Update the uses. 
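// [Editorial sketch; not part of the original patch.] In the post-increment
// fold above, a constant increment is only accepted when it equals the size
// in bytes of the loaded element; a non-constant increment is instead used
// directly as the write-back register. Hypothetical standalone check:
static bool sketchPostIncMatchesElement(unsigned EltSizeInBits,
                                        unsigned IncValInBytes) {
  return IncValInBytes == EltSizeInBits / 8;
}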
- std::vector NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain - DCI.CombineTo(LD, NewResults); - DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result - DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register - - break; - } - return SDValue(); -} - -/// Target-specific DAG combine function for NEON load/store intrinsics -/// to merge base address updates. -static SDValue performNEONPostLDSTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - unsigned AddrOpIdx = N->getNumOperands() - 1; - SDValue Addr = N->getOperand(AddrOpIdx); - - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD || - UI.getUse().getResNo() != Addr.getResNo()) - continue; - - // Check that the add is independent of the load/store. Otherwise, folding - // it would create a cycle. - if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) - continue; - - // Find the new opcode for the updating load/store. - bool IsStore = false; - bool IsLaneOp = false; - bool IsDupOp = false; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm64_neon_ld2: NewOpc = ARM64ISD::LD2post; - NumVecs = 2; break; - case Intrinsic::arm64_neon_ld3: NewOpc = ARM64ISD::LD3post; - NumVecs = 3; break; - case Intrinsic::arm64_neon_ld4: NewOpc = ARM64ISD::LD4post; - NumVecs = 4; break; - case Intrinsic::arm64_neon_st2: NewOpc = ARM64ISD::ST2post; - NumVecs = 2; IsStore = true; break; - case Intrinsic::arm64_neon_st3: NewOpc = ARM64ISD::ST3post; - NumVecs = 3; IsStore = true; break; - case Intrinsic::arm64_neon_st4: NewOpc = ARM64ISD::ST4post; - NumVecs = 4; IsStore = true; break; - case Intrinsic::arm64_neon_ld1x2: NewOpc = ARM64ISD::LD1x2post; - NumVecs = 2; break; - case Intrinsic::arm64_neon_ld1x3: NewOpc = ARM64ISD::LD1x3post; - NumVecs = 3; break; - case Intrinsic::arm64_neon_ld1x4: NewOpc = ARM64ISD::LD1x4post; - NumVecs = 4; break; - case Intrinsic::arm64_neon_st1x2: NewOpc = ARM64ISD::ST1x2post; - NumVecs = 2; IsStore = true; break; - case Intrinsic::arm64_neon_st1x3: NewOpc = ARM64ISD::ST1x3post; - NumVecs = 3; IsStore = true; break; - case Intrinsic::arm64_neon_st1x4: NewOpc = ARM64ISD::ST1x4post; - NumVecs = 4; IsStore = true; break; - case Intrinsic::arm64_neon_ld2r: NewOpc = ARM64ISD::LD2DUPpost; - NumVecs = 2; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld3r: NewOpc = ARM64ISD::LD3DUPpost; - NumVecs = 3; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld4r: NewOpc = ARM64ISD::LD4DUPpost; - NumVecs = 4; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld2lane: NewOpc = ARM64ISD::LD2LANEpost; - NumVecs = 2; IsLaneOp = true; break; - case Intrinsic::arm64_neon_ld3lane: NewOpc = ARM64ISD::LD3LANEpost; - NumVecs = 3; IsLaneOp = true; break; - case Intrinsic::arm64_neon_ld4lane: NewOpc = ARM64ISD::LD4LANEpost; - NumVecs = 4; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st2lane: NewOpc = ARM64ISD::ST2LANEpost; - NumVecs = 2; IsStore = true; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st3lane: NewOpc = ARM64ISD::ST3LANEpost; - NumVecs = 3; 
IsStore = true; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st4lane: NewOpc = ARM64ISD::ST4LANEpost; - NumVecs = 4; IsStore = true; IsLaneOp = true; break; - } - - EVT VecTy; - if (IsStore) - VecTy = N->getOperand(2).getValueType(); - else - VecTy = N->getValueType(0); - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (IsLaneOp || IsDupOp) - NumBytes /= VecTy.getVectorNumElements(); - if (IncVal != NumBytes) - continue; - Inc = DAG.getRegister(ARM64::XZR, MVT::i64); - } - SmallVector Ops; - Ops.push_back(N->getOperand(0)); // Incoming chain - // Load lane and store have vector list as input. - if (IsLaneOp || IsStore) - for (unsigned i = 2; i < AddrOpIdx; ++i) - Ops.push_back(N->getOperand(i)); - Ops.push_back(Addr); // Base register - Ops.push_back(Inc); - - // Return Types. - EVT Tys[6]; - unsigned NumResultVecs = (IsStore ? 0 : NumVecs); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; - Tys[n++] = MVT::i64; // Type of write back register - Tys[n] = MVT::Other; // Type of the chain - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - - MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, - MemInt->getMemoryVT(), - MemInt->getMemOperand()); - - // Update the uses. - std::vector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { - NewResults.push_back(SDValue(UpdN.getNode(), i)); - } - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); - - break; - } - return SDValue(); -} - -// Optimize compare with zero and branch. -static SDValue performBRCONDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - SDValue Chain = N->getOperand(0); - SDValue Dest = N->getOperand(1); - SDValue CCVal = N->getOperand(2); - SDValue Cmp = N->getOperand(3); - - assert(isa(CCVal) && "Expected a ConstantSDNode here!"); - unsigned CC = cast(CCVal)->getZExtValue(); - if (CC != ARM64CC::EQ && CC != ARM64CC::NE) - return SDValue(); - - unsigned CmpOpc = Cmp.getOpcode(); - if (CmpOpc != ARM64ISD::ADDS && CmpOpc != ARM64ISD::SUBS) - return SDValue(); - - // Only attempt folding if there is only one use of the flag and no use of the - // value. - if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) - return SDValue(); - - SDValue LHS = Cmp.getOperand(0); - SDValue RHS = Cmp.getOperand(1); - - assert(LHS.getValueType() == RHS.getValueType() && - "Expected the value type to be the same for both operands!"); - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return SDValue(); - - if (isa(LHS) && cast(LHS)->isNullValue()) - std::swap(LHS, RHS); - - if (!isa(RHS) || !cast(RHS)->isNullValue()) - return SDValue(); - - if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || - LHS.getOpcode() == ISD::SRL) - return SDValue(); - - // Fold the compare into the branch instruction. - SDValue BR; - if (CC == ARM64CC::EQ) - BR = DAG.getNode(ARM64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - else - BR = DAG.getNode(ARM64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - - // Do not add new nodes to DAG combiner worklist. 
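// [Editorial sketch; not part of the original patch.] A hypothetical
// restatement of the main guards above: among other checks, the compare is
// only folded into CBZ/CBNZ when the condition is EQ or NE, only the flag
// result of the ADDS/SUBS is used (exactly once), the compared value is 32
// or 64 bits wide, and one operand is the constant zero.
static bool sketchCanFoldCmpIntoCBZ(bool IsEQorNE, bool OnlyFlagsUsedOnce,
                                    unsigned BitWidth,
                                    bool ComparesAgainstZero) {
  return IsEQorNE && OnlyFlagsUsedOnce && ComparesAgainstZero &&
         (BitWidth == 32 || BitWidth == 64);
}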
- DCI.CombineTo(N, BR, false); - - return SDValue(); -} - -// vselect (v1i1 setcc) -> -// vselect (v1iXX setcc) (XX is the size of the compared operand type) -// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as -// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine -// such VSELECT. -static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT CCVT = N0.getValueType(); - - if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || - CCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - EVT ResVT = N->getValueType(0); - EVT CmpVT = N0.getOperand(0).getValueType(); - // Only combine when the result type is of the same size as the compared - // operands. - if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) - return SDValue(); - - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - N0.getOperand(0), N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, - IfTrue, IfFalse); -} - -/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with -/// the compare-mask instructions rather than going via NZCV, even if LHS and -/// RHS are really scalar. This replaces any scalar setcc in the above pattern -/// with a vector one followed by a DUP shuffle on the result. -static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT ResVT = N->getValueType(0); - - if (!N->getOperand(1).getValueType().isVector()) - return SDValue(); - - if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) - return SDValue(); - - SDLoc DL(N0); - - EVT SrcVT = N0.getOperand(0).getValueType(); - SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, - ResVT.getSizeInBits() / SrcVT.getSizeInBits()); - EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); - - // First perform a vector comparison, where lane 0 is the one we're interested - // in. - SDValue LHS = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); - SDValue RHS = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); - SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); - - // Now duplicate the comparison mask we want across all other lanes. 
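// [Editorial sketch; not part of the original patch.] The DUP mask built just
// below is simply an all-zero shuffle mask: every output lane reads input
// lane 0, which broadcasts the scalar compare result across the vector.
static void sketchBuildDupLane0Mask(int *Mask, unsigned NumElts) {
  for (unsigned I = 0; I != NumElts; ++I)
    Mask[I] = 0; // lane 0 everywhere
}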
- SmallVector DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); - Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), - Mask); - - return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); -} - -SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - switch (N->getOpcode()) { - default: - break; - case ISD::ADD: - case ISD::SUB: - return performAddSubLongCombine(N, DCI, DAG); - case ISD::XOR: - return performXorCombine(N, DAG, DCI, Subtarget); - case ISD::MUL: - return performMulCombine(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG); - case ISD::OR: - return performORCombine(N, DCI, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return performIntrinsicCombine(N, DCI, Subtarget); - case ISD::ANY_EXTEND: - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: - return performExtendCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); - case ISD::CONCAT_VECTORS: - return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DAG); - case ISD::VSELECT: - return performVSelectCombine(N, DCI.DAG); - case ISD::STORE: - return performSTORECombine(N, DCI, DAG, Subtarget); - case ARM64ISD::BRCOND: - return performBRCONDCombine(N, DCI, DAG); - case ARM64ISD::DUP: - return performPostLD1Combine(N, DCI, false); - case ISD::INSERT_VECTOR_ELT: - return performPostLD1Combine(N, DCI, true); - case ISD::INTRINSIC_VOID: - case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::arm64_neon_ld2: - case Intrinsic::arm64_neon_ld3: - case Intrinsic::arm64_neon_ld4: - case Intrinsic::arm64_neon_ld1x2: - case Intrinsic::arm64_neon_ld1x3: - case Intrinsic::arm64_neon_ld1x4: - case Intrinsic::arm64_neon_ld2lane: - case Intrinsic::arm64_neon_ld3lane: - case Intrinsic::arm64_neon_ld4lane: - case Intrinsic::arm64_neon_ld2r: - case Intrinsic::arm64_neon_ld3r: - case Intrinsic::arm64_neon_ld4r: - case Intrinsic::arm64_neon_st2: - case Intrinsic::arm64_neon_st3: - case Intrinsic::arm64_neon_st4: - case Intrinsic::arm64_neon_st1x2: - case Intrinsic::arm64_neon_st1x3: - case Intrinsic::arm64_neon_st1x4: - case Intrinsic::arm64_neon_st2lane: - case Intrinsic::arm64_neon_st3lane: - case Intrinsic::arm64_neon_st4lane: - return performNEONPostLDSTCombine(N, DCI, DAG); - default: - break; - } - } - return SDValue(); -} - -// Check if the return value is used as only a return value, as otherwise -// we can't perform a tail-call. In particular, we need to check for -// target ISD nodes that are returns and any other "odd" constructs -// that the generic analysis code won't necessarily catch. -bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { - if (N->getNumValues() != 1) - return false; - if (!N->hasNUsesOfValue(1, 0)) - return false; - - SDValue TCChain = Chain; - SDNode *Copy = *N->use_begin(); - if (Copy->getOpcode() == ISD::CopyToReg) { - // If the copy has a glue operand, we conservatively assume it isn't safe to - // perform a tail call. 
- if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == - MVT::Glue) - return false; - TCChain = Copy->getOperand(0); - } else if (Copy->getOpcode() != ISD::FP_EXTEND) - return false; - - bool HasRet = false; - for (SDNode *Node : Copy->uses()) { - if (Node->getOpcode() != ARM64ISD::RET_FLAG) - return false; - HasRet = true; - } - - if (!HasRet) - return false; - - Chain = TCChain; - return true; -} - -// Return whether the an instruction can potentially be optimized to a tail -// call. This will cause the optimizers to attempt to move, or duplicate, -// return instructions to help enable tail call optimizations for this -// instruction. -bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; -} - -bool ARM64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - bool &IsInc, - SelectionDAG &DAG) const { - if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) - return false; - - Base = Op->getOperand(0); - // All of the indexed addressing mode instructions take a signed - // 9 bit immediate offset. - if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - if (RHSC >= 256 || RHSC <= -256) - return false; - IsInc = (Op->getOpcode() == ISD::ADD); - Offset = Op->getOperand(1); - return true; - } - return false; -} - -bool ARM64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) - return false; - AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; - return true; -} - -bool ARM64TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, - SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) - return false; - // Post-indexing updates the base, so it's not a valid transform - // if that's not the same as the load's pointer. - if (Ptr != Base) - return false; - AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; - return true; -} - -void ARM64TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - llvm_unreachable("Don't know how to custom expand this"); - case ISD::FP_TO_UINT: - case ISD::FP_TO_SINT: - assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); - // Let normal code take care of it by not adding anything to Results. 
- return; - } -} - -bool ARM64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 128-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 128; - - // For the real atomic operations, we have ldxr/stxr up to 128 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 128; -} - -Value *ARM64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = cast(Addr->getType())->getElementType(); - bool IsAcquire = - Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd - // intrinsic must return {i64, i64} and we have to recombine them into a - // single i128 here. - if (ValTy->getPrimitiveSizeInBits() == 128) { - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm64_ldaxp : Intrinsic::arm64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); - - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); - - Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); - Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); - } - - Type *Tys[] = { Addr->getType() }; - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm64_ldaxr : Intrinsic::arm64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateTruncOrBitCast( - Builder.CreateCall(Ldxr, Addr), - cast(Addr->getType())->getElementType()); -} - -Value *ARM64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, - Value *Val, Value *Addr, - AtomicOrdering Ord) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = - Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since the intrinsics must have legal type, the i128 intrinsics take two - // parameters: "i64, i64". We must marshal Val into the appropriate form - // before the call. - if (Val->getType()->getPrimitiveSizeInBits() == 128) { - Intrinsic::ID Int = - IsRelease ? Intrinsic::arm64_stlxp : Intrinsic::arm64_stxp; - Function *Stxr = Intrinsic::getDeclaration(M, Int); - Type *Int64Ty = Type::getInt64Ty(M->getContext()); - - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Stxr, Lo, Hi, Addr); - } - - Intrinsic::ID Int = - IsRelease ? 
Intrinsic::arm64_stlxr : Intrinsic::arm64_stxr; - Type *Tys[] = { Addr->getType() }; - Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateCall2( - Stxr, Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr); -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h deleted file mode 100644 index b2402c9791c7..000000000000 --- a/lib/Target/ARM64/ARM64ISelLowering.h +++ /dev/null @@ -1,464 +0,0 @@ -//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interfaces that ARM64 uses to lower LLVM code into a -// selection DAG. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H -#define LLVM_TARGET_ARM64_ISELLOWERING_H - -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -namespace ARM64ISD { - -enum { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. - CALL, // Function call. - - // Almost the same as a normal call node, except that a TLSDesc relocation is - // needed so the linker can relax it correctly if possible. - TLSDESC_CALL, - ADRP, // Page address of a TargetGlobalAddress operand. - ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. - LOADgot, // Load from automatically generated descriptor (e.g. Global - // Offset Table, TLS record). - RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. - BRCOND, // Conditional branch instruction; "b.cond". - CSEL, - FCSEL, // Conditional move instruction. - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on - // ELF. - THREAD_POINTER, - ADC, - SBC, // adc, sbc instructions - - // Arithmetic instructions which write flags. - ADDS, - SUBS, - ADCS, - SBCS, - ANDS, - - // Floating point comparison - FCMP, - - // Floating point max and min instructions. - FMAX, - FMIN, - - // Scalar extract - EXTR, - - // Scalar-to-vector duplication - DUP, - DUPLANE8, - DUPLANE16, - DUPLANE32, - DUPLANE64, - - // Vector immedate moves - MOVI, - MOVIshift, - MOVIedit, - MOVImsl, - FMOV, - MVNIshift, - MVNImsl, - - // Vector immediate ops - BICi, - ORRi, - - // Vector bit select: similar to ISD::VSELECT but not all bits within an - // element must be identical. 
- BSL, - - // Vector arithmetic negation - NEG, - - // Vector shuffles - ZIP1, - ZIP2, - UZP1, - UZP2, - TRN1, - TRN2, - REV16, - REV32, - REV64, - EXT, - - // Vector shift by scalar - VSHL, - VLSHR, - VASHR, - - // Vector shift by scalar (again) - SQSHL_I, - UQSHL_I, - SQSHLU_I, - SRSHR_I, - URSHR_I, - - // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, - FCMEQ, - FCMGE, - FCMGT, - - // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - - // Vector bitwise negation - NOT, - - // Vector bitwise selection - BIT, - - // Compare-and-branch - CBZ, - CBNZ, - TBZ, - TBNZ, - - // Tail calls - TC_RETURN, - - // Custom prefetch handling - PREFETCH, - - // {s|u}int to FP within a FP register. - SITOF, - UITOF, - - // NEON Load/Store with post-increment base updates - LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, - LD3post, - LD4post, - ST2post, - ST3post, - ST4post, - LD1x2post, - LD1x3post, - LD1x4post, - ST1x2post, - ST1x3post, - ST1x4post, - LD1DUPpost, - LD2DUPpost, - LD3DUPpost, - LD4DUPpost, - LD1LANEpost, - LD2LANEpost, - LD3LANEpost, - LD4LANEpost, - ST2LANEpost, - ST3LANEpost, - ST4LANEpost -}; - -} // end namespace ARM64ISD - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - -public: - explicit ARM64TargetLowering(ARM64TargetMachine &TM); - - /// Selects the correct CCAssignFn for a the given CallingConvention - /// value. - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, - APInt &KnownOne, const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - MVT getScalarShiftAmountTy(EVT LHSTy) const override; - - /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } - - /// LowerOperation - Provide custom lowering hooks for some operations. - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - - const char *getTargetNodeName(unsigned Opcode) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - - /// getMaximalGlobalOffset - Returns the maximal possible offset which can - /// be used for loads / stores from the global. - unsigned getMaximalGlobalOffset() const override; - - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } - - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. 
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const override; - - bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. - bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; - - /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - - SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *BB) const; - - MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const override; - - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const override; - - bool isTruncateFree(Type *Ty1, Type *Ty2) const override; - bool isTruncateFree(EVT VT1, EVT VT2) const override; - - bool isZExtFree(Type *Ty1, Type *Ty2) const override; - bool isZExtFree(EVT VT1, EVT VT2) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const override; - bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; - - bool isLegalAddImmediate(int64_t) const override; - bool isLegalICmpImmediate(int64_t) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; - - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - - /// \brief Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; - - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - - const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - - /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. - bool isDesirableToCommuteWithShift(const SDNode *N) const override; - - /// \brief Returns true if it is beneficial to convert a load of a constant - /// to just the constant itself. - bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) const override; - Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, - Value *Addr, AtomicOrdering Ord) const override; - - bool shouldExpandAtomicInIR(Instruction *Inst) const override; - -private: - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. 
- const ARM64Subtarget *Subtarget; - - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); - - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - SDValue LowerCall(CallLoweringInfo & /*CLI*/, - SmallVectorImpl &InVals) const override; - - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const; - - bool isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const; - - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo *MFI, int ClobberedFI) const; - - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; - - bool IsTailCallConvention(CallingConv::ID CallCC) const; - - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const; - - bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context) const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, SDLoc DL, - SelectionDAG &DAG) const override; - - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, 
SelectionDAG &DAG) const; - SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; - - ConstraintType - getConstraintType(const std::string &Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; - - /// Examine constraint string and operand type and determine a weight value. - /// The operand object must already have been set up with the operand type. - ConstraintWeight - getSingleConstraintMatchWeight(AsmOperandInfo &info, - const char *constraint) const override; - - std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const override; - void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const override; - - bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; - bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, bool &IsInc, - SelectionDAG &DAG) const; - bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const override; - bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, - SDValue &Offset, ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const override; - - void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const override; -}; - -namespace ARM64 { -FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo); -} // end namespace ARM64 - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64_ISELLOWERING_H diff --git a/lib/Target/ARM64/ARM64InstrAtomics.td b/lib/Target/ARM64/ARM64InstrAtomics.td deleted file mode 100644 index 989e7a2e74ad..000000000000 --- a/lib/Target/ARM64/ARM64InstrAtomics.td +++ /dev/null @@ -1,308 +0,0 @@ -//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// ARM64 Atomic operand code-gen constructs. 
-// -//===----------------------------------------------------------------------===// - -//===---------------------------------- -// Atomic fences -//===---------------------------------- -def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; -def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; - -//===---------------------------------- -// Atomic loads -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A atomic load operation that actually needs acquire semantics. -class acquiring_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected load ordering"); - return Ordering == Acquire || Ordering == SequentiallyConsistent; -}]>; - -// An atomic load operation that does not need either acquire or release -// semantics. -class relaxed_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed8:$addr), - (LDRBBro ro_indexed8:$addr)>; -def : Pat<(relaxed_load am_indexed8:$addr), - (LDRBBui am_indexed8:$addr)>; -def : Pat<(relaxed_load am_unscaled8:$addr), - (LDURBBi am_unscaled8:$addr)>; - -// 16-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed16:$addr), - (LDRHHro ro_indexed16:$addr)>; -def : Pat<(relaxed_load am_indexed16:$addr), - (LDRHHui am_indexed16:$addr)>; -def : Pat<(relaxed_load am_unscaled16:$addr), - (LDURHHi am_unscaled16:$addr)>; - -// 32-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed32:$addr), - (LDRWro ro_indexed32:$addr)>; -def : Pat<(relaxed_load am_indexed32:$addr), - (LDRWui am_indexed32:$addr)>; -def : Pat<(relaxed_load am_unscaled32:$addr), - (LDURWi am_unscaled32:$addr)>; - -// 64-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed64:$addr), - (LDRXro ro_indexed64:$addr)>; -def : Pat<(relaxed_load am_indexed64:$addr), - (LDRXui am_indexed64:$addr)>; -def : Pat<(relaxed_load am_unscaled64:$addr), - (LDURXi am_unscaled64:$addr)>; - -//===---------------------------------- -// Atomic stores -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A store operation that actually needs release semantics. -class releasing_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return Ordering == Release || Ordering == SequentiallyConsistent; -}]>; - -// An atomic store operation that doesn't actually need to be atomic on ARM64. 
-class relaxed_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRB GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed8:$ptr, GPR32:$val), - (STRBBro GPR32:$val, ro_indexed8:$ptr)>; -def : Pat<(relaxed_store am_indexed8:$ptr, GPR32:$val), - (STRBBui GPR32:$val, am_indexed8:$ptr)>; -def : Pat<(relaxed_store am_unscaled8:$ptr, GPR32:$val), - (STURBBi GPR32:$val, am_unscaled8:$ptr)>; - -// 16-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRH GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed16:$ptr, GPR32:$val), - (STRHHro GPR32:$val, ro_indexed16:$ptr)>; -def : Pat<(relaxed_store am_indexed16:$ptr, GPR32:$val), - (STRHHui GPR32:$val, am_indexed16:$ptr)>; -def : Pat<(relaxed_store am_unscaled16:$ptr, GPR32:$val), - (STURHHi GPR32:$val, am_unscaled16:$ptr)>; - -// 32-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRW GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed32:$ptr, GPR32:$val), - (STRWro GPR32:$val, ro_indexed32:$ptr)>; -def : Pat<(relaxed_store am_indexed32:$ptr, GPR32:$val), - (STRWui GPR32:$val, am_indexed32:$ptr)>; -def : Pat<(relaxed_store am_unscaled32:$ptr, GPR32:$val), - (STURWi GPR32:$val, am_unscaled32:$ptr)>; - -// 64-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), - (STLRX GPR64:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed64:$ptr, GPR64:$val), - (STRXro GPR64:$val, ro_indexed64:$ptr)>; -def : Pat<(relaxed_store am_indexed64:$ptr, GPR64:$val), - (STRXui GPR64:$val, am_indexed64:$ptr)>; -def : Pat<(relaxed_store am_unscaled64:$ptr, GPR64:$val), - (STURXi GPR64:$val, am_unscaled64:$ptr)>; - -//===---------------------------------- -// Low-level exclusive operations -//===---------------------------------- - -// Load-exclusives. - -def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldxr_1 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>; -def : Pat<(ldxr_2 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>; -def : Pat<(ldxr_4 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>; -def : Pat<(ldxr_8 am_noindex:$addr), (LDXRX am_noindex:$addr)>; - -def : Pat<(and (ldxr_1 am_noindex:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>; -def : Pat<(and (ldxr_2 am_noindex:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>; -def : Pat<(and (ldxr_4 am_noindex:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>; - -// Load-exclusives. 
- -def ldaxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldaxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldaxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldaxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldaxr_1 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRB am_noindex:$addr), sub_32)>; -def : Pat<(ldaxr_2 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRH am_noindex:$addr), sub_32)>; -def : Pat<(ldaxr_4 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRW am_noindex:$addr), sub_32)>; -def : Pat<(ldaxr_8 am_noindex:$addr), (LDAXRX am_noindex:$addr)>; - -def : Pat<(and (ldaxr_1 am_noindex:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDAXRB am_noindex:$addr), sub_32)>; -def : Pat<(and (ldaxr_2 am_noindex:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDAXRH am_noindex:$addr), sub_32)>; -def : Pat<(and (ldaxr_4 am_noindex:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDAXRW am_noindex:$addr), sub_32)>; - -// Store-exclusives. - -def stxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - - -def : Pat<(stxr_1 GPR64:$val, am_noindex:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_2 GPR64:$val, am_noindex:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_4 GPR64:$val, am_noindex:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_8 GPR64:$val, am_noindex:$addr), - (STXRX GPR64:$val, am_noindex:$addr)>; - -def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), am_noindex:$addr), - (STXRB GPR32:$val, am_noindex:$addr)>; -def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), am_noindex:$addr), - (STXRH GPR32:$val, am_noindex:$addr)>; -def : Pat<(stxr_4 (zext GPR32:$val), am_noindex:$addr), - (STXRW GPR32:$val, am_noindex:$addr)>; - -def : Pat<(stxr_1 (and GPR64:$val, 0xff), am_noindex:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_2 (and GPR64:$val, 0xffff), am_noindex:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), am_noindex:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; - -// Store-release-exclusives. 
- -def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - - -def : Pat<(stlxr_1 GPR64:$val, am_noindex:$addr), - (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stlxr_2 GPR64:$val, am_noindex:$addr), - (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stlxr_4 GPR64:$val, am_noindex:$addr), - (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stlxr_8 GPR64:$val, am_noindex:$addr), - (STLXRX GPR64:$val, am_noindex:$addr)>; - -def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), am_noindex:$addr), - (STLXRB GPR32:$val, am_noindex:$addr)>; -def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), am_noindex:$addr), - (STLXRH GPR32:$val, am_noindex:$addr)>; -def : Pat<(stlxr_4 (zext GPR32:$val), am_noindex:$addr), - (STLXRW GPR32:$val, am_noindex:$addr)>; - -def : Pat<(stlxr_1 (and GPR64:$val, 0xff), am_noindex:$addr), - (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), am_noindex:$addr), - (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), am_noindex:$addr), - (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; - - -// And clear exclusive. - -def : Pat<(int_arm64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td deleted file mode 100644 index 0ac27e09358c..000000000000 --- a/lib/Target/ARM64/ARM64InstrFormats.td +++ /dev/null @@ -1,8529 +0,0 @@ -//===- ARM64InstrFormats.td - ARM64 Instruction Formats ------*- tblgen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Describe ARM64 instructions format here -// - -// Format specifies the encoding used by the instruction. This is part of the -// ad-hoc solution used to emit machine instruction encodings by our machine -// code emitter. -class Format val> { - bits<2> Value = val; -} - -def PseudoFrm : Format<0>; -def NormalFrm : Format<1>; // Do we need any others? - -// ARM64 Instruction Format -class ARM64Inst : Instruction { - field bits<32> Inst; // Instruction encoding. - // Mask of bits that cause an encoding to be UNPREDICTABLE. - // If a bit is set, then if the corresponding bit in the - // target encoding differs from its value in the "Inst" field, - // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). - field bits<32> Unpredictable = 0; - // SoftFail is the generic name for this field, but we alias it so - // as to make it more obvious what it means in ARM-land. 
- field bits<32> SoftFail = Unpredictable; - let Namespace = "ARM64"; - Format F = f; - bits<2> Form = F.Value; - let Pattern = []; - let Constraints = cstr; -} - -// Pseudo instructions (don't have encoding information) -class Pseudo pattern, string cstr = ""> - : ARM64Inst { - dag OutOperandList = oops; - dag InOperandList = iops; - let Pattern = pattern; - let isCodeGenOnly = 1; -} - -// Real instructions (have encoding information) -class EncodedI pattern> : ARM64Inst { - let Pattern = pattern; - let Size = 4; -} - -// Normal instructions -class I pattern> - : EncodedI { - dag OutOperandList = oops; - dag InOperandList = iops; - let AsmString = !strconcat(asm, operands); -} - -class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; -class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; -class UnOpFrag : PatFrag<(ops node:$LHS), res>; - -// Helper fragment for an extract of the high portion of a 128-bit vector. -def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; -def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; -def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; - -//===----------------------------------------------------------------------===// -// Asm Operand Classes. -// - -// Shifter operand for arithmetic shifted encodings. -def ShifterOperand : AsmOperandClass { - let Name = "Shifter"; -} - -// Shifter operand for mov immediate encodings. -def MovImm32ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm32Shifter"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "InvalidMovImm32Shift"; -} -def MovImm64ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm64Shifter"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "InvalidMovImm64Shift"; -} - -// Shifter operand for arithmetic register shifted encodings. -class ArithmeticShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "ArithmeticShifter" # width; - let PredicateMethod = "isArithmeticShifter<" # width # ">"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "AddSubRegShift" # width; -} - -def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; -def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; - -// Shifter operand for logical register shifted encodings. -class LogicalShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalShifter" # width; - let PredicateMethod = "isLogicalShifter<" # width # ">"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "AddSubRegShift" # width; -} - -def LogicalShifterOperand32 : LogicalShifterOperand<32>; -def LogicalShifterOperand64 : LogicalShifterOperand<64>; - -// Shifter operand for logical vector 128/64-bit shifted encodings. -def LogicalVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalVecShifter"; - let RenderMethod = "addShifterOperands"; -} -def LogicalVecHalfWordShifterOperand : AsmOperandClass { - let SuperClasses = [LogicalVecShifterOperand]; - let Name = "LogicalVecHalfWordShifter"; - let RenderMethod = "addShifterOperands"; -} - -// The "MSL" shifter on the vector MOVI instruction. 
-def MoveVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MoveVecShifter"; - let RenderMethod = "addShifterOperands"; -} - -// Extend operand for arithmetic encodings. -def ExtendOperand : AsmOperandClass { - let Name = "Extend"; - let DiagnosticType = "AddSubRegExtendLarge"; -} -def ExtendOperand64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "Extend64"; - let DiagnosticType = "AddSubRegExtendSmall"; -} -// 'extend' that's a lsl of a 64-bit register. -def ExtendOperandLSL64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "ExtendLSL64"; - let RenderMethod = "addExtend64Operands"; - let DiagnosticType = "AddSubRegExtendLarge"; -} - -// 8-bit floating-point immediate encodings. -def FPImmOperand : AsmOperandClass { - let Name = "FPImm"; - let ParserMethod = "tryParseFPImm"; - let DiagnosticType = "InvalidFPImm"; -} - -def CondCode : AsmOperandClass { - let Name = "CondCode"; - let DiagnosticType = "InvalidCondCode"; -} - -// A 32-bit register pasrsed as 64-bit -def GPR32as64Operand : AsmOperandClass { - let Name = "GPR32as64"; -} -def GPR32as64 : RegisterOperand { - let ParserMatchClass = GPR32as64Operand; -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit value 'abcdefgh'. -def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } - - -//===----------------------------------------------------------------------===// -// Operand Definitions. -// - -// ADR[P] instruction labels. -def AdrpOperand : AsmOperandClass { - let Name = "AdrpLabel"; - let ParserMethod = "tryParseAdrpLabel"; - let DiagnosticType = "InvalidLabel"; -} -def adrplabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let PrintMethod = "printAdrpLabel"; - let ParserMatchClass = AdrpOperand; -} - -def AdrOperand : AsmOperandClass { - let Name = "AdrLabel"; - let ParserMethod = "tryParseAdrLabel"; - let DiagnosticType = "InvalidLabel"; -} -def adrlabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let ParserMatchClass = AdrOperand; -} - -// simm9 predicate - True if the immediate is in the range [-256, 255]. -def SImm9Operand : AsmOperandClass { - let Name = "SImm9"; - let DiagnosticType = "InvalidMemoryIndexedSImm9"; -} -def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { - let ParserMatchClass = SImm9Operand; -} - -// simm7s4 predicate - True if the immediate is a multiple of 4 in the range -// [-256, 252]. -def SImm7s4Operand : AsmOperandClass { - let Name = "SImm7s4"; - let DiagnosticType = "InvalidMemoryIndexed32SImm7"; -} -def simm7s4 : Operand { - let ParserMatchClass = SImm7s4Operand; - let PrintMethod = "printImmScale<4>"; -} - -// simm7s8 predicate - True if the immediate is a multiple of 8 in the range -// [-512, 504]. -def SImm7s8Operand : AsmOperandClass { - let Name = "SImm7s8"; - let DiagnosticType = "InvalidMemoryIndexed64SImm7"; -} -def simm7s8 : Operand { - let ParserMatchClass = SImm7s8Operand; - let PrintMethod = "printImmScale<8>"; -} - -// simm7s16 predicate - True if the immediate is a multiple of 16 in the range -// [-1024, 1008]. 
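The simm7s4/simm7s8/simm7s16 operands above store the byte offset divided by the access size, so the quoted ranges ([-256, 252], [-512, 504], [-1024, 1008]) are just a signed 7-bit field times the scale. A small C++ sketch of that check (the helper name and signature are illustrative, not from the patch):

    #include <cstdint>
    #include <optional>

    // Hedged sketch: the 7-bit field value for a scaled signed-7-bit offset
    // (the simm7s4/s8/s16 style operands), or nothing if the byte offset is
    // not a multiple of the scale or is out of range.
    std::optional<int64_t> encodeSImm7Scaled(int64_t byteOffset, int64_t scale) {
      if (byteOffset % scale != 0)
        return std::nullopt;
      int64_t imm = byteOffset / scale;
      if (imm < -64 || imm > 63)     // signed 7-bit: [-64, 63]
        return std::nullopt;
      return imm;                    // e.g. scale 8: [-512, 504] in bytes
    }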
-def SImm7s16Operand : AsmOperandClass { - let Name = "SImm7s16"; - let DiagnosticType = "InvalidMemoryIndexed64SImm7"; -} -def simm7s16 : Operand { - let ParserMatchClass = SImm7s16Operand; - let PrintMethod = "printImmScale<16>"; -} - -class AsmImmRange : AsmOperandClass { - let Name = "Imm" # Low # "_" # High; - let DiagnosticType = "InvalidImm" # Low # "_" # High; -} - -def Imm1_8Operand : AsmImmRange<1, 8>; -def Imm1_16Operand : AsmImmRange<1, 16>; -def Imm1_32Operand : AsmImmRange<1, 32>; -def Imm1_64Operand : AsmImmRange<1, 64>; - -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g3 : Operand { - let ParserMatchClass = MovZSymbolG3AsmOperand; -} - -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g2 : Operand { - let ParserMatchClass = MovZSymbolG2AsmOperand; -} - -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g1 : Operand { - let ParserMatchClass = MovZSymbolG1AsmOperand; -} - -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g0 : Operand { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g3 : Operand { - let ParserMatchClass = MovKSymbolG3AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand { - let ParserMatchClass = MovKSymbolG0AsmOperand; -} - -class fixedpoint_i32 - : Operand, - ComplexPattern", [fpimm, ld]> { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm32"; - let ParserMatchClass = Imm1_32Operand; -} - -class fixedpoint_i64 - : Operand, - ComplexPattern", [fpimm, ld]> { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm64"; - let ParserMatchClass = Imm1_64Operand; -} - -def fixedpoint_f32_i32 : fixedpoint_i32; -def fixedpoint_f64_i32 : fixedpoint_i32; - -def fixedpoint_f32_i64 : fixedpoint_i64; -def fixedpoint_f64_i64 : fixedpoint_i64; - -def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR8OpValue"; - let DecoderMethod = "DecodeVecShiftR8Imm"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16Imm"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = 
"getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32Imm"; - let ParserMatchClass = Imm1_32Operand; -} -def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64Imm"; - let ParserMatchClass = Imm1_64Operand; -} -def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; - let ParserMatchClass = Imm1_32Operand; -} - -def Imm0_7Operand : AsmImmRange<0, 7>; -def Imm0_15Operand : AsmImmRange<0, 15>; -def Imm0_31Operand : AsmImmRange<0, 31>; -def Imm0_63Operand : AsmImmRange<0, 63>; - -def vecshiftL8 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL8OpValue"; - let DecoderMethod = "DecodeVecShiftL8Imm"; - let ParserMatchClass = Imm0_7Operand; -} -def vecshiftL16 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL16OpValue"; - let DecoderMethod = "DecodeVecShiftL16Imm"; - let ParserMatchClass = Imm0_15Operand; -} -def vecshiftL32 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL32OpValue"; - let DecoderMethod = "DecodeVecShiftL32Imm"; - let ParserMatchClass = Imm0_31Operand; -} -def vecshiftL64 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL64OpValue"; - let DecoderMethod = "DecodeVecShiftL64Imm"; - let ParserMatchClass = Imm0_63Operand; -} - - -// Crazy immediate formats used by 32-bit and 64-bit logical immediate -// instructions for splatting repeating bit patterns across the immediate. -def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; -def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; - -def LogicalImm32Operand : AsmOperandClass { - let Name = "LogicalImm32"; - let DiagnosticType = "LogicalSecondSource"; -} -def LogicalImm64Operand : AsmOperandClass { - let Name = "LogicalImm64"; - let DiagnosticType = "LogicalSecondSource"; -} -def logical_imm32 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 32); -}], logical_imm32_XFORM> { - let PrintMethod = "printLogicalImm32"; - let ParserMatchClass = LogicalImm32Operand; -} -def logical_imm64 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 64); -}], logical_imm64_XFORM> { - let PrintMethod = "printLogicalImm64"; - let ParserMatchClass = LogicalImm64Operand; -} - -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. -def Imm0_65535Operand : AsmImmRange<0, 65535>; -def imm0_65535 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printHexImm"; -} - -// imm0_255 predicate - True if the immediate is in the range [0,255]. 
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } -def imm0_255 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_255Operand; - let PrintMethod = "printHexImm"; -} - -// imm0_127 predicate - True if the immediate is in the range [0,127] -def Imm0_127Operand : AsmImmRange<0, 127>; -def imm0_127 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_127Operand; - let PrintMethod = "printHexImm"; -} - -// NOTE: These imm0_N operands have to be of type i64 because i64 is the size -// for all shift-amounts. - -// imm0_63 predicate - True if the immediate is in the range [0,63] -def imm0_63 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_63Operand; -} - -// imm0_31 predicate - True if the immediate is in the range [0,31] -def imm0_31 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_31Operand; -} - -// imm0_15 predicate - True if the immediate is in the range [0,15] -def imm0_15 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_15Operand; -} - -// imm0_7 predicate - True if the immediate is in the range [0,7] -def imm0_7 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_7Operand; -} - -// An arithmetic shifter operand: -// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr -// {5-0} - imm6 -class arith_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = !cast( - "ArithmeticShifterOperand" # width); -} - -def arith_shift32 : arith_shift; -def arith_shift64 : arith_shift; - -class arith_shifted_reg - : Operand, - ComplexPattern { - let PrintMethod = "printShiftedRegister"; - let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); -} - -def arith_shifted_reg32 : arith_shifted_reg; -def arith_shifted_reg64 : arith_shifted_reg; - -// An arithmetic shifter operand: -// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror -// {5-0} - imm6 -class logical_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = !cast( - "LogicalShifterOperand" # width); -} - -def logical_shift32 : logical_shift<32>; -def logical_shift64 : logical_shift<64>; - -class logical_shifted_reg - : Operand, - ComplexPattern { - let PrintMethod = "printShiftedRegister"; - let MIOperandInfo = (ops regclass, shiftop); -} - -def logical_shifted_reg32 : logical_shifted_reg; -def logical_shifted_reg64 : logical_shifted_reg; - -// A logical vector shifter operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0, #8, #16, or #24 -def logical_vec_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getVecShifterOpValue"; - let ParserMatchClass = LogicalVecShifterOperand; -} - -// A logical vector half-word shifter operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0 or #8 -def logical_vec_hw_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getVecShifterOpValue"; - let ParserMatchClass = LogicalVecHalfWordShifterOperand; -} - -// A vector move shifter operand: -// {0} - imm1: #8 or #16 -def move_vec_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getMoveVecShifterOpValue"; - let ParserMatchClass = MoveVecShifterOperand; -} - -def AddSubImmOperand : AsmOperandClass { - let Name = "AddSubImm"; - let ParserMethod = "tryParseAddSubImm"; - let DiagnosticType = "AddSubSecondSource"; -} -// An ADD/SUB immediate shifter operand: -// second operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0 or #12 -class addsub_shifted_imm - : Operand, ComplexPattern { - let PrintMethod = "printAddSubImm"; - let EncoderMethod = "getAddSubImmOpValue"; - let ParserMatchClass = 
AddSubImmOperand; - let MIOperandInfo = (ops i32imm, i32imm); -} - -def addsub_shifted_imm32 : addsub_shifted_imm; -def addsub_shifted_imm64 : addsub_shifted_imm; - -class neg_addsub_shifted_imm - : Operand, ComplexPattern { - let PrintMethod = "printAddSubImm"; - let EncoderMethod = "getAddSubImmOpValue"; - let ParserMatchClass = AddSubImmOperand; - let MIOperandInfo = (ops i32imm, i32imm); -} - -def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; -def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; - -// An extend operand: -// {5-3} - extend type -// {2-0} - imm3 -def arith_extend : Operand { - let PrintMethod = "printExtend"; - let ParserMatchClass = ExtendOperand; -} -def arith_extend64 : Operand { - let PrintMethod = "printExtend"; - let ParserMatchClass = ExtendOperand64; -} - -// 'extend' that's a lsl of a 64-bit register. -def arith_extendlsl64 : Operand { - let PrintMethod = "printExtend"; - let ParserMatchClass = ExtendOperandLSL64; -} - -class arith_extended_reg32 : Operand, - ComplexPattern { - let PrintMethod = "printExtendedRegister"; - let MIOperandInfo = (ops GPR32, arith_extend); -} - -class arith_extended_reg32to64 : Operand, - ComplexPattern { - let PrintMethod = "printExtendedRegister"; - let MIOperandInfo = (ops GPR32, arith_extend64); -} - -// Floating-point immediate. -def fpimm32 : Operand, - PatLeaf<(f32 fpimm), [{ - return ARM64_AM::getFP32Imm(N->getValueAPF()) != -1; - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::getFP32Imm(InVal); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} -def fpimm64 : Operand, - PatLeaf<(f64 fpimm), [{ - return ARM64_AM::getFP64Imm(N->getValueAPF()) != -1; - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::getFP64Imm(InVal); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} - -def fpimm8 : Operand { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} - -def fpimm0 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(+0.0); -}]>; - -// Vector lane operands -class AsmVectorIndex : AsmOperandClass { - let Name = "VectorIndex" # Suffix; - let DiagnosticType = "InvalidIndex" # Suffix; -} -def VectorIndex1Operand : AsmVectorIndex<"1">; -def VectorIndexBOperand : AsmVectorIndex<"B">; -def VectorIndexHOperand : AsmVectorIndex<"H">; -def VectorIndexSOperand : AsmVectorIndex<"S">; -def VectorIndexDOperand : AsmVectorIndex<"D">; - -def VectorIndex1 : Operand, ImmLeaf { - let ParserMatchClass = VectorIndex1Operand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexB : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexBOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexH : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexHOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexS : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexSOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexD : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexDOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit 
value 'abcdefgh'. -def simdimmtype10 : Operand, - PatLeaf<(f64 fpimm), [{ - return ARM64_AM::isAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = SIMDImmType10Operand; - let PrintMethod = "printSIMDType10Operand"; -} - - -//--- -// System management -//--- - -// Base encoding for system instruction operands. -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class BaseSystemI - : I { - let Inst{31-22} = 0b1101010100; - let Inst{21} = L; -} - -// System instructions which do not have an Rt register. -class SimpleSystemI - : BaseSystemI { - let Inst{4-0} = 0b11111; -} - -// System instructions which have an Rt register. -class RtSystemI - : BaseSystemI, - Sched<[WriteSys]> { - bits<5> Rt; - let Inst{4-0} = Rt; -} - -// Hint instructions that take both a CRm and a 3-bit immediate. -class HintI - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, - Sched<[WriteHint]> { - bits <7> imm; - let Inst{20-12} = 0b000110010; - let Inst{11-5} = imm; -} - -// System instructions taking a single literal operand which encodes into -// CRm. op2 differentiates the opcodes. -def BarrierAsmOperand : AsmOperandClass { - let Name = "Barrier"; - let ParserMethod = "tryParseBarrierOperand"; -} -def barrier_op : Operand { - let PrintMethod = "printBarrierOption"; - let ParserMatchClass = BarrierAsmOperand; -} -class CRmSystemI opc, string asm> - : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, - Sched<[WriteBarrier]> { - bits<4> CRm; - let Inst{20-12} = 0b000110011; - let Inst{11-8} = CRm; - let Inst{7-5} = opc; -} - -// MRS/MSR system instructions. These have different operand classes because -// a different subset of registers can be accessed through each instruction. -def MRSSystemRegisterOperand : AsmOperandClass { - let Name = "MRSSystemRegister"; - let ParserMethod = "tryParseSysReg"; - let DiagnosticType = "MRS"; -} -// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. -def mrs_sysreg_op : Operand { - let ParserMatchClass = MRSSystemRegisterOperand; - let DecoderMethod = "DecodeMRSSystemRegister"; - let PrintMethod = "printMRSSystemRegister"; -} - -def MSRSystemRegisterOperand : AsmOperandClass { - let Name = "MSRSystemRegister"; - let ParserMethod = "tryParseSysReg"; - let DiagnosticType = "MSR"; -} -def msr_sysreg_op : Operand { - let ParserMatchClass = MSRSystemRegisterOperand; - let DecoderMethod = "DecodeMSRSystemRegister"; - let PrintMethod = "printMSRSystemRegister"; -} - -class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), - "mrs", "\t$Rt, $systemreg"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -// FIXME: Some of these def NZCV, others don't. Best way to model that? -// Explicitly modeling each of the system register as a register class -// would do it, but feels like overkill at this point. 
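The MRS/MSR operand above is described as the "concatenation of 1, op0, op1, CRn, CRm, op2" flattened into a 16-bit immediate, whose low 15 bits land in Inst{19-5}. A small packing sketch under that reading (the field widths are my reading of the comment, the function is illustrative, and the architectural op0 of 2 or 3 is assumed to be carried as a single bit op0-2):

    #include <cstdint>

    // Hedged sketch: 1 : o0 : op1 : CRn : CRm : op2 (1+1+3+4+4+3 bits).
    uint16_t packSysReg(unsigned o0, unsigned op1, unsigned crn,
                        unsigned crm, unsigned op2) {
      return (1u << 15) | ((o0 & 0x1) << 14) | ((op1 & 0x7) << 11) |
             ((crn & 0xf) << 7) | ((crm & 0xf) << 3) | (op2 & 0x7);
    }

    // Assumed example using the usual architectural numbering: NZCV is
    // op0=3, op1=3, CRn=4, CRm=2, op2=0, i.e. packSysReg(1, 3, 4, 2, 0).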
-class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), - "msr", "\t$systemreg, $Rt"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; - let ParserMethod = "tryParseSysReg"; -} -def pstatefield_op : Operand { - let ParserMatchClass = SystemPStateFieldOperand; - let PrintMethod = "printSystemPStateField"; -} - -let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, - Sched<[WriteSys]> { - bits<6> pstatefield; - bits<4> imm; - let Inst{20-19} = 0b00; - let Inst{18-16} = pstatefield{5-3}; - let Inst{15-12} = 0b0100; - let Inst{11-8} = imm; - let Inst{7-5} = pstatefield{2-0}; - - let DecoderMethod = "DecodeSystemPStateInstruction"; -} - -// SYS and SYSL generic system instructions. -def SysCRAsmOperand : AsmOperandClass { - let Name = "SysCR"; - let ParserMethod = "tryParseSysCROperand"; -} - -def sys_cr_op : Operand { - let PrintMethod = "printSysCROperand"; - let ParserMatchClass = SysCRAsmOperand; -} - -class SystemXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - -class SystemLXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - - -// Branch (register) instructions: -// -// case opc of -// 0001 blr -// 0000 br -// 0101 dret -// 0100 eret -// 0010 ret -// otherwise UNDEFINED -class BaseBranchReg opc, dag oops, dag iops, string asm, - string operands, list pattern> - : I, Sched<[WriteBrReg]> { - let Inst{31-25} = 0b1101011; - let Inst{24-21} = opc; - let Inst{20-16} = 0b11111; - let Inst{15-10} = 0b000000; - let Inst{4-0} = 0b00000; -} - -class BranchReg opc, string asm, list pattern> - : BaseBranchReg { - bits<5> Rn; - let Inst{9-5} = Rn; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in -class SpecialReturn opc, string asm> - : BaseBranchReg { - let Inst{9-5} = 0b11111; -} - -//--- -// Conditional branch instruction. -//--- - -// Condition code. -// 4-bit immediate. Pretty-printed as -def ccode : Operand { - let PrintMethod = "printCondCode"; - let ParserMatchClass = CondCode; -} -def inv_ccode : Operand { - let PrintMethod = "printInverseCondCode"; - let ParserMatchClass = CondCode; -} - -// Conditional branch target. 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. -def PCRelLabel19Operand : AsmOperandClass { - let Name = "PCRelLabel19"; - let DiagnosticType = "InvalidLabel"; -} -def am_brcond : Operand { - let EncoderMethod = "getCondBranchTargetOpValue"; - let DecoderMethod = "DecodePCRelLabel19"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PCRelLabel19Operand; -} - -class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), - "b", ".$cond\t$target", "", - [(ARM64brcond bb:$target, imm:$cond, NZCV)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - let Uses = [NZCV]; - - bits<4> cond; - bits<19> target; - let Inst{31-24} = 0b01010100; - let Inst{23-5} = target; - let Inst{4} = 0; - let Inst{3-0} = cond; -} - -//--- -// Compare-and-branch instructions. 
-//--- -class BaseCmpBranch - : I<(outs), (ins regtype:$Rt, am_brcond:$target), - asm, "\t$Rt, $target", "", - [(node regtype:$Rt, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<19> target; - let Inst{30-25} = 0b011010; - let Inst{24} = op; - let Inst{23-5} = target; - let Inst{4-0} = Rt; -} - -multiclass CmpBranch { - def W : BaseCmpBranch { - let Inst{31} = 0; - } - def X : BaseCmpBranch { - let Inst{31} = 1; - } -} - -//--- -// Test-bit-and-branch instructions. -//--- -// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of -// the target offset are implied zero and so are not part of the immediate. -def BranchTarget14Operand : AsmOperandClass { - let Name = "BranchTarget14"; -} -def am_tbrcond : Operand { - let EncoderMethod = "getTestBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget14Operand; -} - -// AsmOperand classes to emit (or not) special diagnostics -def TBZImm0_31Operand : AsmOperandClass { - let Name = "TBZImm0_31"; - let PredicateMethod = "isImm0_31"; - let RenderMethod = "addImm0_31Operands"; -} -def TBZImm32_63Operand : AsmOperandClass { - let Name = "Imm32_63"; - let DiagnosticType = "InvalidImm0_63"; -} - -class tbz_imm0_31 : Operand, ImmLeaf { - let ParserMatchClass = matcher; -} - -def tbz_imm0_31_diag : tbz_imm0_31; -def tbz_imm0_31_nodiag : tbz_imm0_31; - -def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); -}]> { - let ParserMatchClass = TBZImm32_63Operand; -} - -class BaseTestBranch - : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), - asm, "\t$Rt, $bit_off, $target", "", - [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<6> bit_off; - bits<14> target; - - let Inst{30-25} = 0b011011; - let Inst{24} = op; - let Inst{23-19} = bit_off{4-0}; - let Inst{18-5} = target; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeTestAndBranch"; -} - -multiclass TestBranch { - def W : BaseTestBranch { - let Inst{31} = 0; - } - - def X : BaseTestBranch { - let Inst{31} = 1; - } - - // Alias X-reg with 0-31 imm to W-Reg. - def : InstAlias(NAME#"W") GPR32as64:$Rd, - tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; - def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), - (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), - tbz_imm0_31_diag:$imm, bb:$target)>; -} - -//--- -// Unconditional branch (immediate) instructions. -//--- -def BranchTarget26Operand : AsmOperandClass { - let Name = "BranchTarget26"; - let DiagnosticType = "InvalidLabel"; -} -def am_b_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget26Operand; -} -def am_bl_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget26Operand; -} - -class BImm pattern> - : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { - bits<26> addr; - let Inst{31} = op; - let Inst{30-26} = 0b00101; - let Inst{25-0} = addr; - - let DecoderMethod = "DecodeUnconditionalBranch"; -} - -class BranchImm pattern> - : BImm; -class CallImm pattern> - : BImm; - -//--- -// Basic one-operand data processing instructions. 
-//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandData opc, RegisterClass regtype, string asm, - SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set regtype:$Rd, (node regtype:$Rn))]>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - - let Inst{30-13} = 0b101101011000000000; - let Inst{12-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass OneOperandData opc, string asm, - SDPatternOperator node = null_frag> { - def Wr : BaseOneOperandData { - let Inst{31} = 0; - } - - def Xr : BaseOneOperandData { - let Inst{31} = 1; - } -} - -class OneWRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 0; -} - -class OneXRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 1; -} - -//--- -// Basic two-operand data processing instructions. -//--- -class BaseBaseAddSubCarry pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30} = isSub; - let Inst{28-21} = 0b11010000; - let Inst{20-16} = Rm; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseAddSubCarry - : BaseBaseAddSubCarry; - -class BaseAddSubCarrySetFlags - : BaseBaseAddSubCarry { - let Defs = [NZCV]; -} - -multiclass AddSubCarry { - def Wr : BaseAddSubCarry { - let Inst{31} = 0; - let Inst{29} = 0; - } - def Xr : BaseAddSubCarry { - let Inst{31} = 1; - let Inst{29} = 0; - } - - // Sets flags. - def SWr : BaseAddSubCarrySetFlags { - let Inst{31} = 0; - let Inst{29} = 1; - } - def SXr : BaseAddSubCarrySetFlags { - let Inst{31} = 1; - let Inst{29} = 1; - } -} - -class BaseTwoOperand opc, RegisterClass regtype, string asm, - SDPatternOperator OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-14} = 0b00; - let Inst{13-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseDiv - : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { - let Inst{10} = isSigned; -} - -multiclass Div { - def Wr : BaseDiv, - Sched<[WriteID32, ReadID, ReadID]> { - let Inst{31} = 0; - } - def Xr : BaseDiv, - Sched<[WriteID64, ReadID, ReadID]> { - let Inst{31} = 1; - } -} - -class BaseShift shift_type, RegisterClass regtype, string asm, - SDPatternOperator OpNode = null_frag> - : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, - Sched<[WriteIS, ReadI]> { - let Inst{11-10} = shift_type; -} - -multiclass Shift shift_type, string asm, SDNode OpNode> { - def Wr : BaseShift { - let Inst{31} = 0; - } - - def Xr : BaseShift { - let Inst{31} = 1; - } - - def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), - (!cast(NAME # "Wr") GPR32:$Rn, - (EXTRACT_SUBREG i64:$Rm, sub_32))>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; -} - -class ShiftAlias - : InstAlias; - -class BaseMulAccum opc, RegisterClass multype, - RegisterClass addtype, string asm, - list 
pattern> - : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{30-24} = 0b0011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass MulAccum { - def Wrrr : BaseMulAccum, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 0; - } - - def Xrrr : BaseMulAccum, - Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 1; - } -} - -class WideMulAccum opc, string asm, - SDNode AccNode, SDNode ExtNode> - : BaseMulAccum, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 1; -} - -class MulHi opc, string asm, SDNode OpNode> - : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, - Sched<[WriteIM64, ReadIM, ReadIM]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-24} = 0b10011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 - // (i.e. all bits 1) but is ignored by the processor. - let PostEncoderMethod = "fixMulHigh"; -} - -class MulAccumWAlias - : InstAlias; -class MulAccumXAlias - : InstAlias; -class WideMulAccumAlias - : InstAlias; - -class BaseCRC32 sz, bit C, RegisterClass StreamReg, - SDPatternOperator OpNode, string asm> - : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = sf; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-13} = 0b010; - let Inst{12} = C; - let Inst{11-10} = sz; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let Predicates = [HasCRC]; -} - -//--- -// Address generation. -//--- - -class ADRI pattern> - : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", - pattern>, - Sched<[WriteI]> { - bits<5> Xd; - bits<21> label; - let Inst{31} = page; - let Inst{30-29} = label{1-0}; - let Inst{28-24} = 0b10000; - let Inst{23-5} = label{20-2}; - let Inst{4-0} = Xd; - - let DecoderMethod = "DecodeAdrInstruction"; -} - -//--- -// Move immediate. 
-//--- - -def movimm32_imm : Operand { - let ParserMatchClass = Imm0_65535Operand; - let EncoderMethod = "getMoveWideImmOpValue"; - let PrintMethod = "printHexImm"; -} -def movimm32_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm32ShifterOperand; -} -def movimm64_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm64ShifterOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "", []>, - Sched<[WriteImm]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass MoveImmediate opc, string asm> { - def Wi : BaseMoveImmediate { - let Inst{31} = 0; - } - - def Xi : BaseMoveImmediate { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), - (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass InsertImmediate opc, string asm> { - def Wi : BaseInsertImmediate { - let Inst{31} = 0; - } - - def Xi : BaseInsertImmediate { - let Inst{31} = 1; - } -} - -//--- -// Add/Subtract -//--- - -class BaseAddSubImm - : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", - [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<14> imm; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b10001; - let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 - let Inst{21-10} = imm{11-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let DecoderMethod = "DecodeBaseAddSubImm"; -} - -class BaseAddSubRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI, ReadI, ReadI]>; - -class BaseAddSubSReg - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
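The move-immediate formats above carry a 16-bit payload plus a two-bit "hw" shift (bits 22-21, selecting LSL #0/#16/#32/#48), so a 64-bit constant is materialised 16 bits at a time: one MOVZ followed by MOVKs for the remaining non-zero chunks. A rough C++ sketch of that split (the function and its printf output are stand-ins, not anything from this patch; a real materialiser would also consider MOVN and logical immediates):

    #include <cstdint>
    #include <cstdio>

    // Hedged sketch: split a 64-bit constant into MOVZ/MOVK chunks, each a
    // 16-bit immediate with an LSL of 0, 16, 32 or 48.
    void materialize64(uint64_t value) {
      bool first = true;
      for (unsigned shift = 0; shift < 64; shift += 16) {
        uint64_t chunk = (value >> shift) & 0xffff;
        if (chunk == 0)
          continue;                 // MOVZ already zeroes untouched chunks
        std::printf("%s x0, #0x%llx, lsl #%u\n", first ? "movz" : "movk",
                    (unsigned long long)chunk, shift);
        first = false;
      }
      if (first)
        std::printf("movz x0, #0\n");   // the constant was zero
    }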
- bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-22} = shift{7-6}; - let Inst{21} = 0; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -class BaseAddSubEReg - : I<(outs dstRegtype:$R1), - (ins src1Regtype:$R2, src2Regtype:$R3), - asm, "\t$R1, $R2, $R3", "", - [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, - Sched<[WriteIEReg, ReadI, ReadIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15-13} = ext{5-3}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseAddSubEReg64 - : I<(outs dstRegtype:$Rd), - (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), - asm, "\t$Rd, $Rn, $Rm$ext", "", []>, - Sched<[WriteIEReg, ReadI, ReadIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15} = ext{5}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -// Aliases for register+register add/subtract. -class AddSubRegAlias - : InstAlias; - -multiclass AddSub { - let hasSideEffects = 0 in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - Only used for CodeGen - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - } - - // Add/Subtract extended register - let AddedComplexity = 1, hasSideEffects = 0 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when either the destination or - // first source register is SP. 
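The add/subtract immediate form above is a 12-bit unsigned value with an optional LSL #12 (that is what the imm{13-12} shift selector in BaseAddSubImm encodes), so a constant can be used directly only if it fits in the low 12 bits, or in bits 12-23 with the low 12 bits clear. A quick C++ predicate for that (the helper name is mine):

    #include <cstdint>

    // Hedged sketch: can `value` be used directly as an ADD/SUB immediate?
    // Encodable forms are uimm12 (lsl #0) and uimm12 << 12 (lsl #12).
    bool isAddSubImm(uint64_t value) {
      return (value & ~0xfffULL) == 0 ||            // fits in 12 bits
             (value & ~(0xfffULL << 12)) == 0;      // 12 bits, lsl #12
    }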
- def : AddSubRegAlias(NAME#"Wrx"), - GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Wrx"), - GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 -} - -multiclass AddSubS { - let isCompare = 1, Defs = [NZCV] in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - - // Add/Subtract extended register - let AddedComplexity = 1 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - } // Defs = [NZCV] - - // Compare aliases - def : InstAlias(NAME#"Wri") - WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias(NAME#"Xri") - XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias(NAME#"Wrx") - WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias(NAME#"Xrx") - XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias(NAME#"Xrx64") - XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; - - // Compare shorthands - def : InstAlias(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when the first source register - // is SP. - def : AddSubRegAlias(NAME#"Wrx"), - GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 -} - -//--- -// Extract -//--- -def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisPtrTy<3>]>; -def ARM64Extr : SDNode<"ARM64ISD::EXTR", SDTA64EXTR>; - -class BaseExtractImm patterns> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), - asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, - Sched<[WriteExtr, ReadExtrHi]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> imm; - - let Inst{30-23} = 0b00100111; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15-10} = imm; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ExtractImm { - def Wrri : BaseExtractImm { - let Inst{31} = 0; - let Inst{22} = 0; - // imm<5> must be zero. 
- let imm{5} = 0; - } - def Xrri : BaseExtractImm { - - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Bitfield -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImm opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "", []>, - Sched<[WriteIS, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImm opc, string asm> { - def Wri : BaseBitfieldImm { - let Inst{31} = 0; - let Inst{22} = 0; - // imms<5> and immr<5> must be zero, else ReservedValue(). - let Inst{21} = 0; - let Inst{15} = 0; - } - def Xri : BaseBitfieldImm { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImmWith2RegArgs opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, - imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, - Sched<[WriteIS, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImmWith2RegArgs opc, string asm> { - def Wri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 0; - let Inst{22} = 0; - // imms<5> and immr<5> must be zero, else ReservedValue(). - let Inst{21} = 0; - let Inst{15} = 0; - } - def Xri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Logical -//--- - -// Logical (immediate) -class BaseLogicalImm opc, RegisterClass dregtype, - RegisterClass sregtype, Operand imm_type, string asm, - list pattern> - : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<13> imm; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100100; - let Inst{22} = imm{12}; - let Inst{21-16} = imm{11-6}; - let Inst{15-10} = imm{5-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeLogicalImmInstruction"; -} - -// Logical (shifted register) -class BaseLogicalSReg opc, bit N, RegisterClass regtype, - logical_shifted_reg shifted_regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30-29} = opc; - let Inst{28-24} = 0b01010; - let Inst{23-22} = shift{7-6}; - let Inst{21} = N; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -// Aliases for register+register logical instructions. 
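The bitfield-immediate format above takes raw immr/imms fields; the friendlier assembler spellings are aliases computed from them. Under the usual alias rule (which is architectural background, not stated in this patch), an unsigned extract of `width` bits starting at `lsb` is UBFM with immr = lsb and imms = lsb + width - 1. A small C++ helper under that assumption (names are mine):

    #include <cassert>

    // Hedged sketch: the UBFM immr/imms pair for a 64-bit UBFX-style
    // extract of `width` bits starting at bit `lsb`.
    struct BitfieldImmPair { unsigned immr, imms; };

    BitfieldImmPair ubfx64(unsigned lsb, unsigned width) {
      assert(width >= 1 && lsb + width <= 64 && "extract must stay in range");
      return BitfieldImmPair{lsb, lsb + width - 1};
    }

    // e.g. ubfx64(8, 16) -> immr = 8, imms = 23, i.e. "ubfm xd, xn, #8, #23".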
-class LogicalRegAlias - : InstAlias; - -let AddedComplexity = 6 in -multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } -} - -multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { - let isCompare = 1, Defs = [NZCV] in { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } - } // end Defs = [NZCV] -} - -class BaseLogicalRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI, ReadI, ReadI]>; - -// Split from LogicalImm as not all instructions have both. -multiclass LogicalReg opc, bit N, string mnemonic, - SDPatternOperator OpNode> { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -// Split from LogicalReg to allow setting NZCV Defs -multiclass LogicalRegS opc, bit N, string mnemonic, - SDPatternOperator OpNode = null_frag> { - let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - } // Defs = [NZCV] - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -//--- -// Conditionally set flags -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, - Sched<[WriteI, ReadI]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - - bits<5> Rn; - bits<5> imm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = imm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsImm { - def Wi : BaseCondSetFlagsImm { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsReg { - def Wr : BaseCondSetFlagsReg { - let Inst{31} = 0; - } - def Xr : BaseCondSetFlagsReg { - let Inst{31} = 1; - } -} - -//--- -// Conditional select -//--- - -class BaseCondSelect op2, RegisterClass regtype, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - - bits<5> Rd; - bits<5> Rn; - 
bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass CondSelect op2, string asm> { - def Wr : BaseCondSelect { - let Inst{31} = 0; - } - def Xr : BaseCondSelect { - let Inst{31} = 1; - } -} - -class BaseCondSelectOp op2, RegisterClass regtype, string asm, - PatFrag frag> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, (frag regtype:$Rm), - (i32 imm:$cond), NZCV))]>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); - return CurDAG->getTargetConstant(ARM64CC::getInvertedCondCode(CC), MVT::i32); -}]>; - -multiclass CondSelectOp op2, string asm, PatFrag frag> { - def Wr : BaseCondSelectOp { - let Inst{31} = 0; - } - def Xr : BaseCondSelectOp { - let Inst{31} = 1; - } - - def : Pat<(ARM64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), - (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, - (inv_cond_XFORM imm:$cond))>; - - def : Pat<(ARM64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), - (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, - (inv_cond_XFORM imm:$cond))>; -} - -//--- -// Special Mask Value -//--- -def maski8_or_more : Operand, - ImmLeaf { -} -def maski16_or_more : Operand, - ImmLeaf { -} - - -//--- -// Load/store -//--- - -// (unsigned immediate) -// Indexed for 8-bit registers. offset is in range [0,4095]. -def MemoryIndexed8Operand : AsmOperandClass { - let Name = "MemoryIndexed8"; - let DiagnosticType = "InvalidMemoryIndexed8"; -} -def am_indexed8 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed<8>"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed8Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 16-bit registers. offset is multiple of 2 in range [0,8190], -// stored as immval/2 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed16Operand : AsmOperandClass { - let Name = "MemoryIndexed16"; - let DiagnosticType = "InvalidMemoryIndexed16"; -} -def am_indexed16 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed<16>"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed16Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 32-bit registers. offset is multiple of 4 in range [0,16380], -// stored as immval/4 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed32Operand : AsmOperandClass { - let Name = "MemoryIndexed32"; - let DiagnosticType = "InvalidMemoryIndexed32"; -} -def am_indexed32 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed<32>"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed32Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 64-bit registers. offset is multiple of 8 in range [0,32760], -// stored as immval/8 (the 12-bit literal that encodes directly into the insn). 
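Each am_indexedN operand above stores the byte offset divided by the access size as an unsigned 12-bit field, which is where the quoted byte ranges ([0,4095], [0,8190], [0,16380], [0,32760], [0,65520]) come from. A C++ sketch of that scaling check (the helper is illustrative, not from the patch):

    #include <cstdint>
    #include <optional>

    // Hedged sketch: the uimm12 field for an unsigned scaled load/store
    // offset, or nothing if the byte offset is misaligned or too large.
    std::optional<uint64_t> encodeUImm12Scaled(uint64_t byteOffset,
                                               uint64_t accessBytes) {
      if (byteOffset % accessBytes != 0)
        return std::nullopt;
      uint64_t imm = byteOffset / accessBytes;
      if (imm > 4095)                 // unsigned 12-bit field
        return std::nullopt;
      return imm;                     // e.g. 8-byte access: [0, 32760] bytes
    }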
-def MemoryIndexed64Operand : AsmOperandClass { - let Name = "MemoryIndexed64"; - let DiagnosticType = "InvalidMemoryIndexed64"; -} -def am_indexed64 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed<64>"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed64Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 128-bit registers. offset is multiple of 16 in range [0,65520], -// stored as immval/16 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed128Operand : AsmOperandClass { - let Name = "MemoryIndexed128"; - let DiagnosticType = "InvalidMemoryIndexed128"; -} -def am_indexed128 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed<128>"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed128Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// No offset. -def MemoryNoIndexOperand : AsmOperandClass { let Name = "MemoryNoIndex"; } -def am_noindex : Operand, - ComplexPattern { - let PrintMethod = "printAMNoIndex"; - let ParserMatchClass = MemoryNoIndexOperand; - let MIOperandInfo = (ops GPR64sp:$base); -} - -class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - bits<5> dst; - - bits<17> addr; - bits<5> base = addr{4-0}; - bits<12> offset = addr{16-5}; - - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b01; - let Inst{23-22} = opc; - let Inst{21-10} = offset; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeUnsignedLdStInstruction"; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> - : BaseLoadStoreUI, - Sched<[WriteLD]>; - -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -class StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> - : BaseLoadStoreUI, - Sched<[WriteST]>; - -def PrefetchOperand : AsmOperandClass { - let Name = "Prefetch"; - let ParserMethod = "tryParsePrefetch"; -} -def prfop : Operand { - let PrintMethod = "printPrefetchOp"; - let ParserMatchClass = PrefetchOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> - : BaseLoadStoreUI, - Sched<[WriteLD]>; - -//--- -// Load literal -//--- - -// Load literal address: 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. 
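Because the low two bits of the target offset are implied zero, the 19-bit label field above covers a word-aligned window of roughly +/-1 MiB around the instruction, and the stored value is simply the byte offset divided by four. A C++ sketch of that encoding (the helper is illustrative):

    #include <cstdint>
    #include <optional>

    // Hedged sketch: encode a PC-relative byte offset into a 19-bit label
    // field (load-literal and conditional-branch style). The offset must be
    // word aligned and within the signed 19-bit word range.
    std::optional<uint32_t> encodeImm19(int64_t byteOffset) {
      if (byteOffset & 0x3)
        return std::nullopt;          // low two bits are implied zero
      int64_t imm = byteOffset / 4;
      if (imm < -(1 << 18) || imm >= (1 << 18))
        return std::nullopt;          // signed 19-bit range
      return static_cast<uint32_t>(imm) & 0x7ffff;
    }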
-def am_ldrlit : Operand { - let EncoderMethod = "getLoadLiteralOpValue"; - let DecoderMethod = "DecodePCRelLabel19"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PCRelLabel19Operand; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadLiteral opc, bit V, RegisterClass regtype, string asm> - : I<(outs regtype:$Rt), (ins am_ldrlit:$label), - asm, "\t$Rt, $label", "", []>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchLiteral opc, bit V, string asm, list pat> - : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), - asm, "\t$Rt, $label", "", pat>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -//--- -// Load/store register offset -//--- - -class MemROAsmOperand : AsmOperandClass { - let Name = "MemoryRegisterOffset"#sz; - let DiagnosticType = "InvalidMemoryIndexed"; -} - -def MemROAsmOperand8 : MemROAsmOperand<8>; -def MemROAsmOperand16 : MemROAsmOperand<16>; -def MemROAsmOperand32 : MemROAsmOperand<32>; -def MemROAsmOperand64 : MemROAsmOperand<64>; -def MemROAsmOperand128 : MemROAsmOperand<128>; - -class ro_indexed : Operand { // ComplexPattern<...> - let PrintMethod = "printMemoryRegOffset<" # sz # ">"; - let MIOperandInfo = (ops GPR64sp:$base, GPR64:$offset, i32imm:$extend); -} - -def ro_indexed8 : ro_indexed<8>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand8; -} - -def ro_indexed16 : ro_indexed<16>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand16; -} - -def ro_indexed32 : ro_indexed<32>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand32; -} - -def ro_indexed64 : ro_indexed<64>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand64; -} - -def ro_indexed128 : ro_indexed<128>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand128; -} - -class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -class Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore8RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -class Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. 
Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -class Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore16RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -class Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -class Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore32RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -class Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore64RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -class Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - - -class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore128RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -class Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchRO sz, bit V, bits<2> opc, string asm, list pat> - : I<(outs), (ins prfop:$Rt, ro_indexed64:$addr), asm, - "\t$Rt, $addr", "", pat>, - Sched<[WriteLD]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -//--- -// Load/store unscaled immediate -//--- - -def MemoryUnscaledOperand : AsmOperandClass { - let Name = "MemoryUnscaled"; - let DiagnosticType = "InvalidMemoryIndexedSImm9"; -} -class am_unscaled_operand : Operand { - let PrintMethod = "printAMIndexed<8>"; - let ParserMatchClass = MemoryUnscaledOperand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -class am_unscaled_wb_operand : Operand { - let PrintMethod = "printAMIndexedWB<8>"; - let ParserMatchClass = MemoryUnscaledOperand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def am_unscaled : am_unscaled_operand; -def am_unscaled_wb: am_unscaled_wb_operand; -def am_unscaled8 : am_unscaled_operand, - ComplexPattern; -def am_unscaled16 : am_unscaled_operand, - ComplexPattern; -def am_unscaled32 : am_unscaled_operand, - ComplexPattern; -def am_unscaled64 : am_unscaled_operand, - ComplexPattern; -def am_unscaled128 : am_unscaled_operand, - ComplexPattern; - -class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b00; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let AddedComplexity = 1 in // try this before LoadUI -class LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - Operand amtype, string asm, list pattern> - : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - -let AddedComplexity = 1 in // try this before StoreUI -class StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - Operand amtype, string asm, list pattern> - : BaseLoadStoreUnscale, - Sched<[WriteST]>; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchUnscaled sz, bit V, bits<2> opc, string asm, list pat> - : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - -//--- -// Load/store unscaled immediate, unprivileged -//--- - -class BaseLoadStoreUnprivileged sz, bit V, bits<2> opc, - dag oops, dag iops, string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in { -class LoadUnprivileged sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStoreUnprivileged, - Sched<[WriteLD]>; -} - -let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in { -class StoreUnprivileged sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStoreUnprivileged, - Sched<[WriteST]>; -} - -//--- -// Load/store pre-indexed -//--- - -class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. - bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b11; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -// FIXME: Modeling the write-back of these instructions for isel is tricky. -// we need the complex addressing mode for the memory reference, but -// we also need the write-back specified as a tied operand to the -// base register. That combination does not play nicely with -// the asm matcher and friends. -class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePreIdx, - Sched<[WriteLD, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePreIdx, - Sched<[WriteAdr, WriteST]>; -} // hasSideEffects = 0 - -// ISel pseudo-instructions which have the tied operands. 
When the MC lowering -// logic finally gets smart enough to strip off tied operands that are just -// for isel convenience, we can get rid of these pseudos and just reference -// the real instructions directly. -// -// Ironically, also because of the writeback operands, we can't put the -// matcher pattern directly on the instruction, but need to define it -// separately. -// -// Loads aren't matched with patterns here at all, but rather in C++ -// custom lowering. -let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in { -class LoadPreIdxPseudo - : Pseudo<(outs regtype:$Rt, GPR64sp:$wback), - (ins am_noindex:$addr, simm9:$offset), [], - "$addr.base = $wback,@earlyclobber $wback">, - Sched<[WriteLD, WriteAdr]>; -class LoadPostIdxPseudo - : Pseudo<(outs regtype:$Rt, GPR64sp:$wback), - (ins am_noindex:$addr, simm9:$offset), [], - "$addr.base = $wback,@earlyclobber $wback">, - Sched<[WriteLD, WriteI]>; -} -multiclass StorePreIdxPseudo { - let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in - def _isel: Pseudo<(outs GPR64sp:$wback), - (ins regtype:$Rt, am_noindex:$addr, simm9:$offset), [], - "$addr.base = $wback,@earlyclobber $wback">, - Sched<[WriteAdr, WriteST]>; - - def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$offset), - (!cast(NAME#_isel) regtype:$Rt, am_noindex:$addr, - simm9:$offset)>; -} - -//--- -// Load/store post-indexed -//--- - -// (pre-index) load/stores. -class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. - bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b01; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -// FIXME: Modeling the write-back of these instructions for isel is tricky. -// we need the complex addressing mode for the memory reference, but -// we also need the write-back specified as a tied operand to the -// base register. That combination does not play nicely with -// the asm matcher and friends. -class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePostIdx, - Sched<[WriteLD, WriteI]>; - -let mayStore = 1, mayLoad = 0 in -class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePostIdx, - Sched<[WriteAdr, WriteST, ReadAdrBase]>; -} // hasSideEffects = 0 - -// ISel pseudo-instructions which have the tied operands. When the MC lowering -// logic finally gets smart enough to strip off tied operands that are just -// for isel convenience, we can get rid of these pseudos and just reference -// the real instructions directly. -// -// Ironically, also because of the writeback operands, we can't put the -// matcher pattern directly on the instruction, but need to define it -// separately. 
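The FIXME and pseudo-instruction comments above all revolve around write-back: a pre- or post-indexed access both uses and updates the base register, which is awkward to express for isel without tied operands. A plain C++ model of the two addressing modes, with simm9 standing in for the signed 9-bit offset these formats encode in Inst{20-12} (illustrative only):

#include <cstdint>
#include <cstring>

// Pre-indexed, LDR Xt, [Xn, #simm]! : the offset is applied first and the
// updated address is written back to the base register.
uint64_t loadPreIndex(const uint8_t *mem, uint64_t &base, int64_t simm9) {
  base += simm9;
  uint64_t v;
  std::memcpy(&v, mem + base, sizeof v);
  return v;
}

// Post-indexed, LDR Xt, [Xn], #simm : the access uses the original address,
// then the base register is advanced.
uint64_t loadPostIndex(const uint8_t *mem, uint64_t &base, int64_t simm9) {
  uint64_t v;
  std::memcpy(&v, mem + base, sizeof v);
  base += simm9;
  return v;
}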
-multiclass StorePostIdxPseudo { - let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in - def _isel: Pseudo<(outs GPR64sp:$wback), - (ins regtype:$Rt, am_noindex:$addr, simm9:$idx), [], - "$addr.base = $wback,@earlyclobber $wback">, - PseudoInstExpansion<(Insn regtype:$Rt, am_noindex:$addr, simm9:$idx)>, - Sched<[WriteAdr, WriteST, ReadAdrBase]>; - - def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$idx), - (!cast(NAME#_isel) regtype:$Rt, am_noindex:$addr, - simm9:$idx)>; -} - -//--- -// Load/store pair -//--- - -// (indexed, offset) - -class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b010; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairOffset, - Sched<[WriteLD, WriteLDHi]>; - -let mayLoad = 0, mayStore = 1 in -class StorePairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairOffset, - Sched<[WriteSTP]>; -} // hasSideEffects = 0 - -// (pre-indexed) - -def MemoryIndexed32SImm7 : AsmOperandClass { - let Name = "MemoryIndexed32SImm7"; - let DiagnosticType = "InvalidMemoryIndexed32SImm7"; -} -def am_indexed32simm7 : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexed<32>"; - let ParserMatchClass = MemoryIndexed32SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} -def am_indexed32simm7_wb : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexedWB<32>"; - let ParserMatchClass = MemoryIndexed32SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} - -def MemoryIndexed64SImm7 : AsmOperandClass { - let Name = "MemoryIndexed64SImm7"; - let DiagnosticType = "InvalidMemoryIndexed64SImm7"; -} -def am_indexed64simm7 : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexed<64>"; - let ParserMatchClass = MemoryIndexed64SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} -def am_indexed64simm7_wb : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexedWB<64>"; - let ParserMatchClass = MemoryIndexed64SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} - -def MemoryIndexed128SImm7 : AsmOperandClass { - let Name = "MemoryIndexed128SImm7"; - let DiagnosticType = "InvalidMemoryIndexed128SImm7"; -} -def am_indexed128simm7 : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexed<128>"; - let ParserMatchClass = MemoryIndexed128SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} -def am_indexed128simm7_wb : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexedWB<128>"; - let ParserMatchClass = MemoryIndexed128SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} - -class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an 
encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b011; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPreIdx opc, bit V, RegisterClass regtype, - Operand addrmode, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPreIdx opc, bit V, RegisterClass regtype, - Operand addrmode, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (post-indexed) - -class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b001; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (no-allocate) - -class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b000; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairNoAlloc, - Sched<[WriteLD, WriteLDHi]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairNoAlloc, - Sched<[WriteSTP]>; -} // hasSideEffects = 0 - -//--- -// Load/store exclusive -//--- - -// True exclusive operations write to and/or read from the system's exclusive -// monitors, which as far as a compiler is concerned can be modelled as a -// random shared memory address. Hence LoadExclusive mayStore. 
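As a rough mental model of that comment, the exclusive monitor can be treated as hidden shared state which the exclusive load sets and the exclusive store consumes, which is why even the load is modelled as possibly storing. A toy C++ sketch of the idea (not how hardware or LLVM implements it):

#include <atomic>
#include <cstdint>

struct ExclusiveMonitor {
  std::atomic<uint64_t> taggedAddr{0};
  std::atomic<bool> valid{false};
};

// The load has a side effect: it marks the address in the monitor.
uint64_t loadExclusive(ExclusiveMonitor &mon, const uint64_t *addr) {
  mon.taggedAddr = reinterpret_cast<uint64_t>(addr);
  mon.valid = true;
  return *addr;
}

// Returns 0 on success, 1 on failure, matching the Ws status convention
// used by the store-exclusive classes below.
uint32_t storeExclusive(ExclusiveMonitor &mon, uint64_t *addr, uint64_t val) {
  if (!mon.valid || mon.taggedAddr != reinterpret_cast<uint64_t>(addr))
    return 1;
  mon.valid = false;
  *addr = val;
  return 0;
}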
-// -// Since these instructions have the undefined register bits set to 1 in -// their canonical form, we need a post encoder method to set those bits -// to 1 when encoding these instructions. We do this using the -// fixLoadStoreExclusive function. This function has template parameters: -// -// fixLoadStoreExclusive -// -// hasRs indicates that the instruction uses the Rs field, so we won't set -// it to 1 (and the same for Rt2). We don't need template parameters for -// the other register fields since Rt and Rn are always used. -// -let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in -class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : I { - let Inst{31-30} = sz; - let Inst{29-24} = 0b001000; - let Inst{23} = o2; - let Inst{22} = L; - let Inst{21} = o1; - let Inst{15} = o0; - - let DecoderMethod = "DecodeExclusiveLdStInstruction"; -} - -// Neither Rs nor Rt2 operands. -class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : BaseLoadStoreExclusive { - bits<5> reg; - bits<5> base; - let Inst{9-5} = base; - let Inst{4-0} = reg; - - let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; -} - -// Simple load acquires don't set the exclusive monitor -let mayLoad = 1, mayStore = 0 in -class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteLD, WriteLDHi]> { - bits<5> dst1; - bits<5> dst2; - bits<5> base; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst1; - - let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; -} - -// Simple store release operations do not check the exclusive monitor. 
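The fixLoadStoreExclusive post-encoder described above only has to force the unused Rs (Inst{20-16}) and Rt2 (Inst{14-10}) fields to all-ones, skipping whichever field the instruction actually uses. An illustrative stand-alone version of that fixup; the function name and template signature here are assumptions for the sketch, not LLVM's actual hook:

#include <cstdint>

template <bool hasRs, bool hasRt2>
uint32_t fixExclusiveEncoding(uint32_t encoded) {
  if (!hasRs)
    encoded |= 0x1Fu << 16;   // Rs unused: canonical form is 0b11111
  if (!hasRt2)
    encoded |= 0x1Fu << 10;   // Rt2 unused: canonical form is 0b11111
  return encoded;
}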
-let mayLoad = 0, mayStore = 1 in -class StoreRelease sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteST]>; - -let mayLoad = 1, mayStore = 1 in -class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> status; - bits<5> reg; - bits<5> base; - let Inst{20-16} = status; - let Inst{9-5} = base; - let Inst{4-0} = reg; - - let Constraints = "@earlyclobber $Ws"; - let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; -} - -class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> status; - bits<5> dst1; - bits<5> dst2; - bits<5> base; - let Inst{20-16} = status; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst1; - - let Constraints = "@earlyclobber $Ws"; -} - -//--- -// Exception generation -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, - Sched<[WriteSys]> { - bits<16> imm; - let Inst{31-24} = 0b11010100; - let Inst{23-21} = op1; - let Inst{20-5} = imm; - let Inst{4-2} = 0b000; - let Inst{1-0} = ll; -} - -let Predicates = [HasFPARMv8] in { - -//--- -// Floating point to integer conversion -//--- - -class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-29} = 0b00; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - Operand immType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-29} = 0b00; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, - SDPatternOperator OpN> { - // Unscaled single-precision to 32-bit - def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled single-precision to 64-bit - def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, - [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Unscaled double-precision to 32-bit - def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, - [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled double-precision to 64-bit - def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, - [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, - SDPatternOperator OpN> { - 
// Scaled single-precision to 32-bit - def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, - fixedpoint_f32_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, - fixedpoint_f32_i32:$scale)))]> { - let Inst{31} = 0; // 32-bit GPR flag - let scale{5} = 1; - } - - // Scaled single-precision to 64-bit - def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, - fixedpoint_f32_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, - fixedpoint_f32_i64:$scale)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Scaled double-precision to 32-bit - def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, - fixedpoint_f64_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, - fixedpoint_f64_i32:$scale)))]> { - let Inst{31} = 0; // 32-bit GPR flag - let scale{5} = 1; - } - - // Scaled double-precision to 64-bit - def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, - fixedpoint_f64_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, - fixedpoint_f64_i64:$scale)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -//--- -// Integer to floating point conversion -//--- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseIntegerToFP pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b00001; - let Inst{16} = isUnsigned; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseIntegerToFPUnscaled - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b10001; - let Inst{16} = isUnsigned; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass IntegerToFP { - // Unscaled - def UWSri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UWDri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def UXSri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UXDri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - // Scaled - def SWSri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - let scale{5} = 1; - } - - def SWDri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - let scale{5} = 1; - } - - def SXSri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def SXDri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } -} - -//--- -// Unscaled integer <-> floating point conversion (i.e. FMOV) -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversion rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", - // We use COPY_TO_REGCLASS for these bitconvert operations. - // copyPhysReg() expands the resultant COPY instructions after - // regalloc is done. This gives greater freedom for the allocator - // and related passes (coalescing, copy propagation, et. al.) 
to - // be more effective. - [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111100; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionToHigh rmode, bits<3> opcode, - RegisterClass srcType, RegisterOperand dstType, string asm, - string kind> - : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, - "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeFMOVLaneInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, - RegisterOperand srcType, RegisterClass dstType, string asm, - string kind> - : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, - "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeFMOVLaneInstruction"; -} - - - -multiclass UnscaledConversion { - def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, - asm, ".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } - - def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, - asm, ".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } -} - -//--- -// Floating point conversion -//--- - -class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, - RegisterClass srcType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-24} = 0b00011110; - let Inst{23-22} = type; - let Inst{21-17} = 0b10001; - let Inst{16-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPConversion { - // Double-precision to Half-precision - def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, - [(set FPR16:$Rd, (fround FPR64:$Rn))]>; - - // Double-precision to Single-precision - def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fround FPR64:$Rn))]>; - - // Half-precision to Double-precision - def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; - - // Half-precision to Single-precision - def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fextend 
FPR16:$Rn))]>; - - // Single-precision to Double-precision - def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; - - // Single-precision to Half-precision - def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, - [(set FPR16:$Rd, (fround FPR32:$Rn))]>; -} - -//--- -// Single operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSingleOperandFPData opcode, RegisterClass regtype, - ValueType vt, string asm, SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21-19} = 0b100; - let Inst{18-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SingleOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Sr : BaseSingleOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Dr : BaseSingleOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Two operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPData opcode, RegisterClass regtype, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pat>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass TwoOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - - -//--- -// Three operand floating point data processing -//--- - -class BaseThreeOperandFPData pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, - Sched<[WriteFMul]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{31-23} = 0b000111110; - let Inst{21} = isNegated; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ThreeOperandFPData { - def Srrr : BaseThreeOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drrr : BaseThreeOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Floating point data comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b1000; - - // Rm should be 0b00000 canonically, but we need to accept any value. 
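Reaching back to the three-operand floating-point format above: the isNegated (Inst{21}) and isSub (Inst{15}) bits select among the usual fused multiply-add variants. A hedged sketch of the arithmetic they pick, ignoring rounding-mode and signed-zero corner cases; the mapping to the concrete fmadd/fmsub/fnmadd/fnmsub mnemonics is assumed rather than shown in this hunk:

#include <cmath>

double fusedOp(bool isNegated, bool isSub, double rn, double rm, double ra) {
  double r = std::fma(isSub ? -rn : rn, rm, ra);  // Ra +/- Rn*Rm, one rounding
  return isNegated ? -r : r;                      // the "n" variants negate the lot
}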
- let PostEncoderMethod = "fixOneOperandFPComparison"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rm; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b0000; -} - -multiclass FPComparison { - let Defs = [NZCV] in { - def Srr : BaseTwoOperandFPComparison { - let Inst{22} = 0; - } - - def Sri : BaseOneOperandFPComparison { - let Inst{22} = 0; - } - - def Drr : BaseTwoOperandFPComparison { - let Inst{22} = 1; - } - - def Dri : BaseOneOperandFPComparison { - let Inst{22} = 1; - } - } // Defs = [NZCV] -} - -//--- -// Floating point conditional comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteFCmp]> { - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = nzcv; -} - -multiclass FPCondComparison { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison { - let Inst{22} = 0; - } - - def Drr : BaseFPCondComparison { - let Inst{22} = 1; - } - } // Defs = [NZCV], Uses = [NZCV] -} - -//--- -// Floating point conditional select -//--- - -class BaseFPCondSelect - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel (vt regtype:$Rn), regtype:$Rm, - (i32 imm:$cond), NZCV))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b11; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPCondSelect { - let Uses = [NZCV] in { - def Srrr : BaseFPCondSelect { - let Inst{22} = 0; - } - - def Drrr : BaseFPCondSelect { - let Inst{22} = 1; - } - } // Uses = [NZCV] -} - -//--- -// Floating move immediate -//--- - -class BaseFPMoveImmediate - : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", - [(set regtype:$Rd, fpimmtype:$imm)]>, - Sched<[WriteFImm]> { - bits<5> Rd; - bits<8> imm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-13} = imm; - let Inst{12-5} = 0b10000000; - let Inst{4-0} = Rd; -} - -multiclass FPMoveImmediate { - def Si : BaseFPMoveImmediate { - let Inst{22} = 0; - } - - def Di : BaseFPMoveImmediate { - let Inst{22} = 1; - } -} -} // end of 'let Predicates = [HasFPARMv8]' - -//---------------------------------------------------------------------------- -// AdvSIMD -//---------------------------------------------------------------------------- - -def MemorySIMDNoIndexOperand : AsmOperandClass { - let Name = "MemorySIMDNoIndex"; - let ParserMethod = "tryParseNoIndexMemory"; -} -def am_simdnoindex : Operand, - ComplexPattern { - let PrintMethod = "printAMNoIndex"; - let ParserMatchClass = MemorySIMDNoIndexOperand; - let MIOperandInfo = (ops GPR64sp:$base); - let DecoderMethod = "DecodeGPR64spRegisterClass"; -} - -let Predicates = [HasNEON] in { - 
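For reference, the 8-bit immediate consumed by the floating-point move-immediate formats above expands to a small fixed set of values, +/- n/16 * 2^r with n in [16,31] and r in [-3,4]. A sketch of that expansion, assuming the usual split into a sign bit, a 3-bit exponent field whose top bit is stored inverted, and a 4-bit fraction:

#include <cmath>
#include <cstdint>

double expandFPImm8(uint8_t imm8) {
  int s = (imm8 >> 7) & 1;   // sign
  int e = (imm8 >> 4) & 7;   // exponent field, top bit inverted in storage
  int m = imm8 & 0xF;        // fraction
  double v = (16.0 + m) / 16.0 * std::ldexp(1.0, (e ^ 4) - 3);
  return s ? -v : v;
}
// e.g. expandFPImm8(0x70) == 1.0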
-//---------------------------------------------------------------------------- -// AdvSIMD three register vector instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// All operand sizes distinguished in the encoding. -multiclass SIMDThreeSameVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; -} - -// As above, but D sized elements unsupported. 
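Throughout these AdvSIMD multiclasses the Q bit picks the 64-bit or 128-bit register form and the two-bit size field picks the element width; together they determine the arrangement suffix in the assembly string. A small reference sketch of that mapping (the Q==0, size==0b11 slot is not generated by the vector multiclasses here):

#include <string>

std::string arrangement(bool Q, unsigned size) {
  static const char *lo[] = {".8b", ".4h", ".2s", ".1d"};
  static const char *hi[] = {".16b", ".8h", ".4s", ".2d"};
  return (Q ? hi : lo)[size & 3];
}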
-multiclass SIMDThreeSameVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; -} - -multiclass SIMDThreeSameVectorBHSTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// As above, but only B sized elements supported. -multiclass SIMDThreeSameVectorB opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), - (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; -} - -// As above, but only S and D sized floating point elements supported. 
-multiclass SIMDThreeSameVectorFP opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPCmp opc, - string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPTied opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$dst), - (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$dst), - (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$dst), - (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -// As above, but D and B sized elements unsupported. -multiclass SIMDThreeSameVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// Logical three vector ops share opcode bits, and only use B sized elements. 
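That comment, and the patterns in the multiclass that follows, rely on bitwise operations being independent of how the register is partitioned into lanes, so the v4i16/v2i32/v1i64 arrangements can simply reuse the .8b/.16b instruction. A tiny standalone check of why that is sound:

#include <cstdint>
#include <cstring>

bool bitwiseIsLaneAgnostic(uint64_t a, uint64_t b) {
  uint64_t whole = a & b;                  // one 64-bit AND
  uint8_t la[8], lb[8], lanes[8];
  std::memcpy(la, &a, 8);
  std::memcpy(lb, &b, 8);
  for (int i = 0; i < 8; ++i)
    lanes[i] = la[i] & lb[i];              // eight 8-bit ANDs
  uint64_t relaned;
  std::memcpy(&relaned, lanes, 8);
  return whole == relaned;                 // always true
}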
-multiclass SIMDLogicalThreeVector size, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; - - def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; -} - -multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]>; - - def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), - (v4i16 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), - (v2i32 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), - (v1i64 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), - (v8i16 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), - (v4i32 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), - (v2i64 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD two register vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Supports B, H, and S element sizes. -multiclass SIMDTwoVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -class BaseSIMDVectorLShiftLongBySize size, - RegisterOperand regtype, string asm, string dstkind, - string srckind, string amount> - : I<(outs V128:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # - "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-24} = 0b101110; - let Inst{23-22} = size; - let Inst{21-10} = 0b100001001110; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDVectorLShiftLongBySizeBHS { - let neverHasSideEffects = 1 in { - def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, - "shll", ".8h", ".8b", "8">; - def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, - "shll2", ".8h", ".16b", "8">; - def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, - "shll", ".4s", ".4h", "16">; - def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, - "shll2", ".4s", ".8h", "16">; - def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, - "shll", ".2d", ".2s", "32">; - def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, - "shll2", ".2d", ".4s", "32">; - } -} - -// Supports all element sizes. 
-multiclass SIMDLongTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -multiclass SIMDLongTwoVectorTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), - (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), - (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), - (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (v4i32 V128:$Rn)))]>; -} - -// Supports all element sizes, except 1xD. 
-multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
-                                 SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
-                asm, ".8b", ".8b",
-                [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
-                asm, ".16b", ".16b",
-                [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
-  def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
-                asm, ".4h", ".4h",
-                [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
-  def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
-                asm, ".8h", ".8h",
-                [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
-  def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
-                asm, ".2s", ".2s",
-                [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
-  def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
-                asm, ".4s", ".4s",
-                [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
-  def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
-                asm, ".2d", ".2d",
-                [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
-}
-
-multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode = null_frag> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
-                asm, ".8b", ".8b",
-                [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
-                asm, ".16b", ".16b",
-                [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
-                asm, ".4h", ".4h",
-                [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
-  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
-                asm, ".8h", ".8h",
-                [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
-  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
-                asm, ".2s", ".2s",
-                [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
-  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
-                asm, ".4s", ".4s",
-                [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-  def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
-                asm, ".2d", ".2d",
-                [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
-}
-
-
-// Supports only B element sizes.
-multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
-                          SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, size, opc, V64,
-                asm, ".8b", ".8b",
-                [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
-                asm, ".16b", ".16b",
-                [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-
-}
-
-// Supports only B and H element sizes.
-multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
-                           SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
-                asm, ".8b", ".8b",
-                [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
-                asm, ".16b", ".16b",
-                [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
-  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
-                asm, ".4h", ".4h",
-                [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
-  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
-                asm, ".8h", ".8h",
-                [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
-}
-
-// Supports only S and D element sizes, uses high bit of the size field
-// as an extra opcode bit.
-multiclass SIMDTwoVectorFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -// Supports only S element size. -multiclass SIMDTwoVectorS opc, string asm, - SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - - -multiclass SIMDTwoVectorFPToInt opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorIntToFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -class BaseSIMDMixedTwoVector size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDMixedTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, - asm#"2", ".16b", ".8h", []>; - def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; - def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), - (!cast(NAME # "v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), - (!cast(NAME # "v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), - (!cast(NAME # "v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -class BaseSIMDCmpTwoVector size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # - "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", - [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Comparisons support all element sizes, except 1xD. -multiclass SIMDCmpTwoVector opc, string asm, - SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, - asm, ".8b", "0", - v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, - asm, ".16b", "0", - v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, - asm, ".4h", "0", - v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, - asm, ".8h", "0", - v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, - asm, ".2s", "0", - v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, - asm, ".4s", "0", - v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, - asm, ".2d", "0", - v2i64, v2i64, OpNode>; -} - -// FP Comparisons support only S and D element sizes. 
-multiclass SIMDFPCmpTwoVector opc, - string asm, SDNode OpNode> { - - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, - asm, ".2s", "0.0", - v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, - asm, ".4s", "0.0", - v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, - asm, ".2d", "0.0", - v2i64, v2f64, OpNode>; - - def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDFPCvtTwoVector size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$Rd), (ins intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPWidenTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, - asm, ".4s", ".4h", []>; - def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, - asm#"2", ".4s", ".8h", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, - asm, ".2d", ".2s", []>; - def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, - asm#"2", ".2d", ".4s", []>; -} - -multiclass SIMDFPNarrowTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, - asm, ".4h", ".4s", []>; - def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", []>; - def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; -} - -multiclass SIMDFPInexactCvtTwoVector opc, string asm, - Intrinsic OpNode> { - def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", - [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; - def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), - (!cast(NAME # "v4f32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register different-size vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVector size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// FIXME: TableGen doesn't know how to deal with expanded types that also -// change the element count (in this case, placing the results in -// the high elements of the result register rather than the low -// elements). Until that's fixed, we can't code-gen those. -multiclass SIMDNarrowThreeVectorBHS opc, string asm, - Intrinsic IntOp> { - def v8i16_v8i8 : BaseSIMDDifferentThreeVector; - def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v4i16 : BaseSIMDDifferentThreeVector; - def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v2i64_v2i32 : BaseSIMDDifferentThreeVector; - def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; - - - // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in - // a version attached to an instruction. 
- def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), - (v8i16 V128:$Rm))), - (!cast(NAME # "v8i16_v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), - (v4i32 V128:$Rm))), - (!cast(NAME # "v4i32_v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), - (v2i64 V128:$Rm))), - (!cast(NAME # "v2i64_v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -} - -multiclass SIMDDifferentThreeVectorBD opc, string asm, - Intrinsic IntOp> { - def v8i8 : BaseSIMDDifferentThreeVector; - def v16i8 : BaseSIMDDifferentThreeVector; - let Predicates = [HasCrypto] in { - def v1i64 : BaseSIMDDifferentThreeVector; - def v2i64 : BaseSIMDDifferentThreeVector; - } - - def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), - (v8i8 (extract_high_v16i8 V128:$Rm)))), - (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; -} - -multiclass SIMDLongThreeVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorBHSabdl opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHSabal opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDLongThreeVectorBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHS opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, - SDPatternOperator Accum> { - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDWideThreeVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -//---------------------------------------------------------------------------- -// AdvSIMD bitwise extract from vector -//---------------------------------------------------------------------------- - -class BaseSIMDBitwiseExtract - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # - "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", - [(set (vty regtype:$Rd), - (ARM64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> imm; - let Inst{31} = 0; - let Inst{30} = size; - let Inst{29-21} = 0b101110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-11} = imm; - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDBitwiseExtract { - def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { - let imm{3} = 0; - } - def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; -} - -//---------------------------------------------------------------------------- -// AdvSIMD zip vector -//---------------------------------------------------------------------------- - -class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, - string asm, string kind, SDNode OpNode, ValueType valty> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "", - [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29-24} = 0b001110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDZipVectoropc, string asm, - SDNode OpNode> { - def v8i8 : BaseSIMDZipVector<0b000, opc, V64, - asm, ".8b", OpNode, v8i8>; - def v16i8 : BaseSIMDZipVector<0b001, opc, V128, - asm, ".16b", OpNode, v16i8>; - def v4i16 : BaseSIMDZipVector<0b010, opc, V64, - asm, ".4h", OpNode, v4i16>; - def v8i16 : BaseSIMDZipVector<0b011, opc, V128, - asm, ".8h", OpNode, v8i16>; - def v2i32 : BaseSIMDZipVector<0b100, opc, V64, - asm, ".2s", OpNode, v2i32>; - def v4i32 : BaseSIMDZipVector<0b101, opc, V128, - asm, ".4s", OpNode, v4i32>; - def v2i64 : BaseSIMDZipVector<0b111, opc, V128, - asm, ".2d", OpNode, v2i64>; - - def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), - (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; - def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), - (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; - def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), - (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register scalar instructions -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar size, bits<5> opcode, - RegisterClass regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; - let 
Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDThreeScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; -} - -multiclass SIMDThreeScalarBHSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; - def v1i8 : BaseSIMDThreeScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; - def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), - (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass SIMDThreeScalarHS opc, string asm, - SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; -} - -multiclass SIMDThreeScalarSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - } - - def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; -} - -multiclass SIMDThreeScalarFPCmp opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - } - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; -} - -class BaseSIMDThreeScalarMixed size, bits<5> opcode, - dag oops, dag iops, string asm, string cstr, list pat> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDThreeScalarMixedHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def i16 : BaseSIMDThreeScalarMixed; - def i32 : BaseSIMDThreeScalarMixed; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDThreeScalarMixedTiedHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def i16 : BaseSIMDThreeScalarMixed; - def i32 : BaseSIMDThreeScalarMixed; -} - -//---------------------------------------------------------------------------- -// AdvSIMD two register scalar instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar size, bits<5> opcode, - RegisterClass regtype, RegisterClass regtype2, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, - "\t$Rd, $Rn", "", pat>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalarTied size, bits<5> opcode, - RegisterClass regtype, RegisterClass regtype2, - string asm, list pat> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, - "\t$Rd, $Rn", "$Rd = $dst", pat>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let 
Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar size, bits<5> opcode, - RegisterClass regtype, string asm, string zero> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "\t$Rd, $Rn, #" # zero, "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDInexactCvtTwoScalar opcode, string asm> - : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", - [(set (f32 FPR32:$Rd), (int_arm64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-17} = 0b011111100110000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDCmpTwoScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - - def : Pat<(v1i64 (OpNode FPR64:$Rn)), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDCmpTwoScalarSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - def v1i32rz : BaseSIMDCmpTwoScalar; - - def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), - (!cast(NAME # "v1i64") FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarSD opc, string asm> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarCVTSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarBHSDTied opc, string asm, - Intrinsic OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalarTied; - def v1i32 : BaseSIMDTwoScalarTied; - def v1i16 : BaseSIMDTwoScalarTied; - def v1i8 : BaseSIMDTwoScalarTied; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; -} - - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDTwoScalarMixedBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDPairwiseScalar size, bits<5> opcode, - RegisterOperand regtype, RegisterOperand vectype, - string asm, string kind> - : I<(outs regtype:$Rd), (ins vectype:$Rn), 
asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDPairwiseScalarD opc, string asm> { - def v2i64p : BaseSIMDPairwiseScalar; -} - -multiclass SIMDPairwiseScalarSD opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar; - def v2i64p : BaseSIMDPairwiseScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD across lanes instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDAcrossLanes size, bits<5> opcode, - RegisterClass regtype, RegisterOperand vectype, - string asm, string kind, list pattern> - : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDAcrossLanesBHS opcode, - string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesHSD opcode, string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, - Intrinsic intOp> { - def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, - asm, ".4s", - [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -// FIXME: There has got to be a better way to factor these. ugh. 
- -class BaseSIMDInsDup pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{15} = 0; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDDupFromMain imm5, string size, ValueType vectype, - RegisterOperand vecreg, RegisterClass regtype> - : BaseSIMDInsDup { - let Inst{20-16} = imm5; - let Inst{14-11} = 0b0001; -} - -class SIMDDupFromElement - : BaseSIMDInsDup { - let Inst{14-11} = 0b0000; -} - -class SIMDDup64FromElement - : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, - VectorIndexD, i64, ARM64duplane64> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; -} - -class SIMDDup32FromElement - : SIMDDupFromElement { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; -} - -class SIMDDup16FromElement - : SIMDDupFromElement { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; -} - -class SIMDDup8FromElement - : SIMDDupFromElement { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; -} - -class BaseSIMDMov imm4, RegisterClass regtype, - Operand idxtype, string asm, list pattern> - : BaseSIMDInsDup { - let Inst{14-11} = imm4; -} - -class SIMDSMov - : BaseSIMDMov; -class SIMDUMov - : BaseSIMDMov; - -class SIMDMovAlias - : InstAlias; - -multiclass SMov { - def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } -} - -multiclass UMov { - def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - def : SIMDMovAlias<"mov", ".s", - !cast(NAME#"vi32"), - GPR32, VectorIndexS>; - def : SIMDMovAlias<"mov", ".d", - !cast(NAME#"vi64"), - GPR64, VectorIndexD>; -} - -class SIMDInsFromMain - : BaseSIMDInsDup<1, 0, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", - "{\t$Rd" # size # "$idx, $Rn" # - "|" # size # "\t$Rd$idx, $Rn}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { - let Inst{14-11} = 0b0011; -} - -class SIMDInsFromElement - : BaseSIMDInsDup<1, 1, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", - "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # - "|" # size # "\t$Rd$idx, $Rn$idx2}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert - (vectype V128:$Rd), - (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), - idxtype:$idx))]>; - -class SIMDInsMainMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # - "|" # size #"\t$dst$idx, $src}", - (inst V128:$dst, 
idxtype:$idx, regtype:$src)>; -class SIMDInsElementMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", - (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; - - -multiclass SIMDIns { - def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { - bits<4> idx; - bits<4> idx2; - let Inst{20-17} = idx; - let Inst{16} = 1; - let Inst{14-11} = idx2; - } - def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { - bits<3> idx; - bits<3> idx2; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - let Inst{14-12} = idx2; - let Inst{11} = 0; - } - def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { - bits<2> idx; - bits<2> idx2; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - let Inst{14-13} = idx2; - let Inst{12-11} = 0; - } - def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { - bits<1> idx; - bits<1> idx2; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - let Inst{14} = idx2; - let Inst{13-11} = 0; - } - - // For all forms of the INS instruction, the "mov" mnemonic is the - // preferred alias. Why they didn't just call the instruction "mov" in - // the first place is a very good question indeed... 
- def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), - GPR32, VectorIndexB>; - def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), - GPR32, VectorIndexH>; - def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), - GPR32, VectorIndexS>; - def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), - GPR64, VectorIndexD>; - - def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), - VectorIndexB>; - def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), - VectorIndexH>; - def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), - VectorIndexS>; - def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), - VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -class SIMDTableLookupAlias - : InstAlias; - -multiclass SIMDTableLookup { - def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - -multiclass SIMDTableLookupTied { - def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY -//---------------------------------------------------------------------------- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDScalarCPY - : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", - "{\t$dst, $src" # kind # "$idx" # - "|\t$dst, $src$idx}", "", []>, - Sched<[WriteV]> { - bits<5> dst; - bits<5> src; - let Inst{31-21} = 0b01011110000; - let Inst{15-10} = 0b000001; - let Inst{9-5} = src; - let Inst{4-0} = dst; -} - -class SIMDScalarCPYAlias - : InstAlias; - - -multiclass SIMDScalarCPY { - def i8 : BaseSIMDScalarCPY { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def i16 : BaseSIMDScalarCPY { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def i32 : BaseSIMDScalarCPY { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def i64 : BaseSIMDScalarCPY { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), - VectorIndexD:$idx)))), - (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; - - // 'DUP' mnemonic aliases. 
- def : SIMDScalarCPYAlias<"dup", ".b", - !cast(NAME#"i8"), - FPR8, V128, VectorIndexB>; - def : SIMDScalarCPYAlias<"dup", ".h", - !cast(NAME#"i16"), - FPR16, V128, VectorIndexH>; - def : SIMDScalarCPYAlias<"dup", ".s", - !cast(NAME#"i32"), - FPR32, V128, VectorIndexS>; - def : SIMDScalarCPYAlias<"dup", ".d", - !cast(NAME#"i64"), - FPR64, V128, VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD modified immediate instructions -//---------------------------------------------------------------------------- - -class BaseSIMDModifiedImm pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<8> imm8; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-19} = 0b0111100000; - let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; - let Inst{9-5} = imm8{4-0}; - let Inst{4-0} = Rd; -} - -class BaseSIMDModifiedImmVector pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmInstruction"; -} - -class BaseSIMDModifiedImmVectorTied pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmTiedInstruction"; -} - -class BaseSIMDModifiedImmVectorShift b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - - -class BaseSIMDModifiedImmVectorShiftHalf b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, - string asm> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, - asm, ".4h", []>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, - asm, ".8h", []>; - - def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, - asm, ".2s", []>; - def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, - asm, ".4s", []>; -} - -multiclass SIMDModifiedImmVectorShiftTied hw_cmode, - bits<2> w_cmode, string asm, - SDNode OpNode> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - - def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; -} - -class SIMDModifiedImmMoveMSL cmode, - RegisterOperand vectype, string asm, - string kind, list 
pattern> - : BaseSIMDModifiedImmVector { - bits<1> shift; - let Inst{15-13} = cmode{3-1}; - let Inst{12} = shift; -} - -class SIMDModifiedImmVectorNoShift cmode, - RegisterOperand vectype, - Operand imm_type, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - let Inst{15-12} = cmode; -} - -class SIMDModifiedImmScalarNoShift cmode, string asm, - list pattern> - : BaseSIMDModifiedImm { - let Inst{15-12} = cmode; - let DecoderMethod = "DecodeModImmInstruction"; -} - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexed size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), - asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexedTied size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$dst), - (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. 
- let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPIndexedSD opc, string asm, - SDPatternOperator OpNode> { - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2f32 V64:$Rd), - (OpNode (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4f32 V128:$Rd), - (OpNode (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", - [(set (v2f64 V128:$Rd), - (OpNode (v2f64 V128:$Rn), - (v2f64 (ARM64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (f32 FPR32Op:$Rd), - (OpNode (f32 FPR32Op:$Rn), - (f32 (vector_extract (v4f32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", - [(set (f64 FPR64Op:$Rd), - (OpNode (f64 FPR64Op:$Rn), - (f64 (vector_extract (v2f64 V128:$Rm), - VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDFPIndexedSDTiedPatterns { - // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # v2i32_indexed) - V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - - // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # "v4i32_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
- def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 V128:$Rm), - VectorIndexD:$idx))), - (!cast(INST # "v2i64_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 FPR64Op:$Rm)))), - (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), - (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexD:$idx)>; -} - -multiclass SIMDFPIndexedSDTied opc, string asm> { - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 
V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR16Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i32 FPR32Op:$Rd), - (OpNode FPR32Op:$Rn, - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHSTied opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), 
VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, - SDPatternOperator Accum> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an - // intermediate EXTRACT_SUBREG would be untyped. 
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract (v4i32 - (int_arm64_neon_sqdmull (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (!cast(NAME # v4i16_indexed) - (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, - V128_lo:$Rm, VectorIndexH:$idx), - ssub)>; - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 - (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i64 FPR64Op:$dst), - (Accum (i64 FPR64Op:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - 
[(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -multiclass SIMDVectorIndexedLongSDTied opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift by immediate -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShift opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDScalarRShiftSD opc, string asm> { - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -multiclass SIMDScalarRShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let 
Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; -} - -multiclass SIMDScalarRShiftDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, - vecshiftR64:$imm)>; -} - -multiclass SIMDScalarLShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarLShiftDTied opc, string asm> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarRShiftBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } -} - -multiclass SIMDScalarLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; -} - -multiclass SIMDScalarRShiftBHSD opc, string asm> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD vector x indexed element -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShift opc, bits<7> fixed_imm, - RegisterOperand dst_reg, RegisterOperand src_reg, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, - RegisterOperand vectype1, RegisterOperand vectype2, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = 
Rd; -} - -multiclass SIMDVectorRShiftSD opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftSDToFP opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftNarrowBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V128, vecshiftR16Narrow, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR16Narrow, - asm#"2", ".16b", ".8h", []> { - bits<3> imm; - let Inst{18-16} = imm; - let hasSideEffects = 0; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V128, vecshiftR32Narrow, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR32Narrow, - asm#"2", ".8h", ".4s", []> { - bits<4> imm; - let Inst{19-16} = imm; - let hasSideEffects = 0; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V128, vecshiftR64Narrow, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR64Narrow, - asm#"2", ".4s", ".2d", []> { - bits<5> imm; - let Inst{20-16} = imm; - let hasSideEffects = 0; - } - - // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions - // themselves, so put them here instead. - - // Patterns involving what's effectively an insert high and a normal - // intrinsic, represented by CONCAT_VECTORS. 
- def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)), - (!cast(NAME # "v16i8_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)), - (!cast(NAME # "v8i16_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)), - (!cast(NAME # "v4i32_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR64Narrow:$imm)>; -} - -multiclass SIMDVectorLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - 
V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDVectorRShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = 
imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftLongBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V128, V64, vecshiftL8, asm, ".8h", ".8b", - [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm#"2", ".8h", ".16b", - [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V128, V64, vecshiftL16, asm, ".4s", ".4h", - [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm#"2", ".4s", ".8h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { - - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V128, V64, vecshiftL32, asm, ".2d", ".2s", - [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm#"2", ".2d", ".4s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } -} - - -//--- -// Vector load/store -//--- -// SIMD ldX/stX no-index memory references don't allow the optional -// ", #0" constant and handle post-indexing explicitly, so we use -// a more specialized parse method for them. Otherwise, it's the same as -// the general am_noindex handling. 
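
As the comments in the load/store multiclasses below note, the immediate form of AdvSIMD post-indexed addressing is simply the register post-index form writing back from the zero register, with the immediate implied by the number of bytes transferred. A minimal C++ sketch of that write-back rule, assuming hypothetical helper names (postIndexWriteBack, XZREncoding) that are not part of this patch:

    #include <cstdint>

    // XZR is encoded as register number 31 in the Rm field; a post-indexed
    // SIMD load/store with Rm == XZR uses the implied immediate (the total
    // transfer size), while any other Rm adds that register's value.
    static constexpr unsigned XZREncoding = 31;

    std::uint64_t postIndexWriteBack(std::uint64_t base, unsigned xmEncoding,
                                     std::uint64_t xmValue,
                                     unsigned transferBytes) {
      if (xmEncoding == XZREncoding)
        return base + transferBytes; // e.g. ld1 { v0.16b }, [x0], #16
      return base + xmValue;         // e.g. ld1 { v0.16b }, [x0], x2
    }

This is why the InstAlias definitions below map the "#16"-style syntax onto the _POST instructions with XZR as the index register.
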
- -class BaseSIMDLdSt opcode, bits<2> size, - string asm, dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> vaddr; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011000; - let Inst{22} = L; - let Inst{21-16} = 0b000000; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : I { - bits<5> Vt; - bits<5> vaddr; - bits<5> Xm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011001; - let Inst{22} = L; - let Inst{21} = 0; - let Inst{20-16} = Xm; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; -} - -// The immediate form of AdvSIMD post-indexed addressing is encoded with -// register post-index addressing from the zero register. -multiclass SIMDLdStAliases { - // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" - // "ld1\t$Vt, $vaddr, #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo8b:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # layout):$Vt, - XZR), 1>; - - // E.g. "ld1.8b { v0, v1 }, [x1], #16" - // "ld1.8b\t$Vt, $vaddr, #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # Size):$Vt, - XZR), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1]" - // "ld1\t$Vt, $vaddr" - // may get mapped to - // (LD1Twov8b VecListTwo64:$Vt, am_simdnoindex:$vaddr) - def : InstAlias(NAME # Count # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1], x2" - // "ld1\t$Vt, $vaddr, $Xm" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, GPR64pi8:$Xm) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # Size):$Vt, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass BaseSIMDLdN opcode> { - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, - (outs !cast(veclist # "16b"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, - (outs !cast(veclist # "8h"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, - (outs !cast(veclist # "4s"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, - (outs !cast(veclist # "2d"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, - (outs !cast(veclist # "8b"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, - (outs !cast(veclist # "4h"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, - (outs !cast(veclist # "2s"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - - - def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "16b"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "8h"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "4s"):$Vt), - (ins 
am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "2d"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "8b"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "4h"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "2s"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -// Only ld1/st1 has a v1d version. -multiclass BaseSIMDStN opcode> { - let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { - def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "16b"):$Vt, - am_simdnoindex:$vaddr), []>; - def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "8h"):$Vt, - am_simdnoindex:$vaddr), []>; - def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "4s"):$Vt, - am_simdnoindex:$vaddr), []>; - def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "2d"):$Vt, - am_simdnoindex:$vaddr), []>; - def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "8b"):$Vt, - am_simdnoindex:$vaddr), []>; - def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "4h"):$Vt, - am_simdnoindex:$vaddr), []>; - def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "2s"):$Vt, - am_simdnoindex:$vaddr), []>; - - def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "16b"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "8h"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "4s"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "2d"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "8b"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "4h"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "2s"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDLd1 opcode> - : BaseSIMDLdN { - 
- // LD1 instructions have extra "1d" variants. - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, - (outs !cast(veclist # "1d"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, - (outs am_simdnoindex:$wback, - !cast(veclist # "1d"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDSt1 opcode> - : BaseSIMDStN { - - // ST1 instructions have extra "1d" variants. - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { - def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "1d"):$Vt, - am_simdnoindex:$vaddr), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, - (outs am_simdnoindex:$wback), - (ins !cast(veclist # "1d"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass SIMDLd1Multiple { - defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDSt1Multiple { - defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDLd2Multiple { - defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDSt2Multiple { - defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDLd3Multiple { - defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDSt3Multiple { - defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDLd4Multiple { - defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -multiclass SIMDSt4Multiple { - defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -//--- -// AdvSIMD Load/store single-element -//--- - -class BaseSIMDLdStSingle opcode, - string asm, string operands, string cst, - dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> vaddr; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStSingleTied opcode, - string asm, string operands, string cst, - dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> vaddr; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; -} - - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, - Operand listtype> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr", "", - (outs listtype:$Vt), (ins am_simdnoindex:$vaddr), - []> { - let Inst{30} = Q; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = S; - let Inst{11-10} = size; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdRPost opcode, bit S, bits<2> size, - string asm, Operand listtype, Operand GPR64pi> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr, $Xm", - "$vaddr = $wback", - (outs 
am_simdnoindex:$wback, listtype:$Vt), - (ins am_simdnoindex:$vaddr, GPR64pi:$Xm), []> { - bits<5> Xm; - let Inst{30} = Q; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = S; - let Inst{11-10} = size; -} - -multiclass SIMDLdrAliases { - // E.g. "ld1r { v0.8b }, [x1], #1" - // "ld1r.8b\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # layout):$Vt, - XZR), 1>; - - // E.g. "ld1r.8b { v0 }, [x1], #1" - // "ld1r.8b\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # Size):$Vt, - XZR), 0>; - - // E.g. "ld1r.8b { v0 }, [x1]" - // "ld1r.8b\t$Vt, $vaddr" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr) - def : InstAlias(NAME # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr), 0>; - - // E.g. "ld1r.8b { v0 }, [x1], x2" - // "ld1r.8b\t$Vt, $vaddr, $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm) - def : InstAlias(NAME # "v" # layout # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # Size):$Vt, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdR opcode, bit S, string asm, string Count, - int Offset1, int Offset2, int Offset4, int Offset8> { - def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b")>; - def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count #"16b")>; - def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"4h")>; - def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"8h")>; - def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"2s")>; - def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"4s")>; - def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"1d")>; - def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"2d")>; - - def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b"), - !cast("GPR64pi" # Offset1)>; - def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "16b"), - !cast("GPR64pi" # Offset1)>; - def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "4h"), - !cast("GPR64pi" # Offset2)>; - def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "8h"), - !cast("GPR64pi" # Offset2)>; - def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "2s"), - !cast("GPR64pi" # Offset4)>; - def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "4s"), - !cast("GPR64pi" # Offset4)>; - def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "1d"), - !cast("GPR64pi" # Offset8)>; - def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "2d"), - !cast("GPR64pi" # Offset8)>; - - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; -} - -class SIMDLdStSingleB opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in 
Q:S:size fields. - bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTied opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTiedPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} - -class SIMDLdStSingleH opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTied opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} - -class SIMDLdStSingleHPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTiedPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleS opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. 
- bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleD opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleBTied opcode, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr), []>; - - def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, - (outs am_simdnoindex:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleHTied opcode, bit size, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr), []>; - - def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, - (outs am_simdnoindex:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr), []>; - - def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, - (outs am_simdnoindex:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr), []>; - - def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, - (outs am_simdnoindex:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleB opcode, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleB<0, R, 
opcode, asm, - (outs), (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr), []>; - - def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm, - (outs am_simdnoindex:$wback), - (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleH opcode, bit size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr), []>; - - def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, - (outs am_simdnoindex:$wback), - (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleS opcode, bits<2> size,string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr), []>; - - def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, - (outs am_simdnoindex:$wback), - (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleD opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr), []>; - - def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, - (outs am_simdnoindex:$wback), - (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} - -multiclass SIMDLdStSingleAliases { - // E.g. "ld1 { v0.8b }[0], [x1], #1" - // "ld1\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Type # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # layout):$Vt, - idxtype:$idx, XZR), 1>; - - // E.g. "ld1.8b { v0 }[0], [x1], #1" - // "ld1.8b\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Type # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, XZR), 0>; - - // E.g. "ld1.8b { v0 }[0], [x1]" - // "ld1.8b\t$Vt, $vaddr" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr) - def : InstAlias(NAME # Type) - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, am_simdnoindex:$vaddr), 0>; - - // E.g. 
"ld1.8b { v0 }[0], [x1], x2" - // "ld1.8b\t$Vt, $vaddr, $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm) - def : InstAlias(NAME # Type # "_POST") - am_simdnoindex:$vaddr, - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdSt1SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt2SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt3SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt4SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} -} // end of 'let Predicates = [HasNEON]' - -//---------------------------------------------------------------------------- -// Crypto extensions -//---------------------------------------------------------------------------- - -let Predicates = [HasCrypto] in { -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class AESBase opc, string asm, dag outs, dag ins, string cstr, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0100111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class AESInst opc, string asm, Intrinsic OpNode> - : AESBase; - -class AESTiedInst opc, string asm, Intrinsic OpNode> - : AESBase; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, - dag oops, dag iops, list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-21} = 0b01011110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstQSV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstVVV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstQQV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA2OpInst opc, string asm, string kind, - string cstr, dag oops, dag iops, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0101111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstVV opc, string asm, Intrinsic OpNode> - : SHA2OpInst; - -class SHAInstSS opc, string asm, Intrinsic OpNode> - : SHA2OpInst; -} // end of 'let Predicates = [HasCrypto]' - -// Allow the size specifier tokens to be upper case, not just lower. 
-def : TokenAlias<".8B", ".8b">; -def : TokenAlias<".4H", ".4h">; -def : TokenAlias<".2S", ".2s">; -def : TokenAlias<".1D", ".1d">; -def : TokenAlias<".16B", ".16b">; -def : TokenAlias<".8H", ".8h">; -def : TokenAlias<".4S", ".4s">; -def : TokenAlias<".2D", ".2d">; -def : TokenAlias<".1Q", ".1q">; -def : TokenAlias<".B", ".b">; -def : TokenAlias<".H", ".h">; -def : TokenAlias<".S", ".s">; -def : TokenAlias<".D", ".d">; -def : TokenAlias<".Q", ".q">; diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp deleted file mode 100644 index 5643fb0ce2c8..000000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.cpp +++ /dev/null @@ -1,2033 +0,0 @@ -//===- ARM64InstrInfo.cpp - ARM64 Instruction Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#include "ARM64GenInstrInfo.inc" - -ARM64InstrInfo::ARM64InstrInfo(const ARM64Subtarget &STI) - : ARM64GenInstrInfo(ARM64::ADJCALLSTACKDOWN, ARM64::ADJCALLSTACKUP), - RI(this, &STI), Subtarget(STI) {} - -/// GetInstSize - Return the number of bytes of code the specified -/// instruction may be. This returns the maximum number of bytes. -unsigned ARM64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MCInstrDesc &Desc = MI->getDesc(); - - switch (Desc.getOpcode()) { - default: - // Anything not explicitly designated otherwise is a nomal 4-byte insn. - return 4; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - return 0; - } - - llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size"); -} - -static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, - SmallVectorImpl &Cond) { - // Block ends with fall-through condbranch. - switch (LastInst->getOpcode()) { - default: - llvm_unreachable("Unknown branch instruction?"); - case ARM64::Bcc: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::TBZW: - case ARM64::TBZX: - case ARM64::TBNZW: - case ARM64::TBNZX: - Target = LastInst->getOperand(2).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - Cond.push_back(LastInst->getOperand(1)); - } -} - -// Branch analysis. 
-bool ARM64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - if (!isUnpredicatedTerminator(I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (isUncondBranchOpcode(LastOpc)) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } - if (isCondBranchOpcode(LastOpc)) { - // Block ends with fall-through condbranch. - parseCondBranch(LastInst, TBB, Cond); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If AllowModify is true and the block ends with two or more unconditional - // branches, delete all but the first unconditional branch. - if (AllowModify && isUncondBranchOpcode(LastOpc)) { - while (isUncondBranchOpcode(SecondLastOpc)) { - LastInst->eraseFromParent(); - LastInst = SecondLastInst; - LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - // Return now the only terminator is an unconditional branch. - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else { - SecondLastInst = I; - SecondLastOpc = SecondLastInst->getOpcode(); - } - } - } - - // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) - return true; - - // If the block ends with a B and a Bcc, handle it. - if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - parseCondBranch(SecondLastInst, TBB, Cond); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } - - // If the block ends with two unconditional branches, handle it. The second - // one is not executed, so remove it. - if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } - - // ...likewise if it ends with an indirect branch followed by an unconditional - // branch. - if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return true; - } - - // Otherwise, can't handle this. 
- return true; -} - -bool ARM64InstrInfo::ReverseBranchCondition( - SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - ARM64CC::CondCode CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); - Cond[0].setImm(ARM64CC::getInvertedCondCode(CC)); - } else { - // Folded compare-and-branch - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown conditional branch!"); - case ARM64::CBZW: - Cond[1].setImm(ARM64::CBNZW); - break; - case ARM64::CBNZW: - Cond[1].setImm(ARM64::CBZW); - break; - case ARM64::CBZX: - Cond[1].setImm(ARM64::CBNZX); - break; - case ARM64::CBNZX: - Cond[1].setImm(ARM64::CBZX); - break; - case ARM64::TBZW: - Cond[1].setImm(ARM64::TBNZW); - break; - case ARM64::TBNZW: - Cond[1].setImm(ARM64::TBZW); - break; - case ARM64::TBZX: - Cond[1].setImm(ARM64::TBNZX); - break; - case ARM64::TBNZX: - Cond[1].setImm(ARM64::TBZX); - break; - } - } - - return false; -} - -unsigned ARM64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return 0; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return 0; - --I; - } - if (!isUncondBranchOpcode(I->getOpcode()) && - !isCondBranchOpcode(I->getOpcode())) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) - return 1; - --I; - if (!isCondBranchOpcode(I->getOpcode())) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; -} - -void ARM64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - BuildMI(&MBB, DL, get(ARM64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); - } else { - // Folded compare-and-branch - const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); - if (Cond.size() > 3) - MIB.addImm(Cond[3].getImm()); - MIB.addMBB(TBB); - } -} - -unsigned ARM64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, DebugLoc DL) const { - // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - - if (!FBB) { - if (Cond.empty()) // Unconditional branch? - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(TBB); - else - instantiateCondBranch(MBB, DL, TBB, Cond); - return 1; - } - - // Two-way conditional branch. - instantiateCondBranch(MBB, DL, TBB, Cond); - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(FBB); - return 2; -} - -// Find the original register that VReg is copied from. -static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { - while (TargetRegisterInfo::isVirtualRegister(VReg)) { - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (!DefMI->isFullCopy()) - return VReg; - VReg = DefMI->getOperand(1).getReg(); - } - return VReg; -} - -// Determine if VReg is defined by an instruction that can be folded into a -// csel instruction. If so, return the folded opcode, and the replacement -// register. 
-static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, - unsigned *NewVReg = nullptr) { - VReg = removeCopies(MRI, VReg); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) - return 0; - - bool Is64Bit = ARM64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - unsigned Opc = 0; - unsigned SrcOpNum = 0; - switch (DefMI->getOpcode()) { - case ARM64::ADDSXri: - case ARM64::ADDSWri: - // if NZCV is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::NZCV, true) == -1) - return 0; - // fall-through to ADDXri and ADDWri. - case ARM64::ADDXri: - case ARM64::ADDWri: - // add x, 1 -> csinc. - if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || - DefMI->getOperand(3).getImm() != 0) - return 0; - SrcOpNum = 1; - Opc = Is64Bit ? ARM64::CSINCXr : ARM64::CSINCWr; - break; - - case ARM64::ORNXrr: - case ARM64::ORNWrr: { - // not x -> csinv, represented as orn dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSINVXr : ARM64::CSINVWr; - break; - } - - case ARM64::SUBSXrr: - case ARM64::SUBSWrr: - // if NZCV is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::NZCV, true) == -1) - return 0; - // fall-through to SUBXrr and SUBWrr. - case ARM64::SUBXrr: - case ARM64::SUBWrr: { - // neg x -> csneg, represented as sub dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSNEGXr : ARM64::CSNEGWr; - break; - } - default: - return 0; - } - assert(Opc && SrcOpNum && "Missing parameters"); - - if (NewVReg) - *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); - return Opc; -} - -bool ARM64InstrInfo::canInsertSelect( - const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, - int &FalseCycles) const { - // Check register classes. - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); - if (!RC) - return false; - - // Expanding cbz/tbz requires an extra cycle of latency on the condition. - unsigned ExtraCondLat = Cond.size() != 1; - - // GPRs are handled by csel. - // FIXME: Fold in x+1, -x, and ~x when applicable. - if (ARM64::GPR64allRegClass.hasSubClassEq(RC) || - ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - // Single-cycle csel, csinc, csinv, and csneg. - CondCycles = 1 + ExtraCondLat; - TrueCycles = FalseCycles = 1; - if (canFoldIntoCSel(MRI, TrueReg)) - TrueCycles = 0; - else if (canFoldIntoCSel(MRI, FalseReg)) - FalseCycles = 0; - return true; - } - - // Scalar floating point is handled by fcsel. - // FIXME: Form fabs, fmin, and fmax when applicable. - if (ARM64::FPR64RegClass.hasSubClassEq(RC) || - ARM64::FPR32RegClass.hasSubClassEq(RC)) { - CondCycles = 5 + ExtraCondLat; - TrueCycles = FalseCycles = 2; - return true; - } - - // Can't do vectors. - return false; -} - -void ARM64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // Parse the condition code, see parseCondBranch() above. 
- ARM64CC::CondCode CC; - switch (Cond.size()) { - default: - llvm_unreachable("Unknown condition opcode in Cond"); - case 1: // b.cc - CC = ARM64CC::CondCode(Cond[0].getImm()); - break; - case 3: { // cbz/cbnz - // We must insert a compare against 0. - bool Is64Bit; - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::CBZW: - Is64Bit = 0; - CC = ARM64CC::EQ; - break; - case ARM64::CBZX: - Is64Bit = 1; - CC = ARM64CC::EQ; - break; - case ARM64::CBNZW: - Is64Bit = 0; - CC = ARM64CC::NE; - break; - case ARM64::CBNZX: - Is64Bit = 1; - CC = ARM64CC::NE; - break; - } - unsigned SrcReg = Cond[2].getReg(); - if (Is64Bit) { - // cmp reg, #0 is actually subs xzr, reg, #0. - MRI.constrainRegClass(SrcReg, &ARM64::GPR64spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSXri), ARM64::XZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } else { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSWri), ARM64::WZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } - break; - } - case 4: { // tbz/tbnz - // We must insert a tst instruction. - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::TBZW: - case ARM64::TBZX: - CC = ARM64CC::EQ; - break; - case ARM64::TBNZW: - case ARM64::TBNZX: - CC = ARM64CC::NE; - break; - } - // cmp reg, #foo is actually ands xzr, reg, #1< 64 bit extension case, these instructions can do - // much more. - if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) - return false; - // This is a signed or unsigned 32 -> 64 bit extension. - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SubIdx = ARM64::sub_32; - return true; - } -} - -/// analyzeCompare - For a comparison instruction, return the source registers -/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. -/// Return true if the comparison instruction can be analyzed. -bool ARM64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBSWrr: - case ARM64::SUBSWrs: - case ARM64::SUBSWrx: - case ARM64::SUBSXrr: - case ARM64::SUBSXrs: - case ARM64::SUBSXrx: - case ARM64::ADDSWrr: - case ARM64::ADDSWrs: - case ARM64::ADDSWrx: - case ARM64::ADDSXrr: - case ARM64::ADDSXrs: - case ARM64::ADDSXrx: - // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); - CmpMask = ~0; - CmpValue = 0; - return true; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = MI->getOperand(2).getImm(); - return true; - case ARM64::ANDSWri: - case ARM64::ANDSXri: - // ANDS does not use the same encoding scheme as the others xxxS - // instructions. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = ARM64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == ARM64::ANDSWri ? 
32 : 64); - return true; - } - - return false; -} - -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); - assert(MBB && "Can't get MachineBasicBlock here"); - MachineFunction *MF = MBB->getParent(); - assert(MF && "Can't get MachineFunction here"); - const TargetMachine *TM = &MF->getTarget(); - const TargetInstrInfo *TII = TM->getInstrInfo(); - const TargetRegisterInfo *TRI = TM->getRegisterInfo(); - MachineRegisterInfo *MRI = &MF->getRegInfo(); - - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; - ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); - const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); - - // If there's no constraint, there's nothing to do. - if (!OpRegCstraints) - continue; - // If the operand is a frame index, there's nothing to do here. - // A frame index operand will resolve correctly during PEI. - if (MO.isFI()) - continue; - - assert(MO.isReg() && - "Operand has register constraints without being a register!"); - - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (!OpRegCstraints->contains(Reg)) - return false; - } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && - !MRI->constrainRegClass(Reg, OpRegCstraints)) - return false; - } - - return true; -} - -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool ARM64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - - // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(ARM64::NZCV, true); - if (Cmp_NZCV != -1) { - unsigned NewOpc; - switch (CmpInstr->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: NewOpc = ARM64::ADDWrr; break; - case ARM64::ADDSWri: NewOpc = ARM64::ADDWri; break; - case ARM64::ADDSWrs: NewOpc = ARM64::ADDWrs; break; - case ARM64::ADDSWrx: NewOpc = ARM64::ADDWrx; break; - case ARM64::ADDSXrr: NewOpc = ARM64::ADDXrr; break; - case ARM64::ADDSXri: NewOpc = ARM64::ADDXri; break; - case ARM64::ADDSXrs: NewOpc = ARM64::ADDXrs; break; - case ARM64::ADDSXrx: NewOpc = ARM64::ADDXrx; break; - case ARM64::SUBSWrr: NewOpc = ARM64::SUBWrr; break; - case ARM64::SUBSWri: NewOpc = ARM64::SUBWri; break; - case ARM64::SUBSWrs: NewOpc = ARM64::SUBWrs; break; - case ARM64::SUBSWrx: NewOpc = ARM64::SUBWrx; break; - case ARM64::SUBSXrr: NewOpc = ARM64::SUBXrr; break; - case ARM64::SUBSXri: NewOpc = ARM64::SUBXri; break; - case ARM64::SUBSXrs: NewOpc = ARM64::SUBXrs; break; - case ARM64::SUBSXrx: NewOpc = ARM64::SUBXrx; break; - } - - const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); - bool succeeded = UpdateOperandRegClass(CmpInstr); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - return true; - } - - // Continue only if we have a "ri" where immediate is zero. - if (CmpValue != 0 || SrcReg2 != 0) - return false; - - // CmpInstr is a Compare instruction if destination register is not used. - if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. 
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) - return false; - - // We iterate backward, starting from the instruction before CmpInstr and - // stop when reaching the definition of the source register or done with the - // basic block, to check whether NZCV is used or modified in between. - MachineBasicBlock::iterator I = CmpInstr, E = MI, - B = CmpInstr->getParent()->begin(); - - // Early exit if CmpInstr is at the beginning of the BB. - if (I == B) - return false; - - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, we can't optimize away the Compare. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that NZCV isn't set between the comparison instruction and the one we - // want to change. - const TargetRegisterInfo *TRI = &getRegisterInfo(); - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; - - if (Instr.modifiesRegister(ARM64::NZCV, TRI) || - Instr.readsRegister(ARM64::NZCV, TRI)) - // This instruction modifies or uses NZCV after the one we want to - // change. We can't do this transformation. - return false; - if (I == B) - // The 'and' is below the comparison instruction. - return false; - } - - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: - case ARM64::ADDSWri: - case ARM64::ADDSXrr: - case ARM64::ADDSXri: - case ARM64::SUBSWrr: - case ARM64::SUBSWri: - case ARM64::SUBSXrr: - case ARM64::SUBSXri: - break; - case ARM64::ADDWrr: NewOpc = ARM64::ADDSWrr; break; - case ARM64::ADDWri: NewOpc = ARM64::ADDSWri; break; - case ARM64::ADDXrr: NewOpc = ARM64::ADDSXrr; break; - case ARM64::ADDXri: NewOpc = ARM64::ADDSXri; break; - case ARM64::ADCWr: NewOpc = ARM64::ADCSWr; break; - case ARM64::ADCXr: NewOpc = ARM64::ADCSXr; break; - case ARM64::SUBWrr: NewOpc = ARM64::SUBSWrr; break; - case ARM64::SUBWri: NewOpc = ARM64::SUBSWri; break; - case ARM64::SUBXrr: NewOpc = ARM64::SUBSXrr; break; - case ARM64::SUBXri: NewOpc = ARM64::SUBSXri; break; - case ARM64::SBCWr: NewOpc = ARM64::SBCSWr; break; - case ARM64::SBCXr: NewOpc = ARM64::SBCSXr; break; - case ARM64::ANDWri: NewOpc = ARM64::ANDSWri; break; - case ARM64::ANDXri: NewOpc = ARM64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != ARM64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } - - // Decode the condition code. 
- unsigned Opc = Instr.getOpcode(); - ARM64CC::CondCode CC; - switch (Opc) { - default: - return false; - case ARM64::Bcc: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case ARM64::CSINVWr: - case ARM64::CSINVXr: - case ARM64::CSINCWr: - case ARM64::CSINCXr: - case ARM64::CSELWr: - case ARM64::CSELXr: - case ARM64::CSNEGWr: - case ARM64::CSNEGXr: - case ARM64::FCSELSrrr: - case ARM64::FCSELDrrr: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } - - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case ARM64CC::VS: - case ARM64CC::VC: - case ARM64CC::GE: - case ARM64CC::LT: - case ARM64CC::GT: - case ARM64CC::LE: - return false; - } - } - } - - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(ARM64::NZCV)) - return false; - } - - // Update the instruction to set NZCV. - MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - MI->addRegisterDefined(ARM64::NZCV, TRI); - return true; -} - -/// Return true if this is this instruction has a non-zero immediate -bool ARM64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::ADDSWrs: - case ARM64::ADDSXrs: - case ARM64::ADDWrs: - case ARM64::ADDXrs: - case ARM64::ANDSWrs: - case ARM64::ANDSXrs: - case ARM64::ANDWrs: - case ARM64::ANDXrs: - case ARM64::BICSWrs: - case ARM64::BICSXrs: - case ARM64::BICWrs: - case ARM64::BICXrs: - case ARM64::CRC32Brr: - case ARM64::CRC32CBrr: - case ARM64::CRC32CHrr: - case ARM64::CRC32CWrr: - case ARM64::CRC32CXrr: - case ARM64::CRC32Hrr: - case ARM64::CRC32Wrr: - case ARM64::CRC32Xrr: - case ARM64::EONWrs: - case ARM64::EONXrs: - case ARM64::EORWrs: - case ARM64::EORXrs: - case ARM64::ORNWrs: - case ARM64::ORNXrs: - case ARM64::ORRWrs: - case ARM64::ORRXrs: - case ARM64::SUBSWrs: - case ARM64::SUBSXrs: - case ARM64::SUBWrs: - case ARM64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); - return (val != 0); - } - break; - } - return false; -} - -/// Return true if this is this instruction has a non-zero immediate -bool ARM64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::ADDSWrx: - case ARM64::ADDSXrx: - case ARM64::ADDSXrx64: - case ARM64::ADDWrx: - case ARM64::ADDXrx: - case ARM64::ADDXrx64: - case ARM64::SUBSWrx: - case ARM64::SUBSXrx: - case ARM64::SUBSXrx64: - case ARM64::SUBWrx: - case ARM64::SUBXrx: - case ARM64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); - return (val != 0); - } - break; - } - - return false; -} - -// Return true if this instruction simply sets its single destination register -// to zero. This is equivalent to a register rename of the zero-register. 
-bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::MOVZWi: - case ARM64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); - return true; - } - break; - case ARM64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == ARM64::WZR; - case ARM64::ANDXri: - return MI->getOperand(1).getReg() == ARM64::XZR; - case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == ARM64::WZR; - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::GPR32RegClass.contains(DstReg) || - ARM64::GPR64RegClass.contains(DstReg)); - } - case ARM64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == ARM64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); - return true; - } - case ARM64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); - return true; - } - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::FPR64RegClass.contains(DstReg) || - ARM64::FPR128RegClass.contains(DstReg)); - } - case ARM64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && - "invalid ORRv16i8 operands"); - return true; - } - } - return false; -} - -unsigned ARM64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - - return 0; -} - -unsigned ARM64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; -} - -/// Return true if this is load/store scales or extends its register offset. -/// This refers to scaling a dynamic index as opposed to scaled immediates. 
-/// MI should be a memory op that allows scaled addressing. -bool ARM64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRBBro: - case ARM64::LDRBro: - case ARM64::LDRDro: - case ARM64::LDRHHro: - case ARM64::LDRHro: - case ARM64::LDRQro: - case ARM64::LDRSBWro: - case ARM64::LDRSBXro: - case ARM64::LDRSHWro: - case ARM64::LDRSHXro: - case ARM64::LDRSWro: - case ARM64::LDRSro: - case ARM64::LDRWro: - case ARM64::LDRXro: - case ARM64::STRBBro: - case ARM64::STRBro: - case ARM64::STRDro: - case ARM64::STRHHro: - case ARM64::STRHro: - case ARM64::STRQro: - case ARM64::STRSro: - case ARM64::STRWro: - case ARM64::STRXro: - unsigned Val = MI->getOperand(3).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getMemExtendType(Val); - return (ExtType != ARM64_AM::UXTX) || ARM64_AM::getMemDoShift(Val); - } - return false; -} - -/// Check all MachineMemOperands for a hint to suppress pairing. -bool ARM64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; -} - -/// Set a flag on the first MachineMemOperand to suppress pairing. -void ARM64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) - return; - - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); -} - -bool ARM64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; - }; -} - -/// Detect opportunities for ldp/stp formation. -/// -/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. -bool ARM64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - // Only cluster up to a single pair. - if (NumLoads > 1) - return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) - return false; - // getLdStBaseRegImmOfs guarantees that oper 2 isImm. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) - return false; - // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; -} - -bool ARM64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. 
- if (Second->getOpcode() != ARM64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::ANDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - case ARM64::ANDSXri: - return true; - } -} - -MachineInstr *ARM64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, - uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM64::DBG_VALUE)) - .addFrameIndex(FrameIx) - .addImm(0) - .addImm(Offset) - .addMetadata(MDPtr); - return &*MIB; -} - -static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, - unsigned Reg, unsigned SubIdx, - unsigned State, - const TargetRegisterInfo *TRI) { - if (!SubIdx) - return MIB.addReg(Reg, State); - - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); - return MIB.addReg(Reg, State, SubIdx); -} - -static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, - unsigned NumRegs) { - // We really want the positive remainder mod 32 here, that happens to be - // easily obtainable with a mask. - return ((DestReg - SrcReg) & 0x1f) < NumRegs; -} - -void ARM64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc, - unsigned Opcode, - llvm::ArrayRef Indices) const { - assert(getSubTarget().hasNEON() && - "Unexpected register copy without NEON"); - const TargetRegisterInfo *TRI = &getRegisterInfo(); - uint16_t DestEncoding = TRI->getEncodingValue(DestReg); - uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); - unsigned NumRegs = Indices.size(); - - int SubReg = 0, End = NumRegs, Incr = 1; - if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { - SubReg = NumRegs - 1; - End = -1; - Incr = -1; - } - - for (; SubReg != End; SubReg += Incr) { - const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); - AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); - } -} - -void ARM64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - if (ARM64::GPR32spRegClass.contains(DestReg) && - (ARM64::GPR32spRegClass.contains(SrcReg) || SrcReg == ARM64::WZR)) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - - if (DestReg == ARM64::WSP || SrcReg == ARM64::WSP) { - // If either operand is WSP, expand to ADD #0. - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. 
- BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestRegX) - .addReg(SrcRegX, RegState::Undef) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::ADDWri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } - } else if (SrcReg == ARM64::WZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZWi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestRegX) - .addReg(ARM64::XZR) - .addReg(SrcRegX, RegState::Undef) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - // Otherwise, expand to ORR WZR. - BuildMI(MBB, I, DL, get(ARM64::ORRWrr), DestReg) - .addReg(ARM64::WZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - } - return; - } - - if (ARM64::GPR64spRegClass.contains(DestReg) && - (ARM64::GPR64spRegClass.contains(SrcReg) || SrcReg == ARM64::XZR)) { - if (DestReg == ARM64::SP || SrcReg == ARM64::SP) { - // If either operand is SP, expand to ADD #0. - BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else if (SrcReg == ARM64::XZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZXi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - // Otherwise, expand to ORR XZR. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestReg) - .addReg(ARM64::XZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copy a DDDD register quad by copying the individual sub-registers. - if (ARM64::DDDDRegClass.contains(DestReg) && - ARM64::DDDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DDD register triple by copying the individual sub-registers. - if (ARM64::DDDRegClass.contains(DestReg) && - ARM64::DDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DD register pair by copying the individual sub-registers. - if (ARM64::DDRegClass.contains(DestReg) && - ARM64::DDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a QQQQ register quad by copying the individual sub-registers. 
- if (ARM64::QQQQRegClass.contains(DestReg) && - ARM64::QQQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQQ register triple by copying the individual sub-registers. - if (ARM64::QQQRegClass.contains(DestReg) && - ARM64::QQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQ register pair by copying the individual sub-registers. - if (ARM64::QQRegClass.contains(DestReg) && - ARM64::QQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - if (ARM64::FPR128RegClass.contains(DestReg) && - ARM64::FPR128RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::STRQpre)) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addReg(ARM64::SP) - .addImm(-16); - BuildMI(MBB, I, DL, get(ARM64::LDRQpre)) - .addReg(DestReg, RegState::Define) - .addReg(ARM64::SP) - .addImm(16); - } - return; - } - - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::dsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::dsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::FMOVDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::ssub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::ssub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR16RegClass.contains(DestReg) && - ARM64::FPR16RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR32RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR32RegClass); - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR8RegClass.contains(DestReg) && - ARM64::FPR8RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, 
get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR32RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR32RegClass); - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copies between GPR64 and FPR64. - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::GPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVDXr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - // Copies between GPR32 and FPR32. - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::GPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVSWr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - - assert(0 && "unimplemented reg-to-reg copy"); -} - -void ARM64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRWui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - else - assert(SrcReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRXui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - else - assert(SrcReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Twov1d, Offset = false; - } - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Threev1d, Offset = false; - } - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Fourv1d, Offset = false; - } else if (ARM64::QQRegClass.hasSubClassEq(RC)) { - 
assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Twov2d, Offset = false; - } - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Threev2d, Offset = false; - } - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Fourv2d, Offset = false; - } - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI); - - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void ARM64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRWui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR32RegClass); - else - assert(DestReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRXui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR64RegClass); - else - assert(DestReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Twov1d, Offset = false; - } - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Threev1d, Offset = false; - } - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Fourv1d, Offset = false; - } else if (ARM64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Twov2d, Offset = false; - } - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Threev2d, Offset = false; - } - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Fourv2d, Offset = false; - } - break; - } - assert(Opc && "Unknown register class"); - - const 
MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(DestReg, getDefRegState(true)) - .addFrameIndex(FI); - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, MachineInstr::MIFlag Flag, - bool SetNZCV) { - if (DestReg == SrcReg && Offset == 0) - return; - - bool isSub = Offset < 0; - if (isSub) - Offset = -Offset; - - // FIXME: If the offset won't fit in 24-bits, compute the offset into a - // scratch register. If DestReg is a virtual register, use it as the - // scratch register; otherwise, create a new virtual register (to be - // replaced by the scavenger at the end of PEI). That case can be optimized - // slightly if DestReg is SP which is always 16-byte aligned, so the scratch - // register can be loaded with offset%8 and the add/sub can use an extending - // instruction with LSL#3. - // Currently the function handles any offsets but generates a poor sequence - // of code. - // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); - - unsigned Opc; - if (SetNZCV) - Opc = isSub ? ARM64::SUBSXri : ARM64::ADDSXri; - else - Opc = isSub ? ARM64::SUBXri : ARM64::ADDXri; - const unsigned MaxEncoding = 0xfff; - const unsigned ShiftSize = 12; - const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; - while (((unsigned)Offset) >= (1 << ShiftSize)) { - unsigned ThisVal; - if (((unsigned)Offset) > MaxEncodableValue) { - ThisVal = MaxEncodableValue; - } else { - ThisVal = Offset & MaxEncodableValue; - } - assert((ThisVal >> ShiftSize) <= MaxEncoding && - "Encoding cannot handle value that big"); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(ThisVal >> ShiftSize) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftSize)) - .setMIFlag(Flag); - - SrcReg = DestReg; - Offset -= ThisVal; - if (Offset == 0) - return; - } - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(Offset) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .setMIFlag(Flag); -} - -MachineInstr * -ARM64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const { - // This is a bit of a hack. Consider this instruction: - // - // %vreg0 = COPY %SP; GPR64all:%vreg0 - // - // We explicitly chose GPR64all for the virtual register so such a copy might - // be eliminated by RegisterCoalescer. However, that may not be possible, and - // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all - // register class, TargetInstrInfo::foldMemoryOperand() is going to try. - // - // To prevent that, we are going to constrain the %vreg0 register class here. - // - // - // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - if (SrcReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { - MF.getRegInfo().constrainRegClass(DstReg, &ARM64::GPR64RegClass); - return nullptr; - } - if (DstReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) { - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - return nullptr; - } - } - - // Cannot fold. 
- return nullptr; -} - -int llvm::isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp, - unsigned *OutUnscaledOp, - int *EmittableOffset) { - int Scale = 1; - bool IsSigned = false; - // The ImmIdx should be changed case by case if it is not 2. - unsigned ImmIdx = 2; - unsigned UnscaledOp = 0; - // Set output values in case of early exit. - if (EmittableOffset) - *EmittableOffset = 0; - if (OutUseUnscaledOp) - *OutUseUnscaledOp = false; - if (OutUnscaledOp) - *OutUnscaledOp = 0; - switch (MI.getOpcode()) { - default: - assert(0 && "unhandled opcode in rewriteARM64FrameIndex"); - // Vector spills/fills can't take an immediate offset. - case ARM64::LD1Twov2d: - case ARM64::LD1Threev2d: - case ARM64::LD1Fourv2d: - case ARM64::LD1Twov1d: - case ARM64::LD1Threev1d: - case ARM64::LD1Fourv1d: - case ARM64::ST1Twov2d: - case ARM64::ST1Threev2d: - case ARM64::ST1Fourv2d: - case ARM64::ST1Twov1d: - case ARM64::ST1Threev1d: - case ARM64::ST1Fourv1d: - return ARM64FrameOffsetCannotUpdate; - case ARM64::PRFMui: - Scale = 8; - UnscaledOp = ARM64::PRFUMi; - break; - case ARM64::LDRXui: - Scale = 8; - UnscaledOp = ARM64::LDURXi; - break; - case ARM64::LDRWui: - Scale = 4; - UnscaledOp = ARM64::LDURWi; - break; - case ARM64::LDRBui: - Scale = 1; - UnscaledOp = ARM64::LDURBi; - break; - case ARM64::LDRHui: - Scale = 2; - UnscaledOp = ARM64::LDURHi; - break; - case ARM64::LDRSui: - Scale = 4; - UnscaledOp = ARM64::LDURSi; - break; - case ARM64::LDRDui: - Scale = 8; - UnscaledOp = ARM64::LDURDi; - break; - case ARM64::LDRQui: - Scale = 16; - UnscaledOp = ARM64::LDURQi; - break; - case ARM64::LDRBBui: - Scale = 1; - UnscaledOp = ARM64::LDURBBi; - break; - case ARM64::LDRHHui: - Scale = 2; - UnscaledOp = ARM64::LDURHHi; - break; - case ARM64::LDRSBXui: - Scale = 1; - UnscaledOp = ARM64::LDURSBXi; - break; - case ARM64::LDRSBWui: - Scale = 1; - UnscaledOp = ARM64::LDURSBWi; - break; - case ARM64::LDRSHXui: - Scale = 2; - UnscaledOp = ARM64::LDURSHXi; - break; - case ARM64::LDRSHWui: - Scale = 2; - UnscaledOp = ARM64::LDURSHWi; - break; - case ARM64::LDRSWui: - Scale = 4; - UnscaledOp = ARM64::LDURSWi; - break; - - case ARM64::STRXui: - Scale = 8; - UnscaledOp = ARM64::STURXi; - break; - case ARM64::STRWui: - Scale = 4; - UnscaledOp = ARM64::STURWi; - break; - case ARM64::STRBui: - Scale = 1; - UnscaledOp = ARM64::STURBi; - break; - case ARM64::STRHui: - Scale = 2; - UnscaledOp = ARM64::STURHi; - break; - case ARM64::STRSui: - Scale = 4; - UnscaledOp = ARM64::STURSi; - break; - case ARM64::STRDui: - Scale = 8; - UnscaledOp = ARM64::STURDi; - break; - case ARM64::STRQui: - Scale = 16; - UnscaledOp = ARM64::STURQi; - break; - case ARM64::STRBBui: - Scale = 1; - UnscaledOp = ARM64::STURBBi; - break; - case ARM64::STRHHui: - Scale = 2; - UnscaledOp = ARM64::STURHHi; - break; - - case ARM64::LDPXi: - case ARM64::LDPDi: - case ARM64::STPXi: - case ARM64::STPDi: - IsSigned = true; - Scale = 8; - break; - case ARM64::LDPQi: - case ARM64::STPQi: - IsSigned = true; - Scale = 16; - break; - case ARM64::LDPWi: - case ARM64::LDPSi: - case ARM64::STPWi: - case ARM64::STPSi: - IsSigned = true; - Scale = 4; - break; - - case ARM64::LDURXi: - case ARM64::LDURWi: - case ARM64::LDURBi: - case ARM64::LDURHi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURHHi: - case ARM64::LDURBBi: - case ARM64::LDURSBXi: - case ARM64::LDURSBWi: - case ARM64::LDURSHXi: - case ARM64::LDURSHWi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::STURWi: - case ARM64::STURBi: 
- case ARM64::STURHi: - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURBBi: - case ARM64::STURHHi: - Scale = 1; - break; - } - - Offset += MI.getOperand(ImmIdx).getImm() * Scale; - - bool useUnscaledOp = false; - // If the offset doesn't match the scale, we rewrite the instruction to - // use the unscaled instruction instead. Likewise, if we have a negative - // offset (and have an unscaled op to use). - if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) - useUnscaledOp = true; - - // Use an unscaled addressing mode if the instruction has a negative offset - // (or if the instruction is already using an unscaled addressing mode). - unsigned MaskBits; - if (IsSigned) { - // ldp/stp instructions. - MaskBits = 7; - Offset /= Scale; - } else if (UnscaledOp == 0 || useUnscaledOp) { - MaskBits = 9; - IsSigned = true; - Scale = 1; - } else { - MaskBits = 12; - IsSigned = false; - Offset /= Scale; - } - - // Attempt to fold address computation. - int MaxOff = (1 << (MaskBits - IsSigned)) - 1; - int MinOff = (IsSigned ? (-MaxOff - 1) : 0); - if (Offset >= MinOff && Offset <= MaxOff) { - if (EmittableOffset) - *EmittableOffset = Offset; - Offset = 0; - } else { - int NewOff = Offset < 0 ? MinOff : MaxOff; - if (EmittableOffset) - *EmittableOffset = NewOff; - Offset = (Offset - NewOff) * Scale; - } - if (OutUseUnscaledOp) - *OutUseUnscaledOp = useUnscaledOp; - if (OutUnscaledOp) - *OutUnscaledOp = UnscaledOp; - return ARM64FrameOffsetCanUpdate | - (Offset == 0 ? ARM64FrameOffsetIsLegal : 0); -} - -bool llvm::rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII) { - unsigned Opcode = MI.getOpcode(); - unsigned ImmIdx = FrameRegIdx + 1; - - if (Opcode == ARM64::ADDSXri || Opcode == ARM64::ADDXri) { - Offset += MI.getOperand(ImmIdx).getImm(); - emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), - MI.getOperand(0).getReg(), FrameReg, Offset, TII, - MachineInstr::NoFlags, (Opcode == ARM64::ADDSXri)); - MI.eraseFromParent(); - Offset = 0; - return true; - } - - int NewOffset; - unsigned UnscaledOp; - bool UseUnscaledOp; - int Status = isARM64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp, - &NewOffset); - if (Status & ARM64FrameOffsetCanUpdate) { - if (Status & ARM64FrameOffsetIsLegal) - // Replace the FrameIndex with FrameReg. - MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - if (UseUnscaledOp) - MI.setDesc(TII->get(UnscaledOp)); - - MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); - return Offset == 0; - } - - return false; -} - -void ARM64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - NopInst.setOpcode(ARM64::HINT); - NopInst.addOperand(MCOperand::CreateImm(0)); -} diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h deleted file mode 100644 index ce195e763b2b..000000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.h +++ /dev/null @@ -1,231 +0,0 @@ -//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64INSTRINFO_H -#define LLVM_TARGET_ARM64INSTRINFO_H - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" - -#define GET_INSTRINFO_HEADER -#include "ARM64GenInstrInfo.inc" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64InstrInfo : public ARM64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. - enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - - const ARM64RegisterInfo RI; - const ARM64Subtarget &Subtarget; - -public: - explicit ARM64InstrInfo(const ARM64Subtarget &STI); - - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). - const ARM64RegisterInfo &getRegisterInfo() const { return RI; } - - const ARM64Subtarget &getSubTarget() const { return Subtarget; } - - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - - /// Returns true if there is a shiftable register and that the shift value - /// is non-zero. - bool hasShiftedReg(const MachineInstr *MI) const; - - /// Returns true if there is an extendable register and that the extending value - /// is non-zero. - bool hasExtendedReg(const MachineInstr *MI) const; - - /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; - - /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; - - /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; - - /// Return true if this is load/store scales or extends its register offset. - /// This refers to scaling a dynamic index as opposed to scaled immediates. - /// MI should be a memory op that allows scaled addressing. - bool isScaledAddr(const MachineInstr *MI) const; - - /// Return true if pairing the given load or store is hinted to be - /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; - - /// Hint that pairing the given load or store is unprofitable. 
- void suppressLdStPair(MachineInstr *MI) const; - - bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const override; - - bool enableClusterLoads() const override { return true; } - - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; - - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; - - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; - void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc, unsigned Opcode, - llvm::ArrayRef Indices) const; - void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned SrcReg, - bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - MachineInstr * - foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const override; - - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify = false) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const override; - bool - ReverseBranchCondition(SmallVectorImpl &Cond) const override; - bool canInsertSelect(const MachineBasicBlock &, - const SmallVectorImpl &Cond, unsigned, - unsigned, int &, int &, int &) const override; - void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; - - /// analyzeCompare - For a comparison instruction, return the source registers - /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. - /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const override; - /// optimizeCompareInstr - Convert the instruction supplying the argument to - /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, - const MachineRegisterInfo *MRI) const override; - -private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, - MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const; -}; - -/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg -/// plus Offset. This is intended to be used from within the prolog/epilog -/// insertion (PEI) pass, where a virtual scratch register may be allocated -/// if necessary, to be replaced by the scavenger at the end of PEI. 
-void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, - MachineInstr::MIFlag = MachineInstr::NoFlags, - bool SetNZCV = false); - -/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the -/// FP. Return false if the offset could not be handled directly in MI, and -/// return the left-over portion by reference. -bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII); - -/// \brief Use to report the frame offset status in isARM64FrameOffsetLegal. -enum ARM64FrameOffsetStatus { - ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. - ARM64FrameOffsetIsLegal = 0x1, ///< Offset is legal. - ARM64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. -}; - -/// \brief Check if the @p Offset is a valid frame offset for @p MI. -/// The returned value reports the validity of the frame offset for @p MI. -/// It uses the values defined by ARM64FrameOffsetStatus for that. -/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to -/// use an offset.eq -/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. -/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the -/// amount that is off the limit of the legal offset. -/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be -/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. -/// If set, @p EmittableOffset contains the amount that can be set in @p MI -/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that -/// is a legal offset. -int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp = nullptr, - unsigned *OutUnscaledOp = nullptr, - int *EmittableOffset = nullptr); - -static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; } - -static inline bool isCondBranchOpcode(int Opc) { - switch (Opc) { - case ARM64::Bcc: - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - case ARM64::TBZW: - case ARM64::TBZX: - case ARM64::TBNZW: - case ARM64::TBNZX: - return true; - default: - return false; - } -} - -static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; } - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td deleted file mode 100644 index 4c735c057a27..000000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.td +++ /dev/null @@ -1,5184 +0,0 @@ -//===- ARM64InstrInfo.td - Describe the ARM64 Instructions -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// ARM64 Instruction definitions. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM Instruction Predicate Definitions. 
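For reference, a minimal standalone sketch of the offset-legality question that emitFrameOffset and isARM64FrameOffsetLegal above deal with, assuming the usual AArch64 encodings (a scaled unsigned 12-bit immediate for LDR/STR and a signed 9-bit unscaled immediate for LDUR/STUR); the function and enum names are illustrative, not part of the backend:

  #include <cstdint>

  enum OffsetKind { CannotEncode, ScaledImm, UnscaledImm };

  // Scaled form: a non-negative multiple of the access size whose quotient
  // fits in 12 bits. Unscaled form (the OutUseUnscaledOp case): a signed
  // 9-bit byte offset. Anything else needs a scratch register, which is
  // what emitFrameOffset arranges during PEI.
  OffsetKind classifyFrameOffset(int64_t Offset, unsigned AccessSize) {
    if (Offset >= 0 && (Offset % AccessSize) == 0 &&
        (Offset / AccessSize) < (1 << 12))
      return ScaledImm;
    if (Offset >= -256 && Offset <= 255)
      return UnscaledImm;
    return CannotEncode;
  }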
-// -def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; -def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "neon">; -def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; -def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; -def IsLE : Predicate<"Subtarget->isLittleEndian()">; -def IsBE : Predicate<"!Subtarget->isLittleEndian()">; - -//===----------------------------------------------------------------------===// -// ARM64-specific DAG Nodes. -// - -// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS -def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, SDTCisVT<1, i32>]>; - -// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>, - SDTCisVT<3, i32>]>; - -// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, - SDTCisVT<4, i32>]>; - -def SDT_ARM64Brcond : SDTypeProfile<0, 3, - [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>]>; -def SDT_ARM64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_ARM64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisVT<2, OtherVT>]>; - - -def SDT_ARM64CSel : SDTypeProfile<1, 4, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<3>, - SDTCisVT<4, i32>]>; -def SDT_ARM64FCmp : SDTypeProfile<0, 2, - [SDTCisFP<0>, - SDTCisSameAs<0, 1>]>; -def SDT_ARM64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDT_ARM64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; -def SDT_ARM64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def SDT_ARM64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; -def SDT_ARM64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDT_ARM64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisInt<2>, SDTCisInt<3>]>; -def SDT_ARM64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDT_ARM64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; - -def SDT_ARM64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64fcmpz : SDTypeProfile<1, 1, []>; -def SDT_ARM64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; -def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>; -def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>; -def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; -def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; - -def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; - -def SDT_ARM64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, - SDTCisPtrTy<1>]>; -def SDT_ARM64WrapperLarge : SDTypeProfile<1, 4, - [SDTCisVT<0, i64>, SDTCisVT<1, i32>, - SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, - SDTCisSameAs<1, 4>]>; - - -// Node definitions. 
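The flag-producing nodes defined below (ARM64adc, ARM64add_flag, ARM64sub_flag, ...) use the FlagsOut/FlagsIn profiles above, i.e. a data result plus an i32 NZCV value. A minimal standalone model of that second result for a 32-bit ADDS; the struct and flag packing are illustrative, not the backend's representation:

  #include <cstdint>

  struct AddsResult {
    uint32_t Value;
    uint8_t NZCV; // bit 3 = N, bit 2 = Z, bit 1 = C, bit 0 = V (illustrative packing)
  };

  AddsResult adds32(uint32_t A, uint32_t B) {
    uint64_t Wide = (uint64_t)A + (uint64_t)B;
    uint32_t R = (uint32_t)Wide;
    bool N = (R >> 31) & 1;                      // negative
    bool Z = R == 0;                             // zero
    bool C = Wide > 0xffffffffULL;               // unsigned carry out
    bool V = ((~(A ^ B) & (A ^ R)) >> 31) & 1;   // signed overflow
    return {R, (uint8_t)((N << 3) | (Z << 2) | (C << 1) | V)};
  }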
-def ARM64adrp : SDNode<"ARM64ISD::ADRP", SDTIntUnaryOp, []>; -def ARM64addlow : SDNode<"ARM64ISD::ADDlow", SDTIntBinOp, []>; -def ARM64LOADgot : SDNode<"ARM64ISD::LOADgot", SDTIntUnaryOp>; -def ARM64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, - [SDNPHasChain, SDNPOutGlue]>; -def ARM64callseq_end : SDNode<"ISD::CALLSEQ_END", - SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def ARM64call : SDNode<"ARM64ISD::CALL", - SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; -def ARM64brcond : SDNode<"ARM64ISD::BRCOND", SDT_ARM64Brcond, - [SDNPHasChain]>; -def ARM64cbz : SDNode<"ARM64ISD::CBZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64cbnz : SDNode<"ARM64ISD::CBNZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64tbz : SDNode<"ARM64ISD::TBZ", SDT_ARM64tbz, - [SDNPHasChain]>; -def ARM64tbnz : SDNode<"ARM64ISD::TBNZ", SDT_ARM64tbz, - [SDNPHasChain]>; - - -def ARM64csel : SDNode<"ARM64ISD::CSEL", SDT_ARM64CSel>; -def ARM64csinv : SDNode<"ARM64ISD::CSINV", SDT_ARM64CSel>; -def ARM64csneg : SDNode<"ARM64ISD::CSNEG", SDT_ARM64CSel>; -def ARM64csinc : SDNode<"ARM64ISD::CSINC", SDT_ARM64CSel>; -def ARM64retflag : SDNode<"ARM64ISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def ARM64adc : SDNode<"ARM64ISD::ADC", SDTBinaryArithWithFlagsIn >; -def ARM64sbc : SDNode<"ARM64ISD::SBC", SDTBinaryArithWithFlagsIn>; -def ARM64add_flag : SDNode<"ARM64ISD::ADDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64sub_flag : SDNode<"ARM64ISD::SUBS", SDTBinaryArithWithFlagsOut>; -def ARM64and_flag : SDNode<"ARM64ISD::ANDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64adc_flag : SDNode<"ARM64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; -def ARM64sbc_flag : SDNode<"ARM64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; - -def ARM64threadpointer : SDNode<"ARM64ISD::THREAD_POINTER", SDTPtrLeaf>; - -def ARM64fcmp : SDNode<"ARM64ISD::FCMP", SDT_ARM64FCmp>; - -def ARM64fmax : SDNode<"ARM64ISD::FMAX", SDTFPBinOp>; -def ARM64fmin : SDNode<"ARM64ISD::FMIN", SDTFPBinOp>; - -def ARM64dup : SDNode<"ARM64ISD::DUP", SDT_ARM64Dup>; -def ARM64duplane8 : SDNode<"ARM64ISD::DUPLANE8", SDT_ARM64DupLane>; -def ARM64duplane16 : SDNode<"ARM64ISD::DUPLANE16", SDT_ARM64DupLane>; -def ARM64duplane32 : SDNode<"ARM64ISD::DUPLANE32", SDT_ARM64DupLane>; -def ARM64duplane64 : SDNode<"ARM64ISD::DUPLANE64", SDT_ARM64DupLane>; - -def ARM64zip1 : SDNode<"ARM64ISD::ZIP1", SDT_ARM64Zip>; -def ARM64zip2 : SDNode<"ARM64ISD::ZIP2", SDT_ARM64Zip>; -def ARM64uzp1 : SDNode<"ARM64ISD::UZP1", SDT_ARM64Zip>; -def ARM64uzp2 : SDNode<"ARM64ISD::UZP2", SDT_ARM64Zip>; -def ARM64trn1 : SDNode<"ARM64ISD::TRN1", SDT_ARM64Zip>; -def ARM64trn2 : SDNode<"ARM64ISD::TRN2", SDT_ARM64Zip>; - -def ARM64movi_edit : SDNode<"ARM64ISD::MOVIedit", SDT_ARM64MOVIedit>; -def ARM64movi_shift : SDNode<"ARM64ISD::MOVIshift", SDT_ARM64MOVIshift>; -def ARM64movi_msl : SDNode<"ARM64ISD::MOVImsl", SDT_ARM64MOVIshift>; -def ARM64mvni_shift : SDNode<"ARM64ISD::MVNIshift", SDT_ARM64MOVIshift>; -def ARM64mvni_msl : SDNode<"ARM64ISD::MVNImsl", SDT_ARM64MOVIshift>; -def ARM64movi : SDNode<"ARM64ISD::MOVI", SDT_ARM64MOVIedit>; -def ARM64fmov : SDNode<"ARM64ISD::FMOV", SDT_ARM64MOVIedit>; - -def ARM64rev16 : SDNode<"ARM64ISD::REV16", SDT_ARM64UnaryVec>; -def ARM64rev32 : SDNode<"ARM64ISD::REV32", SDT_ARM64UnaryVec>; -def ARM64rev64 : SDNode<"ARM64ISD::REV64", SDT_ARM64UnaryVec>; -def ARM64ext : SDNode<"ARM64ISD::EXT", 
SDT_ARM64ExtVec>; - -def ARM64vashr : SDNode<"ARM64ISD::VASHR", SDT_ARM64vshift>; -def ARM64vlshr : SDNode<"ARM64ISD::VLSHR", SDT_ARM64vshift>; -def ARM64vshl : SDNode<"ARM64ISD::VSHL", SDT_ARM64vshift>; -def ARM64sqshli : SDNode<"ARM64ISD::SQSHL_I", SDT_ARM64vshift>; -def ARM64uqshli : SDNode<"ARM64ISD::UQSHL_I", SDT_ARM64vshift>; -def ARM64sqshlui : SDNode<"ARM64ISD::SQSHLU_I", SDT_ARM64vshift>; -def ARM64srshri : SDNode<"ARM64ISD::SRSHR_I", SDT_ARM64vshift>; -def ARM64urshri : SDNode<"ARM64ISD::URSHR_I", SDT_ARM64vshift>; - -def ARM64not: SDNode<"ARM64ISD::NOT", SDT_ARM64unvec>; -def ARM64bit: SDNode<"ARM64ISD::BIT", SDT_ARM64trivec>; -def ARM64bsl: SDNode<"ARM64ISD::BSL", SDT_ARM64trivec>; - -def ARM64cmeq: SDNode<"ARM64ISD::CMEQ", SDT_ARM64binvec>; -def ARM64cmge: SDNode<"ARM64ISD::CMGE", SDT_ARM64binvec>; -def ARM64cmgt: SDNode<"ARM64ISD::CMGT", SDT_ARM64binvec>; -def ARM64cmhi: SDNode<"ARM64ISD::CMHI", SDT_ARM64binvec>; -def ARM64cmhs: SDNode<"ARM64ISD::CMHS", SDT_ARM64binvec>; - -def ARM64fcmeq: SDNode<"ARM64ISD::FCMEQ", SDT_ARM64fcmp>; -def ARM64fcmge: SDNode<"ARM64ISD::FCMGE", SDT_ARM64fcmp>; -def ARM64fcmgt: SDNode<"ARM64ISD::FCMGT", SDT_ARM64fcmp>; - -def ARM64cmeqz: SDNode<"ARM64ISD::CMEQz", SDT_ARM64unvec>; -def ARM64cmgez: SDNode<"ARM64ISD::CMGEz", SDT_ARM64unvec>; -def ARM64cmgtz: SDNode<"ARM64ISD::CMGTz", SDT_ARM64unvec>; -def ARM64cmlez: SDNode<"ARM64ISD::CMLEz", SDT_ARM64unvec>; -def ARM64cmltz: SDNode<"ARM64ISD::CMLTz", SDT_ARM64unvec>; -def ARM64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (ARM64not (ARM64cmeqz (and node:$LHS, node:$RHS)))>; - -def ARM64fcmeqz: SDNode<"ARM64ISD::FCMEQz", SDT_ARM64fcmpz>; -def ARM64fcmgez: SDNode<"ARM64ISD::FCMGEz", SDT_ARM64fcmpz>; -def ARM64fcmgtz: SDNode<"ARM64ISD::FCMGTz", SDT_ARM64fcmpz>; -def ARM64fcmlez: SDNode<"ARM64ISD::FCMLEz", SDT_ARM64fcmpz>; -def ARM64fcmltz: SDNode<"ARM64ISD::FCMLTz", SDT_ARM64fcmpz>; - -def ARM64bici: SDNode<"ARM64ISD::BICi", SDT_ARM64vecimm>; -def ARM64orri: SDNode<"ARM64ISD::ORRi", SDT_ARM64vecimm>; - -def ARM64neg : SDNode<"ARM64ISD::NEG", SDT_ARM64unvec>; - -def ARM64tcret: SDNode<"ARM64ISD::TC_RETURN", SDT_ARM64TCRET, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - -def ARM64Prefetch : SDNode<"ARM64ISD::PREFETCH", SDT_ARM64PREFETCH, - [SDNPHasChain, SDNPSideEffect]>; - -def ARM64sitof: SDNode<"ARM64ISD::SITOF", SDT_ARM64ITOF>; -def ARM64uitof: SDNode<"ARM64ISD::UITOF", SDT_ARM64ITOF>; - -def ARM64tlsdesc_call : SDNode<"ARM64ISD::TLSDESC_CALL", SDT_ARM64TLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; - -def ARM64WrapperLarge : SDNode<"ARM64ISD::WrapperLarge", SDT_ARM64WrapperLarge>; - - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// - -// ARM64 Instruction Predicate Definitions. -// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; -def ForCodeSize : Predicate<"ForCodeSize">; -def NotForCodeSize : Predicate<"!ForCodeSize">; - -include "ARM64InstrFormats.td" - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Miscellaneous instructions. 
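The ARM64cmtst fragment above is spelled as not(cmeqz(and(a, b))); a per-element model of what that computes (all ones when the AND of the inputs is non-zero, else zero), with an illustrative helper name:

  #include <cstdint>

  uint32_t cmtstLane32(uint32_t A, uint32_t B) {
    uint32_t AndAB = A & B;                            // and
    uint32_t EqZero = (AndAB == 0) ? 0xffffffffu : 0;  // cmeqz
    return ~EqZero;                                    // not
  }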
-//===----------------------------------------------------------------------===// - -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(ARM64callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(ARM64callseq_end timm:$amt1, timm:$amt2)]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 - -let isReMaterializable = 1, isCodeGenOnly = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, they can be -// removed, along with the ARM64Wrapper node. - -let AddedComplexity = 10 in -def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), - [(set GPR64:$dst, (ARM64LOADgot tglobaladdr:$addr))]>, - Sched<[WriteLDAdr]>; - -// The MOVaddr instruction should match only when the add is not folded -// into a load or store address. -def MOVaddr - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaladdr:$hi), - tglobaladdr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrJT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tjumptable:$hi), - tjumptable:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrCP - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tconstpool:$hi), - tconstpool:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrBA - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tblockaddress:$hi), - tblockaddress:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrTLS - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaltlsaddr:$hi), - tglobaltlsaddr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrEXT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp texternalsym:$hi), - texternalsym:$low))]>, - Sched<[WriteAdrAdr]>; - -} // isReMaterializable, isCodeGenOnly - -def : Pat<(ARM64LOADgot tglobaltlsaddr:$addr), - (LOADgot tglobaltlsaddr:$addr)>; - -def : Pat<(ARM64LOADgot texternalsym:$addr), - (LOADgot texternalsym:$addr)>; - -def : Pat<(ARM64LOADgot tconstpool:$addr), - (LOADgot tconstpool:$addr)>; - -//===----------------------------------------------------------------------===// -// System instructions. -//===----------------------------------------------------------------------===// - -def HINT : HintI<"hint">; -def : InstAlias<"nop", (HINT 0b000)>; -def : InstAlias<"yield",(HINT 0b001)>; -def : InstAlias<"wfe", (HINT 0b010)>; -def : InstAlias<"wfi", (HINT 0b011)>; -def : InstAlias<"sev", (HINT 0b100)>; -def : InstAlias<"sevl", (HINT 0b101)>; - - // As far as LLVM is concerned this writes to the system's exclusive monitors. -let mayLoad = 1, mayStore = 1 in -def CLREX : CRmSystemI; - -def DMB : CRmSystemI; -def DSB : CRmSystemI; -def ISB : CRmSystemI; -def : InstAlias<"clrex", (CLREX 0xf)>; -def : InstAlias<"isb", (ISB 0xf)>; - -def MRS : MRSI; -def MSR : MSRI; -def MSRpstate: MSRpstateI; - -// The thread pointer (on Linux, at least, where this has been implemented) is -// TPIDR_EL0. 
-def : Pat<(ARM64threadpointer), (MRS 0xde82)>; - -// Generic system instructions -def SYSxt : SystemXtI<0, "sys">; -def SYSLxt : SystemLXtI<1, "sysl">; - -def : InstAlias<"sys $op1, $Cn, $Cm, $op2", - (SYSxt imm0_7:$op1, sys_cr_op:$Cn, - sys_cr_op:$Cm, imm0_7:$op2, XZR)>; - -//===----------------------------------------------------------------------===// -// Move immediate instructions. -//===----------------------------------------------------------------------===// - -defm MOVK : InsertImmediate<0b11, "movk">; -defm MOVN : MoveImmediate<0b00, "movn">; - -let PostEncoderMethod = "fixMOVZ" in -defm MOVZ : MoveImmediate<0b10, "movz">; - -// First group of aliases covers an implicit "lsl #0". -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; - -// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; - -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; - -// Final group of aliases covers true "mov $Rd, $imm" cases. 
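The ":XYZ_g0:"-style ELF relocation aliases above and the ARM64WrapperLarge expansion that follows both build a 64-bit value 16 bits at a time. A standalone sketch of the MOVZ/MOVK semantics involved; the function names are illustrative:

  #include <cstdint>

  uint64_t movz(uint16_t Imm, unsigned Shift) {              // movz xd, #Imm, lsl #Shift
    return (uint64_t)Imm << Shift;
  }
  uint64_t movk(uint64_t Xd, uint16_t Imm, unsigned Shift) { // movk xd, #Imm, lsl #Shift
    Xd &= ~(0xffffULL << Shift);                             // keep the other 48 bits
    return Xd | ((uint64_t)Imm << Shift);
  }

  uint64_t materialize64(uint64_t V) {
    uint64_t X = movz((uint16_t)(V >> 48), 48);   // g3
    X = movk(X, (uint16_t)(V >> 32), 32);         // g2
    X = movk(X, (uint16_t)(V >> 16), 16);         // g1
    X = movk(X, (uint16_t)(V >> 0), 0);           // g0
    return X;                                     // == V
  }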
-multiclass movw_mov_alias { - def _asmoperand : AsmOperandClass { - let Name = basename # width # "_lsl" # shift # "MovAlias"; - let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " - # shift # ">"; - let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; - } - - def _movimm : Operand { - let ParserMatchClass = !cast(NAME # "_asmoperand"); - } - - def : InstAlias<"mov $Rd, $imm", - (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; -} - -defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; -defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; - -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; - -defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; -defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; - -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; - -let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, - isAsCheapAsAMove = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, we can select -// directly to the real instructions and get rid of these pseudos. - -def MOVi32imm - : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), - [(set GPR32:$dst, imm:$src)]>, - Sched<[WriteImm]>; -def MOVi64imm - : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), - [(set GPR64:$dst, imm:$src)]>, - Sched<[WriteImm]>; -} // isReMaterializable, isCodeGenOnly - -// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the -// eventual expansion code fewer bits to worry about getting right. Marshalling -// the types is a little tricky though: -def i64imm_32bit : ImmLeaf(Imm); -}]>; - -def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); -}]>; - -def : Pat<(i64 i64imm_32bit:$src), - (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; - -// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK -// sequences. -def : Pat<(ARM64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, - tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, - tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tconstpool:$g3, tconstpool:$g2, - tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tjumptable:$g3, tjumptable:$g2, - tjumptable:$g1, tjumptable:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), - tjumptable:$g2, 32), - tjumptable:$g1, 16), - tjumptable:$g0, 0)>; - - -//===----------------------------------------------------------------------===// -// Arithmetic instructions. -//===----------------------------------------------------------------------===// - -// Add/subtract with carry. 
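The MOVi32imm shortcut above extends to 64-bit immediates because a write to a W register zeroes the upper half of the X register, so SUBREG_TO_REG with 0 adds no code. A minimal sketch of when that rewrite is sound; the helper name is illustrative:

  #include <cstdint>

  bool fitsIn32BitMove(uint64_t Imm) {
    return (Imm & 0xffffffffULL) == Imm;   // upper 32 bits already zero
  }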
-defm ADC : AddSubCarry<0, "adc", "adcs", ARM64adc, ARM64adc_flag>; -defm SBC : AddSubCarry<1, "sbc", "sbcs", ARM64sbc, ARM64sbc_flag>; - -def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; - -// Add/subtract -defm ADD : AddSub<0, "add", add>; -defm SUB : AddSub<1, "sub">; - -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; - -defm ADDS : AddSubS<0, "adds", ARM64add_flag, "cmn">; -defm SUBS : AddSubS<1, "subs", ARM64sub_flag, "cmp">; - -// Use SUBS instead of SUB to enable CSE between SUBS and SUB. -def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), - (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), - (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, GPR32:$Rm), - (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; -def : Pat<(sub GPR64:$Rn, GPR64:$Rm), - (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; -def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), - (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; -def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), - (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. -let AddedComplexity = 1 in { -def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. 
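A standalone sketch of the rewrite described in the comment above, assuming the usual add/sub immediate encoding (an unsigned 12-bit value, optionally shifted left by 12); the struct and helper are illustrative, not backend code:

  #include <cstdint>

  struct AddSubImm {
    bool IsSub;     // true: emit SUB instead of ADD
    uint64_t Imm;   // magnitude to encode
  };

  bool selectAddImm(int64_t C, AddSubImm &Out) {
    uint64_t Mag = C < 0 ? 0 - (uint64_t)C : (uint64_t)C;
    if (Mag < (1ULL << 12) || ((Mag & 0xfff) == 0 && Mag < (1ULL << 24))) {
      Out = {C < 0, Mag};            // add x, #-1  becomes  sub x, #1
      return true;
    }
    return false;                    // not encodable as imm12 or imm12, lsl #12
  }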
-let AddedComplexity = 1 in { -def : Pat<(ARM64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(ARM64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; -def : InstAlias<"neg $dst, $src$shift", - (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; -def : InstAlias<"neg $dst, $src$shift", - (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - -def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; -def : InstAlias<"negs $dst, $src$shift", - (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; -def : InstAlias<"negs $dst, $src$shift", - (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - - -// Unsigned/Signed divide -defm UDIV : Div<0, "udiv", udiv>; -defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_arm64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_arm64_sdiv>; -} - -// Variable shift -defm ASRV : Shift<0b10, "asr", sra>; -defm LSLV : Shift<0b00, "lsl", shl>; -defm LSRV : Shift<0b01, "lsr", srl>; -defm RORV : Shift<0b11, "ror", rotr>; - -def : ShiftAlias<"asrv", ASRVWr, GPR32>; -def : ShiftAlias<"asrv", ASRVXr, GPR64>; -def : ShiftAlias<"lslv", LSLVWr, GPR32>; -def : ShiftAlias<"lslv", LSLVXr, GPR64>; -def : ShiftAlias<"lsrv", LSRVWr, GPR32>; -def : ShiftAlias<"lsrv", LSRVXr, GPR64>; -def : ShiftAlias<"rorv", RORVWr, GPR32>; -def : ShiftAlias<"rorv", RORVXr, GPR64>; - -// Multiply-add -let AddedComplexity = 7 in { -defm MADD : MulAccum<0, "madd", add>; -defm MSUB : MulAccum<1, "msub", sub>; - -def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), - (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), - (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; - -def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), - (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), - (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 - -let AddedComplexity = 5 in { -def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; -def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; -def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; -def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; - -def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), - (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), - (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; - -def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), - (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), - (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -} // AddedComplexity = 5 - -def : MulAccumWAlias<"mul", MADDWrrr>; -def : MulAccumXAlias<"mul", MADDXrrr>; -def : MulAccumWAlias<"mneg", MSUBWrrr>; -def : MulAccumXAlias<"mneg", MSUBXrrr>; -def : WideMulAccumAlias<"smull", SMADDLrrr>; -def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; -def : 
WideMulAccumAlias<"umull", UMADDLrrr>; -def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; - -// Multiply-high -def SMULHrr : MulHi<0b010, "smulh", mulhs>; -def UMULHrr : MulHi<0b110, "umulh", mulhu>; - -// CRC32 -def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_arm64_crc32b, "crc32b">; -def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_arm64_crc32h, "crc32h">; -def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_arm64_crc32w, "crc32w">; -def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_arm64_crc32x, "crc32x">; - -def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_arm64_crc32cb, "crc32cb">; -def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_arm64_crc32ch, "crc32ch">; -def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_arm64_crc32cw, "crc32cw">; -def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_arm64_crc32cx, "crc32cx">; - - -//===----------------------------------------------------------------------===// -// Logical instructions. -//===----------------------------------------------------------------------===// - -// (immediate) -defm ANDS : LogicalImmS<0b11, "ands", ARM64and_flag>; -defm AND : LogicalImm<0b00, "and", and>; -defm EOR : LogicalImm<0b10, "eor", xor>; -defm ORR : LogicalImm<0b01, "orr", or>; - -// FIXME: these aliases *are* canonical sometimes (when movz can't be -// used). Actually, it seems to be working right now, but putting logical_immXX -// here is a bit dodgy on the AsmParser side too. -def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, - logical_imm32:$imm), 0>; -def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, - logical_imm64:$imm), 0>; - - -// (register) -defm ANDS : LogicalRegS<0b11, 0, "ands", ARM64and_flag>; -defm BICS : LogicalRegS<0b11, 1, "bics", - BinOpFrag<(ARM64and_flag node:$LHS, (not node:$RHS))>>; -defm AND : LogicalReg<0b00, 0, "and", and>; -defm BIC : LogicalReg<0b00, 1, "bic", - BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; -defm EOR : LogicalReg<0b10, 0, "eor", xor>; -defm ORN : LogicalReg<0b01, 1, "orn", - BinOpFrag<(or node:$LHS, (not node:$RHS))>>; -defm ORR : LogicalReg<0b01, 0, "orr", or>; - -def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; -def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; - -def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; -def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; - -def : InstAlias<"mvn $Wd, $Wm$sh", - (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; -def : InstAlias<"mvn $Xd, $Xm$sh", - (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; -def : InstAlias<"tst $src1, $src2", - (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; -def : InstAlias<"tst $src1, $src2", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; - -def : InstAlias<"tst $src1, $src2$sh", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; -def : InstAlias<"tst $src1, $src2$sh", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; - - -def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; -def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; - - -//===----------------------------------------------------------------------===// -// One operand data processing instructions. 
-//===----------------------------------------------------------------------===// - -defm CLS : OneOperandData<0b101, "cls">; -defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; -def REV16Wr : OneWRegData<0b001, "rev16", - UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; -def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; - -def : Pat<(cttz GPR32:$Rn), - (CLZWr (RBITWr GPR32:$Rn))>; -def : Pat<(cttz GPR64:$Rn), - (CLZXr (RBITXr GPR64:$Rn))>; -def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), - (i32 1))), - (CLSWr GPR32:$Rn)>; -def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), - (i64 1))), - (CLSXr GPR64:$Rn)>; - -// Unlike the other one operand instructions, the instructions with the "rev" -// mnemonic do *not* just different in the size bit, but actually use different -// opcode bits for the different sizes. -def REVWr : OneWRegData<0b010, "rev", bswap>; -def REVXr : OneXRegData<0b011, "rev", bswap>; -def REV32Xr : OneXRegData<0b010, "rev32", - UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; - -// The bswap commutes with the rotr so we want a pattern for both possible -// orders. -def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; -def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; - -//===----------------------------------------------------------------------===// -// Bitfield immediate extraction instruction. -//===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in -defm EXTR : ExtractImm<"extr">; -def : InstAlias<"ror $dst, $src, $shift", - (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; -def : InstAlias<"ror $dst, $src, $shift", - (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; - -def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), - (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; -def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), - (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; - -//===----------------------------------------------------------------------===// -// Other bitfield immediate instructions. -//===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { -defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; -defm SBFM : BitfieldImm<0b00, "sbfm">; -defm UBFM : BitfieldImm<0b10, "ubfm">; -} - -def i32shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x1f; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i32shift_b : Operand, SDNodeXFormgetZExtValue(); - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(7, 31 - shift_amt) -def i32shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 7 ? 7 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(15, 31 - shift_amt) -def i32shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 15 ? 15 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i64shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x3f; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i64shift_b : Operand, SDNodeXFormgetZExtValue(); - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(7, 63 - shift_amt) -def i64shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 7 ? 7 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(15, 63 - shift_amt) -def i64shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 15 ? 
15 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(31, 63 - shift_amt) -def i64shift_sext_i32 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 31 ? 31 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), - (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_b imm0_31:$imm)))>; -def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), - (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_b imm0_63:$imm)))>; - -let AddedComplexity = 10 in { -def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; -def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; -} - -def : InstAlias<"asr $dst, $src, $shift", - (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; -def : InstAlias<"asr $dst, $src, $shift", - (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; -def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; -def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; -def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; -def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; -def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; - -def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), - (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; -def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), - (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; - -def : InstAlias<"lsr $dst, $src, $shift", - (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; -def : InstAlias<"lsr $dst, $src, $shift", - (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; -def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; -def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; -def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; -def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; -def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; - -//===----------------------------------------------------------------------===// -// Conditionally set flags instructions. -//===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; - -//===----------------------------------------------------------------------===// -// Conditional select instructions. 
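For the conditional-select aliases defined just below (cset, csetm, cinc, ...), a minimal model of the CSINC trick they rely on: csinc Rd, Rn, Rm, cond yields Rn when cond holds and Rm + 1 otherwise, so cset with the inverted condition produces 1 exactly when the original condition holds. The helpers are illustrative, not backend code:

  #include <cstdint>

  uint32_t csinc32(bool Cond, uint32_t Rn, uint32_t Rm) {
    return Cond ? Rn : Rm + 1;
  }

  uint32_t cset32(bool CC) {
    const uint32_t WZR = 0;
    return csinc32(!CC /* inverted condition, as the aliases do */, WZR, WZR); // 1 if CC, else 0
  }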
-//===----------------------------------------------------------------------===// -defm CSEL : CondSelect<0, 0b00, "csel">; - -def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; -defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; -defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; -defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; - -def : Pat<(ARM64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; - -def : Pat<(ARM64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), - (CSINCWr WZR, WZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), - (CSINCXr XZR, XZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), - (CSINVWr WZR, WZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), - (CSINVXr XZR, XZR, (i32 imm:$cc))>; - -// The inverse of the condition code from the alias instruction is what is used -// in the aliased instruction. The parser all ready inverts the condition code -// for these aliases. -def : InstAlias<"cset $dst, $cc", - (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; -def : InstAlias<"cset $dst, $cc", - (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; - -def : InstAlias<"csetm $dst, $cc", - (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; -def : InstAlias<"csetm $dst, $cc", - (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; - -def : InstAlias<"cinc $dst, $src, $cc", - (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cinc $dst, $src, $cc", - (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -def : InstAlias<"cinv $dst, $src, $cc", - (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cinv $dst, $src, $cc", - (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -def : InstAlias<"cneg $dst, $src, $cc", - (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cneg $dst, $src, $cc", - (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -//===----------------------------------------------------------------------===// -// PC-relative instructions. -//===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { -let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { -def ADR : ADRI<0, "adr", adrlabel, []>; -} // neverHasSideEffects = 1 - -def ADRP : ADRI<1, "adrp", adrplabel, - [(set GPR64:$Xd, (ARM64adrp tglobaladdr:$label))]>; -} // isReMaterializable = 1 - -// page address of a constant pool entry, block address -def : Pat<(ARM64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; -def : Pat<(ARM64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (register) instructions. 
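The ADRP patterns above pair with ARM64addlow (see MOVaddr earlier): ADRP materializes the 4 KB page of the target, and the low 12 bits are added separately or folded into a load/store offset. A standalone sketch of that split, with illustrative helper names:

  #include <cstdint>

  uint64_t pageOf(uint64_t Addr)  { return Addr & ~0xfffULL; }  // what ADRP produces
  uint64_t pageOff(uint64_t Addr) { return Addr &  0xfffULL; }  // the low-12-bits part

  uint64_t materializeAddress(uint64_t Sym) {
    return pageOf(Sym) + pageOff(Sym);  // adrp xd, sym ; add xd, xd, #lo12
  }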
-//===----------------------------------------------------------------------===// - -let isReturn = 1, isTerminator = 1, isBarrier = 1 in { -def RET : BranchReg<0b0010, "ret", []>; -def DRPS : SpecialReturn<0b0101, "drps">; -def ERET : SpecialReturn<0b0100, "eret">; -} // isReturn = 1, isTerminator = 1, isBarrier = 1 - -// Default to the LR register. -def : InstAlias<"ret", (RET LR)>; - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(ARM64call GPR64:$Rn)]>; -} // isCall - -let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { -def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; -} // isBranch, isTerminator, isBarrier, isIndirectBranch - -// Create a separate pseudo-instruction for codegen to use so that we don't -// flag lr as used in every function. It'll be restored before the RET by the -// epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(ARM64retflag)]> { - let isTerminator = 1; - let isBarrier = 1; - let isReturn = 1; -} - -// This is a directive-like pseudo-instruction. The purpose is to insert an -// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction -// (which in the usual case is a BLR). -let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { - let AsmString = ".tlsdesccall $sym"; -} - -// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It -// gets expanded to two MCInsts during lowering. -let isCall = 1, Defs = [LR] in -def TLSDESC_BLR - : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), - [(ARM64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; - -def : Pat<(ARM64tlsdesc_call GPR64:$dest, texternalsym:$sym), - (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; -//===----------------------------------------------------------------------===// -// Conditional branch (immediate) instruction. -//===----------------------------------------------------------------------===// -def Bcc : BranchCond; - -//===----------------------------------------------------------------------===// -// Compare-and-branch instructions. -//===----------------------------------------------------------------------===// -defm CBZ : CmpBranch<0, "cbz", ARM64cbz>; -defm CBNZ : CmpBranch<1, "cbnz", ARM64cbnz>; - -//===----------------------------------------------------------------------===// -// Test-bit-and-branch instructions. -//===----------------------------------------------------------------------===// -defm TBZ : TestBranch<0, "tbz", ARM64tbz>; -defm TBNZ : TestBranch<1, "tbnz", ARM64tbnz>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (immediate) instructions. -//===----------------------------------------------------------------------===// -let isBranch = 1, isTerminator = 1, isBarrier = 1 in { -def B : BranchImm<0, "b", [(br bb:$addr)]>; -} // isBranch, isTerminator, isBarrier - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BL : CallImm<1, "bl", [(ARM64call tglobaladdr:$addr)]>; -} // isCall -def : Pat<(ARM64call texternalsym:$func), (BL texternalsym:$func)>; - -//===----------------------------------------------------------------------===// -// Exception generation instructions. 
-//===----------------------------------------------------------------------===// -def BRK : ExceptionGeneration<0b001, 0b00, "brk">; -def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; -def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; -def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; -def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; -def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; -def SMC : ExceptionGeneration<0b000, 0b11, "smc">; -def SVC : ExceptionGeneration<0b000, 0b01, "svc">; - -// DCPSn defaults to an immediate operand of zero if unspecified. -def : InstAlias<"dcps1", (DCPS1 0)>; -def : InstAlias<"dcps2", (DCPS2 0)>; -def : InstAlias<"dcps3", (DCPS3 0)>; - -//===----------------------------------------------------------------------===// -// Load instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -def LDPWi : LoadPairOffset<0b00, 0, GPR32, am_indexed32simm7, "ldp">; -def LDPXi : LoadPairOffset<0b10, 0, GPR64, am_indexed64simm7, "ldp">; -def LDPSi : LoadPairOffset<0b00, 1, FPR32, am_indexed32simm7, "ldp">; -def LDPDi : LoadPairOffset<0b01, 1, FPR64, am_indexed64simm7, "ldp">; -def LDPQi : LoadPairOffset<0b10, 1, FPR128, am_indexed128simm7, "ldp">; - -def LDPSWi : LoadPairOffset<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">; - -// Pair (pre-indexed) -def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, am_indexed32simm7_wb, "ldp">; -def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, am_indexed64simm7_wb, "ldp">; -def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, am_indexed32simm7_wb, "ldp">; -def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, am_indexed64simm7_wb, "ldp">; -def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, am_indexed128simm7_wb, "ldp">; - -def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, am_indexed32simm7_wb, "ldpsw">; - -// Pair (post-indexed) -def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; -def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; -def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; -def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; -def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; - -def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; - - -// Pair (no allocate) -def LDNPWi : LoadPairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "ldnp">; -def LDNPXi : LoadPairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "ldnp">; -def LDNPSi : LoadPairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "ldnp">; -def LDNPDi : LoadPairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "ldnp">; -def LDNPQi : LoadPairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "ldnp">; - -//--- -// (register offset) -//--- - -let AddedComplexity = 10 in { -// Integer -def LDRBBro : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", - [(set GPR32:$Rt, (zextloadi8 ro_indexed8:$addr))]>; -def LDRHHro : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", - [(set GPR32:$Rt, (zextloadi16 ro_indexed16:$addr))]>; -def LDRWro : Load32RO<0b10, 0, 0b01, GPR32, "ldr", - [(set GPR32:$Rt, (load ro_indexed32:$addr))]>; -def LDRXro : Load64RO<0b11, 0, 0b01, GPR64, "ldr", - [(set GPR64:$Rt, (load ro_indexed64:$addr))]>; - -// Floating-point -def LDRBro : Load8RO<0b00, 1, 0b01, FPR8, "ldr", - [(set FPR8:$Rt, (load ro_indexed8:$addr))]>; -def LDRHro : Load16RO<0b01, 1, 0b01, FPR16, "ldr", - [(set (f16 FPR16:$Rt), (load ro_indexed16:$addr))]>; -def LDRSro : Load32RO<0b10, 1, 0b01, FPR32, "ldr", - [(set (f32 FPR32:$Rt), (load ro_indexed32:$addr))]>; -def LDRDro : Load64RO<0b11, 1, 0b01, FPR64, "ldr", - [(set (f64 
FPR64:$Rt), (load ro_indexed64:$addr))]>; -def LDRQro : Load128RO<0b00, 1, 0b11, FPR128, "ldr", []> { - let mayLoad = 1; -} - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. -def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))), - (LDRDro ro_indexed64:$addr)>; -def : Pat <(v2i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDro ro_indexed64:$addr), dsub)>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must do vector loads with LD1 in big-endian. - def : Pat<(v2f32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; - def : Pat<(v8i8 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; - def : Pat<(v4i16 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; - def : Pat<(v2i32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -} -def : Pat<(v1f64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -def : Pat<(v1i64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must do vector loads with LD1 in big-endian. - def : Pat<(v4f32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - def : Pat<(v2f64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - def : Pat<(v16i8 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - def : Pat<(v8i16 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - def : Pat<(v4i32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - def : Pat<(v2i64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -} -def : Pat<(f128 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - -// Load sign-extended half-word -def LDRSHWro : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", - [(set GPR32:$Rt, (sextloadi16 ro_indexed16:$addr))]>; -def LDRSHXro : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", - [(set GPR64:$Rt, (sextloadi16 ro_indexed16:$addr))]>; - -// Load sign-extended byte -def LDRSBWro : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", - [(set GPR32:$Rt, (sextloadi8 ro_indexed8:$addr))]>; -def LDRSBXro : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", - [(set GPR64:$Rt, (sextloadi8 ro_indexed8:$addr))]>; - -// Load sign-extended word -def LDRSWro : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", - [(set GPR64:$Rt, (sextloadi32 ro_indexed32:$addr))]>; - -// Pre-fetch. 
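The register-offset (ro_indexed) loads above compute their address from a base register plus an index that may be sign- or zero-extended from 32 bits and shifted by log2 of the access size. A standalone sketch of that effective-address calculation; the enum and helper are illustrative:

  #include <cstdint>

  enum Extend { UXTW, SXTW, LSL64 };

  uint64_t roAddress(uint64_t Base, uint64_t Index, Extend Ext,
                     unsigned Log2Size, bool Scaled) {
    uint64_t Off = Index;                                      // LSL64: full X index
    if (Ext == UXTW) Off = (uint32_t)Index;                    // zero-extend a W index
    if (Ext == SXTW) Off = (uint64_t)(int64_t)(int32_t)Index;  // sign-extend a W index
    return Base + (Off << (Scaled ? Log2Size : 0));
  }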
-def PRFMro : PrefetchRO<0b11, 0, 0b10, "prfm", - [(ARM64Prefetch imm:$Rt, ro_indexed64:$addr)]>; - -// zextload -> i64 -def : Pat<(i64 (zextloadi8 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; -def : Pat<(i64 (zextloadi16 ro_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>; -def : Pat<(i64 (zextloadi32 ro_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWro ro_indexed32:$addr), sub_32)>; - -// zextloadi1 -> zextloadi8 -def : Pat<(i32 (zextloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>; -def : Pat<(i64 (zextloadi1 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; - -// extload -> zextload -def : Pat<(i32 (extloadi16 ro_indexed16:$addr)), (LDRHHro ro_indexed16:$addr)>; -def : Pat<(i32 (extloadi8 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>; -def : Pat<(i32 (extloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>; -def : Pat<(i64 (extloadi32 ro_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWro ro_indexed32:$addr), sub_32)>; -def : Pat<(i64 (extloadi16 ro_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>; -def : Pat<(i64 (extloadi8 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; -def : Pat<(i64 (extloadi1 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; - -} // AddedComplexity = 10 - -//--- -// (unsigned immediate) -//--- -def LDRXui : LoadUI<0b11, 0, 0b01, GPR64, am_indexed64, "ldr", - [(set GPR64:$Rt, (load am_indexed64:$addr))]>; -def LDRWui : LoadUI<0b10, 0, 0b01, GPR32, am_indexed32, "ldr", - [(set GPR32:$Rt, (load am_indexed32:$addr))]>; -def LDRBui : LoadUI<0b00, 1, 0b01, FPR8, am_indexed8, "ldr", - [(set FPR8:$Rt, (load am_indexed8:$addr))]>; -def LDRHui : LoadUI<0b01, 1, 0b01, FPR16, am_indexed16, "ldr", - [(set (f16 FPR16:$Rt), (load am_indexed16:$addr))]>; -def LDRSui : LoadUI<0b10, 1, 0b01, FPR32, am_indexed32, "ldr", - [(set (f32 FPR32:$Rt), (load am_indexed32:$addr))]>; -def LDRDui : LoadUI<0b11, 1, 0b01, FPR64, am_indexed64, "ldr", - [(set (f64 FPR64:$Rt), (load am_indexed64:$addr))]>; -def LDRQui : LoadUI<0b00, 1, 0b11, FPR128, am_indexed128, "ldr", - [(set (f128 FPR128:$Rt), (load am_indexed128:$addr))]>; - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. 
-def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))), - (LDRDui am_indexed64:$addr)>; -def : Pat <(v2i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDui am_indexed64:$addr), dsub)>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use LD1 to perform vector loads in big-endian. - def : Pat<(v2f32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; - def : Pat<(v8i8 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; - def : Pat<(v4i16 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; - def : Pat<(v2i32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -} -def : Pat<(v1f64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -def : Pat<(v1i64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use LD1 to perform vector loads in big-endian. 
- def : Pat<(v4f32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - def : Pat<(v2f64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - def : Pat<(v16i8 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - def : Pat<(v8i16 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - def : Pat<(v4i32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - def : Pat<(v2i64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -} -def : Pat<(f128 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - -def LDRHHui : LoadUI<0b01, 0, 0b01, GPR32, am_indexed16, "ldrh", - [(set GPR32:$Rt, (zextloadi16 am_indexed16:$addr))]>; -def LDRBBui : LoadUI<0b00, 0, 0b01, GPR32, am_indexed8, "ldrb", - [(set GPR32:$Rt, (zextloadi8 am_indexed8:$addr))]>; -// zextload -> i64 -def : Pat<(i64 (zextloadi8 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; -def : Pat<(i64 (zextloadi16 am_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>; - -// zextloadi1 -> zextloadi8 -def : Pat<(i32 (zextloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>; -def : Pat<(i64 (zextloadi1 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; - -// extload -> zextload -def : Pat<(i32 (extloadi16 am_indexed16:$addr)), (LDRHHui am_indexed16:$addr)>; -def : Pat<(i32 (extloadi8 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>; -def : Pat<(i32 (extloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>; -def : Pat<(i64 (extloadi32 am_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>; -def : Pat<(i64 (extloadi16 am_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>; -def : Pat<(i64 (extloadi8 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; -def : Pat<(i64 (extloadi1 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; - -// load sign-extended half-word -def LDRSHWui : LoadUI<0b01, 0, 0b11, GPR32, am_indexed16, "ldrsh", - [(set GPR32:$Rt, (sextloadi16 am_indexed16:$addr))]>; -def LDRSHXui : LoadUI<0b01, 0, 0b10, GPR64, am_indexed16, "ldrsh", - [(set GPR64:$Rt, (sextloadi16 am_indexed16:$addr))]>; - -// load sign-extended byte -def LDRSBWui : LoadUI<0b00, 0, 0b11, GPR32, am_indexed8, "ldrsb", - [(set GPR32:$Rt, (sextloadi8 am_indexed8:$addr))]>; -def LDRSBXui : LoadUI<0b00, 0, 0b10, GPR64, am_indexed8, "ldrsb", - [(set GPR64:$Rt, (sextloadi8 am_indexed8:$addr))]>; - -// load sign-extended word -def LDRSWui : LoadUI<0b10, 0, 0b10, GPR64, am_indexed32, "ldrsw", - [(set GPR64:$Rt, (sextloadi32 am_indexed32:$addr))]>; - -// load zero-extended word -def : Pat<(i64 (zextloadi32 am_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>; - -// Pre-fetch. 
-def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", - [(ARM64Prefetch imm:$Rt, am_indexed64:$addr)]>; - -//--- -// (literal) -def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; -def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; -def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; -def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; -def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; - -// load sign-extended word -def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; - -// prefetch -def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; -// [(ARM64Prefetch imm:$Rt, tglobaladdr:$label)]>; - -//--- -// (unscaled immediate) -def LDURXi : LoadUnscaled<0b11, 0, 0b01, GPR64, am_unscaled64, "ldur", - [(set GPR64:$Rt, (load am_unscaled64:$addr))]>; -def LDURWi : LoadUnscaled<0b10, 0, 0b01, GPR32, am_unscaled32, "ldur", - [(set GPR32:$Rt, (load am_unscaled32:$addr))]>; -def LDURBi : LoadUnscaled<0b00, 1, 0b01, FPR8, am_unscaled8, "ldur", - [(set FPR8:$Rt, (load am_unscaled8:$addr))]>; -def LDURHi : LoadUnscaled<0b01, 1, 0b01, FPR16, am_unscaled16, "ldur", - [(set (f16 FPR16:$Rt), (load am_unscaled16:$addr))]>; -def LDURSi : LoadUnscaled<0b10, 1, 0b01, FPR32, am_unscaled32, "ldur", - [(set (f32 FPR32:$Rt), (load am_unscaled32:$addr))]>; -def LDURDi : LoadUnscaled<0b11, 1, 0b01, FPR64, am_unscaled64, "ldur", - [(set (f64 FPR64:$Rt), (load am_unscaled64:$addr))]>; -def LDURQi : LoadUnscaled<0b00, 1, 0b11, FPR128, am_unscaled128, "ldur", - [(set (f128 FPR128:$Rt), (load am_unscaled128:$addr))]>; - -def LDURHHi - : LoadUnscaled<0b01, 0, 0b01, GPR32, am_unscaled16, "ldurh", - [(set GPR32:$Rt, (zextloadi16 am_unscaled16:$addr))]>; -def LDURBBi - : LoadUnscaled<0b00, 0, 0b01, GPR32, am_unscaled8, "ldurb", - [(set GPR32:$Rt, (zextloadi8 am_unscaled8:$addr))]>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - def : Pat<(v2f32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; - def : Pat<(v8i8 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; - def : Pat<(v4i16 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; - def : Pat<(v2i32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -} -def : Pat<(v1f64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -def : Pat<(v1i64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - def : Pat<(v4f32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; - def : Pat<(v2f64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; - def : Pat<(v16i8 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; - def : Pat<(v8i16 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; - def : Pat<(v4i32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; - def : Pat<(v2i64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; - def : Pat<(v2f64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; -} - -// anyext -> zext -def : Pat<(i32 (extloadi16 am_unscaled16:$addr)), (LDURHHi am_unscaled16:$addr)>; -def : Pat<(i32 (extloadi8 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>; -def : Pat<(i32 (extloadi1 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>; -def : Pat<(i64 (extloadi32 am_unscaled32:$addr)), - (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>; -def : Pat<(i64 (extloadi16 am_unscaled16:$addr)), - (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>; -def : Pat<(i64 (extloadi8 am_unscaled8:$addr)), - 
(SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>; -def : Pat<(i64 (extloadi1 am_unscaled8:$addr)), - (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>; -// unscaled zext -def : Pat<(i32 (zextloadi16 am_unscaled16:$addr)), - (LDURHHi am_unscaled16:$addr)>; -def : Pat<(i32 (zextloadi8 am_unscaled8:$addr)), - (LDURBBi am_unscaled8:$addr)>; -def : Pat<(i32 (zextloadi1 am_unscaled8:$addr)), - (LDURBBi am_unscaled8:$addr)>; -def : Pat<(i64 (zextloadi32 am_unscaled32:$addr)), - (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>; -def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)), - (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>; -def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)), - (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>; -def : Pat<(i64 (zextloadi1 am_unscaled8:$addr)), - (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>; - - -//--- -// LDR mnemonics fall back to LDUR for negative or unaligned offsets. - -// Define new assembler match classes as we want to only match these when -// the don't otherwise match the scaled addressing mode for LDR/STR. Don't -// associate a DiagnosticType either, as we want the diagnostic for the -// canonical form (the scaled operand) to take precedence. -def MemoryUnscaledFB8Operand : AsmOperandClass { - let Name = "MemoryUnscaledFB8"; - let RenderMethod = "addMemoryUnscaledOperands"; -} -def MemoryUnscaledFB16Operand : AsmOperandClass { - let Name = "MemoryUnscaledFB16"; - let RenderMethod = "addMemoryUnscaledOperands"; -} -def MemoryUnscaledFB32Operand : AsmOperandClass { - let Name = "MemoryUnscaledFB32"; - let RenderMethod = "addMemoryUnscaledOperands"; -} -def MemoryUnscaledFB64Operand : AsmOperandClass { - let Name = "MemoryUnscaledFB64"; - let RenderMethod = "addMemoryUnscaledOperands"; -} -def MemoryUnscaledFB128Operand : AsmOperandClass { - let Name = "MemoryUnscaledFB128"; - let RenderMethod = "addMemoryUnscaledOperands"; -} -def am_unscaled_fb8 : Operand { - let ParserMatchClass = MemoryUnscaledFB8Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def am_unscaled_fb16 : Operand { - let ParserMatchClass = MemoryUnscaledFB16Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def am_unscaled_fb32 : Operand { - let ParserMatchClass = MemoryUnscaledFB32Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def am_unscaled_fb64 : Operand { - let ParserMatchClass = MemoryUnscaledFB64Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def am_unscaled_fb128 : Operand { - let ParserMatchClass = MemoryUnscaledFB128Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def : InstAlias<"ldr $Rt, $addr", (LDURXi GPR64:$Rt, am_unscaled_fb64:$addr), 0>; -def : InstAlias<"ldr $Rt, $addr", (LDURWi GPR32:$Rt, am_unscaled_fb32:$addr), 0>; -def : InstAlias<"ldr $Rt, $addr", (LDURBi FPR8:$Rt, am_unscaled_fb8:$addr), 0>; -def : InstAlias<"ldr $Rt, $addr", (LDURHi FPR16:$Rt, am_unscaled_fb16:$addr), 0>; -def : InstAlias<"ldr $Rt, $addr", (LDURSi FPR32:$Rt, am_unscaled_fb32:$addr), 0>; -def : InstAlias<"ldr $Rt, $addr", (LDURDi FPR64:$Rt, am_unscaled_fb64:$addr), 0>; -def : InstAlias<"ldr $Rt, $addr", (LDURQi FPR128:$Rt, am_unscaled_fb128:$addr), 0>; - -// zextload -> i64 -def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)), - (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>; -def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)), - (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), 
sub_32)>; - -// load sign-extended half-word -def LDURSHWi - : LoadUnscaled<0b01, 0, 0b11, GPR32, am_unscaled16, "ldursh", - [(set GPR32:$Rt, (sextloadi16 am_unscaled16:$addr))]>; -def LDURSHXi - : LoadUnscaled<0b01, 0, 0b10, GPR64, am_unscaled16, "ldursh", - [(set GPR64:$Rt, (sextloadi16 am_unscaled16:$addr))]>; - -// load sign-extended byte -def LDURSBWi - : LoadUnscaled<0b00, 0, 0b11, GPR32, am_unscaled8, "ldursb", - [(set GPR32:$Rt, (sextloadi8 am_unscaled8:$addr))]>; -def LDURSBXi - : LoadUnscaled<0b00, 0, 0b10, GPR64, am_unscaled8, "ldursb", - [(set GPR64:$Rt, (sextloadi8 am_unscaled8:$addr))]>; - -// load sign-extended word -def LDURSWi - : LoadUnscaled<0b10, 0, 0b10, GPR64, am_unscaled32, "ldursw", - [(set GPR64:$Rt, (sextloadi32 am_unscaled32:$addr))]>; - -// zero and sign extending aliases from generic LDR* mnemonics to LDUR*. -def : InstAlias<"ldrb $Rt, $addr", - (LDURBBi GPR32:$Rt, am_unscaled_fb8:$addr), 0>; -def : InstAlias<"ldrh $Rt, $addr", - (LDURHHi GPR32:$Rt, am_unscaled_fb16:$addr), 0>; -def : InstAlias<"ldrsb $Rt, $addr", - (LDURSBWi GPR32:$Rt, am_unscaled_fb8:$addr), 0>; -def : InstAlias<"ldrsb $Rt, $addr", - (LDURSBXi GPR64:$Rt, am_unscaled_fb8:$addr), 0>; -def : InstAlias<"ldrsh $Rt, $addr", - (LDURSHWi GPR32:$Rt, am_unscaled_fb16:$addr), 0>; -def : InstAlias<"ldrsh $Rt, $addr", - (LDURSHXi GPR64:$Rt, am_unscaled_fb16:$addr), 0>; -def : InstAlias<"ldrsw $Rt, $addr", - (LDURSWi GPR64:$Rt, am_unscaled_fb32:$addr), 0>; - -// Pre-fetch. -def PRFUMi : PrefetchUnscaled<0b11, 0, 0b10, "prfum", - [(ARM64Prefetch imm:$Rt, am_unscaled64:$addr)]>; - -//--- -// (unscaled immediate, unprivileged) -def LDTRXi : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; -def LDTRWi : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; - -def LDTRHi : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; -def LDTRBi : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; - -// load sign-extended half-word -def LDTRSHWi : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; -def LDTRSHXi : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; - -// load sign-extended byte -def LDTRSBWi : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; -def LDTRSBXi : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; - -// load sign-extended word -def LDTRSWi : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; - -//--- -// (immediate pre-indexed) -def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load sign-extended word -def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -// ISel pseudos and patterns. See expanded comment on LoadPreIdxPseudo. 
-def LDRQpre_isel : LoadPreIdxPseudo; -def LDRDpre_isel : LoadPreIdxPseudo; -def LDRSpre_isel : LoadPreIdxPseudo; -def LDRXpre_isel : LoadPreIdxPseudo; -def LDRWpre_isel : LoadPreIdxPseudo; -def LDRHHpre_isel : LoadPreIdxPseudo; -def LDRBBpre_isel : LoadPreIdxPseudo; - -def LDRSWpre_isel : LoadPreIdxPseudo; -def LDRSHWpre_isel : LoadPreIdxPseudo; -def LDRSHXpre_isel : LoadPreIdxPseudo; -def LDRSBWpre_isel : LoadPreIdxPseudo; -def LDRSBXpre_isel : LoadPreIdxPseudo; - -//--- -// (immediate post-indexed) -def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load sign-extended word -def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -// ISel pseudos and patterns. See expanded comment on LoadPostIdxPseudo. -def LDRQpost_isel : LoadPostIdxPseudo; -def LDRDpost_isel : LoadPostIdxPseudo; -def LDRSpost_isel : LoadPostIdxPseudo; -def LDRXpost_isel : LoadPostIdxPseudo; -def LDRWpost_isel : LoadPostIdxPseudo; -def LDRHHpost_isel : LoadPostIdxPseudo; -def LDRBBpost_isel : LoadPostIdxPseudo; - -def LDRSWpost_isel : LoadPostIdxPseudo; -def LDRSHWpost_isel : LoadPostIdxPseudo; -def LDRSHXpost_isel : LoadPostIdxPseudo; -def LDRSBWpost_isel : LoadPostIdxPseudo; -def LDRSBXpost_isel : LoadPostIdxPseudo; - -//===----------------------------------------------------------------------===// -// Store instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -// FIXME: Use dedicated range-checked addressing mode operand here. 
-def STPWi : StorePairOffset<0b00, 0, GPR32, am_indexed32simm7, "stp">; -def STPXi : StorePairOffset<0b10, 0, GPR64, am_indexed64simm7, "stp">; -def STPSi : StorePairOffset<0b00, 1, FPR32, am_indexed32simm7, "stp">; -def STPDi : StorePairOffset<0b01, 1, FPR64, am_indexed64simm7, "stp">; -def STPQi : StorePairOffset<0b10, 1, FPR128, am_indexed128simm7, "stp">; - -// Pair (pre-indexed) -def STPWpre : StorePairPreIdx<0b00, 0, GPR32, am_indexed32simm7_wb, "stp">; -def STPXpre : StorePairPreIdx<0b10, 0, GPR64, am_indexed64simm7_wb, "stp">; -def STPSpre : StorePairPreIdx<0b00, 1, FPR32, am_indexed32simm7_wb, "stp">; -def STPDpre : StorePairPreIdx<0b01, 1, FPR64, am_indexed64simm7_wb, "stp">; -def STPQpre : StorePairPreIdx<0b10, 1, FPR128, am_indexed128simm7_wb, "stp">; - -// Pair (pre-indexed) -def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; -def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; -def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; -def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; -def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (no allocate) -def STNPWi : StorePairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "stnp">; -def STNPXi : StorePairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "stnp">; -def STNPSi : StorePairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "stnp">; -def STNPDi : StorePairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "stnp">; -def STNPQi : StorePairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "stnp">; - -//--- -// (Register offset) - -let AddedComplexity = 10 in { - -// Integer -def STRHHro : Store16RO<0b01, 0, 0b00, GPR32, "strh", - [(truncstorei16 GPR32:$Rt, ro_indexed16:$addr)]>; -def STRBBro : Store8RO<0b00, 0, 0b00, GPR32, "strb", - [(truncstorei8 GPR32:$Rt, ro_indexed8:$addr)]>; -def STRWro : Store32RO<0b10, 0, 0b00, GPR32, "str", - [(store GPR32:$Rt, ro_indexed32:$addr)]>; -def STRXro : Store64RO<0b11, 0, 0b00, GPR64, "str", - [(store GPR64:$Rt, ro_indexed64:$addr)]>; - -// truncstore i64 -def : Pat<(truncstorei8 GPR64:$Rt, ro_indexed8:$addr), - (STRBBro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed8:$addr)>; -def : Pat<(truncstorei16 GPR64:$Rt, ro_indexed16:$addr), - (STRHHro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed16:$addr)>; -def : Pat<(truncstorei32 GPR64:$Rt, ro_indexed32:$addr), - (STRWro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed32:$addr)>; - - -// Floating-point -def STRBro : Store8RO<0b00, 1, 0b00, FPR8, "str", - [(store FPR8:$Rt, ro_indexed8:$addr)]>; -def STRHro : Store16RO<0b01, 1, 0b00, FPR16, "str", - [(store (f16 FPR16:$Rt), ro_indexed16:$addr)]>; -def STRSro : Store32RO<0b10, 1, 0b00, FPR32, "str", - [(store (f32 FPR32:$Rt), ro_indexed32:$addr)]>; -def STRDro : Store64RO<0b11, 1, 0b00, FPR64, "str", - [(store (f64 FPR64:$Rt), ro_indexed64:$addr)]>; -def STRQro : Store128RO<0b00, 1, 0b10, FPR128, "str", []> { - let mayStore = 1; -} - -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v2f32 FPR64:$Rn), ro_indexed64:$addr), - (STRDro FPR64:$Rn, ro_indexed64:$addr)>; - def : Pat<(store (v8i8 FPR64:$Rn), ro_indexed64:$addr), - (STRDro FPR64:$Rn, ro_indexed64:$addr)>; - def : Pat<(store (v4i16 FPR64:$Rn), ro_indexed64:$addr), - (STRDro FPR64:$Rn, ro_indexed64:$addr)>; - def : Pat<(store (v2i32 FPR64:$Rn), ro_indexed64:$addr), - (STRDro FPR64:$Rn, ro_indexed64:$addr)>; -} -def : Pat<(store (v1f64 FPR64:$Rn), ro_indexed64:$addr), - (STRDro FPR64:$Rn, ro_indexed64:$addr)>; -def : Pat<(store (v1i64 FPR64:$Rn), ro_indexed64:$addr), - (STRDro FPR64:$Rn, ro_indexed64:$addr)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v4f32 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - def : Pat<(store (v2f64 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - def : Pat<(store (v16i8 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - def : Pat<(store (v8i16 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - def : Pat<(store (v4i32 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - def : Pat<(store (v2i64 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -} -def : Pat<(store (f128 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - -//--- -// (unsigned immediate) -def STRXui : StoreUI<0b11, 0, 0b00, GPR64, am_indexed64, "str", - [(store GPR64:$Rt, am_indexed64:$addr)]>; -def STRWui : StoreUI<0b10, 0, 0b00, GPR32, am_indexed32, "str", - [(store GPR32:$Rt, am_indexed32:$addr)]>; -def STRBui : StoreUI<0b00, 1, 0b00, FPR8, am_indexed8, "str", - [(store FPR8:$Rt, am_indexed8:$addr)]>; -def STRHui : StoreUI<0b01, 1, 0b00, FPR16, am_indexed16, "str", - [(store (f16 FPR16:$Rt), am_indexed16:$addr)]>; -def STRSui : StoreUI<0b10, 1, 0b00, FPR32, am_indexed32, "str", - [(store (f32 FPR32:$Rt), am_indexed32:$addr)]>; -def STRDui : StoreUI<0b11, 1, 0b00, FPR64, am_indexed64, "str", - [(store (f64 FPR64:$Rt), am_indexed64:$addr)]>; -def STRQui : StoreUI<0b00, 1, 0b10, FPR128, am_indexed128, "str", []> { - let mayStore = 1; -} - -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v2f32 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; - def : Pat<(store (v8i8 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; - def : Pat<(store (v4i16 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; - def : Pat<(store (v2i32 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -} -def : Pat<(store (v1f64 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -def : Pat<(store (v1i64 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v4f32 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - def : Pat<(store (v2f64 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - def : Pat<(store (v16i8 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - def : Pat<(store (v8i16 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - def : Pat<(store (v4i32 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - def : Pat<(store (v2i64 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -} -def : Pat<(store (f128 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - -def STRHHui : StoreUI<0b01, 0, 0b00, GPR32, am_indexed16, "strh", - [(truncstorei16 GPR32:$Rt, am_indexed16:$addr)]>; -def STRBBui : StoreUI<0b00, 0, 0b00, GPR32, am_indexed8, "strb", - [(truncstorei8 GPR32:$Rt, am_indexed8:$addr)]>; - -// truncstore i64 -def : Pat<(truncstorei32 GPR64:$Rt, am_indexed32:$addr), - (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed32:$addr)>; -def : Pat<(truncstorei16 GPR64:$Rt, am_indexed16:$addr), - (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed16:$addr)>; -def : Pat<(truncstorei8 GPR64:$Rt, am_indexed8:$addr), - (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed8:$addr)>; - -} // AddedComplexity = 10 - -//--- -// (unscaled immediate) -def STURXi : StoreUnscaled<0b11, 0, 0b00, GPR64, am_unscaled64, "stur", - [(store GPR64:$Rt, am_unscaled64:$addr)]>; -def STURWi : StoreUnscaled<0b10, 0, 0b00, GPR32, am_unscaled32, "stur", - [(store GPR32:$Rt, am_unscaled32:$addr)]>; -def STURBi : StoreUnscaled<0b00, 1, 0b00, FPR8, am_unscaled8, "stur", - [(store FPR8:$Rt, am_unscaled8:$addr)]>; -def STURHi : StoreUnscaled<0b01, 1, 0b00, FPR16, am_unscaled16, "stur", - [(store (f16 FPR16:$Rt), am_unscaled16:$addr)]>; -def STURSi : StoreUnscaled<0b10, 1, 0b00, FPR32, am_unscaled32, "stur", - [(store (f32 FPR32:$Rt), am_unscaled32:$addr)]>; -def STURDi : StoreUnscaled<0b11, 1, 0b00, FPR64, am_unscaled64, "stur", - [(store (f64 FPR64:$Rt), am_unscaled64:$addr)]>; -def STURQi : StoreUnscaled<0b00, 1, 0b10, FPR128, am_unscaled128, "stur", - [(store (f128 FPR128:$Rt), am_unscaled128:$addr)]>; -def STURHHi : StoreUnscaled<0b01, 0, 0b00, GPR32, am_unscaled16, "sturh", - [(truncstorei16 GPR32:$Rt, am_unscaled16:$addr)]>; -def STURBBi : StoreUnscaled<0b00, 0, 0b00, GPR32, am_unscaled8, "sturb", - [(truncstorei8 GPR32:$Rt, am_unscaled8:$addr)]>; - -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v2f32 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; - def : Pat<(store (v8i8 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; - def : Pat<(store (v4i16 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; - def : Pat<(store (v2i32 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -} -def : Pat<(store (v1f64 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -def : Pat<(store (v1i64 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v4f32 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - def : Pat<(store (v2f64 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - def : Pat<(store (v16i8 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - def : Pat<(store (v8i16 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - def : Pat<(store (v4i32 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - def : Pat<(store (v2i64 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - def : Pat<(store (v2f64 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -} - -// unscaled i64 truncating stores -def : Pat<(truncstorei32 GPR64:$Rt, am_unscaled32:$addr), - (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled32:$addr)>; -def : Pat<(truncstorei16 GPR64:$Rt, am_unscaled16:$addr), - (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled16:$addr)>; -def : Pat<(truncstorei8 GPR64:$Rt, am_unscaled8:$addr), - (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled8:$addr)>; - -//--- -// STR mnemonics fall back to STUR for negative or unaligned offsets. -def : InstAlias<"str $Rt, $addr", - (STURXi GPR64:$Rt, am_unscaled_fb64:$addr), 0>; -def : InstAlias<"str $Rt, $addr", - (STURWi GPR32:$Rt, am_unscaled_fb32:$addr), 0>; -def : InstAlias<"str $Rt, $addr", - (STURBi FPR8:$Rt, am_unscaled_fb8:$addr), 0>; -def : InstAlias<"str $Rt, $addr", - (STURHi FPR16:$Rt, am_unscaled_fb16:$addr), 0>; -def : InstAlias<"str $Rt, $addr", - (STURSi FPR32:$Rt, am_unscaled_fb32:$addr), 0>; -def : InstAlias<"str $Rt, $addr", - (STURDi FPR64:$Rt, am_unscaled_fb64:$addr), 0>; -def : InstAlias<"str $Rt, $addr", - (STURQi FPR128:$Rt, am_unscaled_fb128:$addr), 0>; - -def : InstAlias<"strb $Rt, $addr", - (STURBBi GPR32:$Rt, am_unscaled_fb8:$addr), 0>; -def : InstAlias<"strh $Rt, $addr", - (STURHHi GPR32:$Rt, am_unscaled_fb16:$addr), 0>; - -//--- -// (unscaled immediate, unprivileged) -def STTRWi : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; -def STTRXi : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; - -def STTRHi : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; -def STTRBi : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; - -//--- -// (immediate pre-indexed) -def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str">; -def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str">; -def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str">; -def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str">; -def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str">; -def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str">; -def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str">; - -def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb">; -def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh">; - -// ISel pseudos and patterns. See expanded comment on StorePreIdxPseudo. 
-defm STRQpre : StorePreIdxPseudo; -defm STRDpre : StorePreIdxPseudo; -defm STRSpre : StorePreIdxPseudo; -defm STRXpre : StorePreIdxPseudo; -defm STRWpre : StorePreIdxPseudo; -defm STRHHpre : StorePreIdxPseudo; -defm STRBBpre : StorePreIdxPseudo; -// truncstore i64 -def : Pat<(pre_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRWpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRHHpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRBBpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; - -def : Pat<(pre_store (v8i8 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpre_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v4i16 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpre_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v2i32 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpre_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v2f32 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpre_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v1i64 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpre_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v1f64 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpre_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; - -def : Pat<(pre_store (v16i8 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpre_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v8i16 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpre_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v4i32 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpre_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v4f32 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpre_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v2i64 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpre_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(pre_store (v2f64 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpre_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; - -//--- -// (immediate post-indexed) -def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str">; -def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str">; -def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str">; -def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str">; -def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str">; -def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str">; -def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str">; - -def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb">; -def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh">; - -// ISel pseudos and patterns. See expanded comment on StorePostIdxPseudo. 
-defm STRQpost : StorePostIdxPseudo; -defm STRDpost : StorePostIdxPseudo; -defm STRSpost : StorePostIdxPseudo; -defm STRXpost : StorePostIdxPseudo; -defm STRWpost : StorePostIdxPseudo; -defm STRHHpost : StorePostIdxPseudo; -defm STRBBpost : StorePostIdxPseudo; -// truncstore i64 -def : Pat<(post_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRWpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(post_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRHHpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(post_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRBBpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; - -def : Pat<(post_store (v8i8 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpost_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v4i16 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpost_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v2i32 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpost_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v2f32 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpost_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v1i64 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpost_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v1f64 FPR64:$Rt), am_noindex:$addr, simm9:$off), - (STRDpost_isel FPR64:$Rt, am_noindex:$addr, simm9:$off)>; - -def : Pat<(post_store (v16i8 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpost_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v8i16 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpost_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v4i32 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpost_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v4f32 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpost_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v2i64 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpost_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; -def : Pat<(post_store (v2f64 FPR128:$Rt), am_noindex:$addr, simm9:$off), - (STRQpost_isel FPR128:$Rt, am_noindex:$addr, simm9:$off)>; - -//===----------------------------------------------------------------------===// -// Load/store exclusive instructions. 
-//===----------------------------------------------------------------------===//
-
-def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
-def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
-def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
-def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
-
-def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
-def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
-def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
-def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
-
-def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
-def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
-def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
-def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
-
-def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
-def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
-def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
-def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
-
-def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
-def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
-def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
-def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
-
-def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
-def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
-def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
-def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
-
-def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
-def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
-
-def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
-def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
-
-def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
-def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
-
-def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
-def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
-
-//===----------------------------------------------------------------------===//
-// Scaled floating point to integer conversion instructions.
-//===----------------------------------------------------------------------===//
-
-defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_arm64_neon_fcvtas>;
-defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_arm64_neon_fcvtau>;
-defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_arm64_neon_fcvtms>;
-defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_arm64_neon_fcvtmu>;
-defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_arm64_neon_fcvtns>;
-defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_arm64_neon_fcvtnu>;
-defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_arm64_neon_fcvtps>;
-defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_arm64_neon_fcvtpu>;
-defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>;
-defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>;
-}
-
-//===----------------------------------------------------------------------===//
-// Scaled integer to floating point conversion instructions.
-//===----------------------------------------------------------------------===//
-
-defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
-defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
-
-//===----------------------------------------------------------------------===//
-// Unscaled integer to floating point conversion instruction.
-//===----------------------------------------------------------------------===//
-
-defm FMOV : UnscaledConversion<"fmov">;
-
-def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
-def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
-
-//===----------------------------------------------------------------------===//
-// Floating point conversion instruction.
-//===----------------------------------------------------------------------===//
-
-defm FCVT : FPConversion<"fcvt">;
-
-def : Pat<(f32_to_f16 FPR32:$Rn),
-          (i32 (COPY_TO_REGCLASS
-                   (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
-                   GPR32))>;
-
-def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
-                          [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
-
-//===----------------------------------------------------------------------===//
-// Floating point single operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
-defm FMOV : SingleOperandFPData<0b0000, "fmov">;
-defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
-defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
-defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
-defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
-defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_arm64_neon_frintn>;
-defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
-
-def : Pat<(v1f64 (int_arm64_neon_frintn (v1f64 FPR64:$Rn))),
-          (FRINTNDr FPR64:$Rn)>;
-
-// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
-// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
-//
-// TODO: We should really model the FPSR flags correctly. This is really ugly.
-let hasSideEffects = 1 in {
-defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
-}
-
-defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
-
-let SchedRW = [WriteFDiv] in {
-defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating point two operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
-let SchedRW = [WriteFDiv] in {
-defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
-}
-defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAX : TwoOperandFPData<0b0100, "fmax", ARM64fmax>;
-defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_arm64_neon_fminnm>;
-defm FMIN : TwoOperandFPData<0b0101, "fmin", ARM64fmin>;
-let SchedRW = [WriteFMul] in {
-defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
-defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
-}
-defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
-
-def : Pat<(v1f64 (ARM64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (ARM64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_arm64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_arm64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point three operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
-defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
-     TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
-defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
-     TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
-defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
-     TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-
-// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
-// the NEON variant.
-def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
-          (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
-          (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
-// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
-// "(-a) + b*(-c)".
-def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
-          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
-          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
-def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
-          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
-          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point comparison instructions.
-//===----------------------------------------------------------------------===// - -defm FCMPE : FPComparison<1, "fcmpe">; -defm FCMP : FPComparison<0, "fcmp", ARM64fcmp>; - -//===----------------------------------------------------------------------===// -// Floating point conditional comparison instructions. -//===----------------------------------------------------------------------===// - -defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; - -//===----------------------------------------------------------------------===// -// Floating point conditional select instruction. -//===----------------------------------------------------------------------===// - -defm FCSEL : FPCondSelect<"fcsel">; - -// CSEL instructions providing f128 types need to be handled by a -// pseudo-instruction since the eventual code will need to introduce basic -// blocks and control flow. -def F128CSEL : Pseudo<(outs FPR128:$Rd), - (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), - [(set (f128 FPR128:$Rd), - (ARM64csel FPR128:$Rn, FPR128:$Rm, - (i32 imm:$cond), NZCV))]> { - let Uses = [NZCV]; - let usesCustomInserter = 1; -} - - -//===----------------------------------------------------------------------===// -// Floating point immediate move. -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1 in { -defm FMOV : FPMoveImmediate<"fmov">; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD two vector instructions. -//===----------------------------------------------------------------------===// - -defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_arm64_neon_abs>; -defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_arm64_neon_cls>; -defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; -defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>; -defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", ARM64cmgez>; -defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", ARM64cmgtz>; -defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", ARM64cmlez>; -defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", ARM64cmltz>; -defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; -defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; - -defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>; -defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", ARM64fcmgez>; -defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>; -defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", ARM64fcmlez>; -defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", ARM64fcmltz>; -defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_arm64_neon_fcvtas>; -defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_arm64_neon_fcvtau>; -defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; -def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (v4i16 V64:$Rn))), - (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), - (i64 4)))), - (FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), - (i64 2))))), - (FCVTLv4i32 V128:$Rn)>; - -defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_arm64_neon_fcvtms>; -defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_arm64_neon_fcvtmu>; -defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_arm64_neon_fcvtns>; -defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, 
"fcvtnu",int_arm64_neon_fcvtnu>; -defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; -def : Pat<(v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn))), - (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, - (v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), - (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), - (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_arm64_neon_fcvtps>; -defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_arm64_neon_fcvtpu>; -defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", - int_arm64_neon_fcvtxn>; -defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; -defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_arm64_neon_fcvtzu>; -} -defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; -defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_arm64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; -defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; -defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_arm64_neon_frintn>; -defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; -defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; -defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; -defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_arm64_neon_frsqrte>; -defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; -defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", - UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; -// Aliases for MVN -> NOT. 
-def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", - (NOTv8i8 V64:$Vd, V64:$Vn)>; -def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", - (NOTv16i8 V128:$Vd, V128:$Vn)>; - -def : Pat<(ARM64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; -def : Pat<(ARM64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; -def : Pat<(ARM64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; -def : Pat<(ARM64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; -def : Pat<(ARM64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; -def : Pat<(ARM64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; -def : Pat<(ARM64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; - -def : Pat<(ARM64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - -def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - -defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_arm64_neon_rbit>; -defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", ARM64rev16>; -defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", ARM64rev32>; -defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", ARM64rev64>; -defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", - BinOpFrag<(add node:$LHS, (int_arm64_neon_saddlp node:$RHS))> >; -defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_arm64_neon_saddlp>; -defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; -defm SHLL : SIMDVectorLShiftLongBySizeBHS; -defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_arm64_neon_sqabs>; -defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_arm64_neon_sqneg>; -defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_arm64_neon_sqxtn>; -defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_arm64_neon_sqxtun>; -defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_arm64_neon_suqadd>; -defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", - BinOpFrag<(add node:$LHS, (int_arm64_neon_uaddlp node:$RHS))> >; -defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", - int_arm64_neon_uaddlp>; -defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; -defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_arm64_neon_uqxtn>; -defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_arm64_neon_urecpe>; -defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_arm64_neon_ursqrte>; -defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_arm64_neon_usqadd>; -defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; - -def : Pat<(v2f32 (ARM64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (ARM64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; - -// Patterns for vector long shift (by element width). These need to match all -// three of zext, sext and anyext so it's easier to pull the patterns out of the -// definition. 
-multiclass SIMDVectorLShiftLongBySizeBHSPats { - def : Pat<(ARM64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), - (SHLLv8i8 V64:$Rn)>; - def : Pat<(ARM64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), - (SHLLv16i8 V128:$Rn)>; - def : Pat<(ARM64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), - (SHLLv4i16 V64:$Rn)>; - def : Pat<(ARM64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), - (SHLLv8i16 V128:$Rn)>; - def : Pat<(ARM64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), - (SHLLv2i32 V64:$Rn)>; - def : Pat<(ARM64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), - (SHLLv4i32 V128:$Rn)>; -} - -defm : SIMDVectorLShiftLongBySizeBHSPats; -defm : SIMDVectorLShiftLongBySizeBHSPats; -defm : SIMDVectorLShiftLongBySizeBHSPats; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three vector instructions. -//===----------------------------------------------------------------------===// - -defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; -defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_arm64_neon_addp>; -defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", ARM64cmeq>; -defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", ARM64cmge>; -defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", ARM64cmgt>; -defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", ARM64cmhi>; -defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", ARM64cmhs>; -defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", ARM64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_arm64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_arm64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_arm64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_arm64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_arm64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_arm64_neon_fmaxnm>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_arm64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", ARM64fmax>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_arm64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_arm64_neon_fminnm>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_arm64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", ARM64fmin>; - -// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the -// instruction expects the addend first, while the fma intrinsic puts it last. -defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; - -// The following def pats catch the case where the LHS of an FMA is negated. -// The TriOpFrag above catches the case where the middle operand is negated. 
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), - (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; - -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_arm64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_arm64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; -defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; -defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_arm64_neon_pmul>; -defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (int_arm64_neon_sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_arm64_neon_sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_arm64_neon_shadd>; -defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_arm64_neon_shsub>; -defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_arm64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_arm64_neon_smax>; -defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_arm64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_arm64_neon_smin>; -defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_arm64_neon_sqadd>; -defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_arm64_neon_sqrdmulh>; -defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_arm64_neon_sqrshl>; -defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_arm64_neon_sqshl>; -defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_arm64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_arm64_neon_srhadd>; -defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_arm64_neon_srshl>; -defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_arm64_neon_sshl>; -defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; -defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (int_arm64_neon_uabd node:$MHS, node:$RHS))> >; -defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_arm64_neon_uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_arm64_neon_uhadd>; -defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_arm64_neon_uhsub>; -defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_arm64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_arm64_neon_umax>; -defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_arm64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_arm64_neon_umin>; -defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_arm64_neon_uqadd>; -defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_arm64_neon_uqrshl>; -defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_arm64_neon_uqshl>; -defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_arm64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", 
int_arm64_neon_urhadd>; -defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_arm64_neon_urshl>; -defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_arm64_neon_ushl>; - -defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; -defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", - BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", ARM64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; -defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; -defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", - BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; -defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(ARM64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(ARM64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; -def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; -def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; -def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; - -def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; -def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; -def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; -def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; - -def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # - "|cmls.8b\t$dst, $src1, $src2}", - (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # - "|cmls.16b\t$dst, $src1, $src2}", - (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # - "|cmls.4h\t$dst, $src1, $src2}", - (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # - "|cmls.8h\t$dst, $src1, $src2}", - (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # - "|cmls.2s\t$dst, $src1, $src2}", - (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # - "|cmls.4s\t$dst, $src1, $src2}", - (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # - "|cmls.2d\t$dst, $src1, $src2}", - (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # - 
"|cmlo.8b\t$dst, $src1, $src2}", - (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # - "|cmlo.16b\t$dst, $src1, $src2}", - (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # - "|cmlo.4h\t$dst, $src1, $src2}", - (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # - "|cmlo.8h\t$dst, $src1, $src2}", - (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # - "|cmlo.2s\t$dst, $src1, $src2}", - (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # - "|cmlo.4s\t$dst, $src1, $src2}", - (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # - "|cmlo.2d\t$dst, $src1, $src2}", - (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # - "|cmle.8b\t$dst, $src1, $src2}", - (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # - "|cmle.16b\t$dst, $src1, $src2}", - (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # - "|cmle.4h\t$dst, $src1, $src2}", - (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # - "|cmle.8h\t$dst, $src1, $src2}", - (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # - "|cmle.2s\t$dst, $src1, $src2}", - (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # - "|cmle.4s\t$dst, $src1, $src2}", - (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # - "|cmle.2d\t$dst, $src1, $src2}", - (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # - "|cmlt.8b\t$dst, $src1, $src2}", - (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # - "|cmlt.16b\t$dst, $src1, $src2}", - (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # - "|cmlt.4h\t$dst, $src1, $src2}", - (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # - "|cmlt.8h\t$dst, $src1, $src2}", - (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # - "|cmlt.2s\t$dst, $src1, $src2}", - (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # - "|cmlt.4s\t$dst, $src1, $src2}", - (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # - "|cmlt.2d\t$dst, $src1, $src2}", - (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # - "|fcmle.2s\t$dst, $src1, $src2}", - (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # - "|fcmle.4s\t$dst, $src1, $src2}", - (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # - "|fcmle.2d\t$dst, $src1, $src2}", - (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # - "|fcmlt.2s\t$dst, $src1, $src2}", - (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : 
InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # - "|fcmlt.4s\t$dst, $src1, $src2}", - (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # - "|fcmlt.2d\t$dst, $src1, $src2}", - (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # - "|facle.2s\t$dst, $src1, $src2}", - (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # - "|facle.4s\t$dst, $src1, $src2}", - (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # - "|facle.2d\t$dst, $src1, $src2}", - (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # - "|faclt.2s\t$dst, $src1, $src2}", - (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # - "|faclt.4s\t$dst, $src1, $src2}", - (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # - "|faclt.2d\t$dst, $src1, $src2}", - (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three scalar instructions. -//===----------------------------------------------------------------------===// - -defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; -defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", ARM64cmeq>; -defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", ARM64cmge>; -defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", ARM64cmgt>; -defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", ARM64cmhi>; -defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", ARM64cmhs>; -defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", ARM64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_arm64_sisd_fabd>; -def : Pat<(v1f64 (int_arm64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", - int_arm64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", - int_arm64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_arm64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_arm64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_arm64_neon_frsqrts>; -defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_arm64_neon_sqadd>; -defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_arm64_neon_sqrshl>; -defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_arm64_neon_sqshl>; -defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_arm64_neon_sqsub>; -defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_arm64_neon_srshl>; -defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_arm64_neon_sshl>; -defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; -defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_arm64_neon_uqadd>; -defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_arm64_neon_uqrshl>; -defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_arm64_neon_uqshl>; 
-defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_arm64_neon_uqsub>; -defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_arm64_neon_urshl>; -defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_arm64_neon_ushl>; - -def : InstAlias<"cmls $dst, $src1, $src2", - (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmle $dst, $src1, $src2", - (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmlo $dst, $src1, $src2", - (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmlt $dst, $src1, $src2", - (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"fcmle $dst, $src1, $src2", - (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"fcmle $dst, $src1, $src2", - (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"fcmlt $dst, $src1, $src2", - (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"fcmlt $dst, $src1, $src2", - (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"facle $dst, $src1, $src2", - (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"facle $dst, $src1, $src2", - (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"faclt $dst, $src1, $src2", - (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"faclt $dst, $src1, $src2", - (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three scalar instructions (mixed operands). -//===----------------------------------------------------------------------===// -defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", - int_arm64_neon_sqdmulls_scalar>; -defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; -defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; - -def : Pat<(i64 (int_arm64_neon_sqadd (i64 FPR64:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), - (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; -def : Pat<(i64 (int_arm64_neon_sqsub (i64 FPR64:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), - (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD two scalar instructions. 
-//===----------------------------------------------------------------------===// - -defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_arm64_neon_abs>; -defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", ARM64cmeqz>; -defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", ARM64cmgez>; -defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", ARM64cmgtz>; -defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", ARM64cmlez>; -defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", ARM64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", ARM64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", ARM64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", ARM64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; -def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; -defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", - UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", ARM64sitof>; -defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_arm64_neon_sqabs>; -defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_arm64_neon_sqneg>; -defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_arm64_neon_scalar_sqxtn>; -defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_arm64_neon_scalar_sqxtun>; -defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", - int_arm64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", ARM64uitof>; -defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_arm64_neon_scalar_uqxtn>; -defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", - int_arm64_neon_usqadd>; - -def : Pat<(ARM64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; - -def : Pat<(v1i64 (int_arm64_neon_fcvtas (v1f64 FPR64:$Rn))), - (FCVTASv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtau (v1f64 FPR64:$Rn))), - (FCVTAUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtms (v1f64 FPR64:$Rn))), - (FCVTMSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtmu (v1f64 FPR64:$Rn))), - (FCVTMUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtns (v1f64 FPR64:$Rn))), - (FCVTNSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtnu (v1f64 FPR64:$Rn))), - (FCVTNUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtps (v1f64 FPR64:$Rn))), - (FCVTPSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtpu (v1f64 FPR64:$Rn))), - (FCVTPUv1i64 FPR64:$Rn)>; - -def : Pat<(f32 (int_arm64_neon_frecpe (f32 FPR32:$Rn))), - (FRECPEv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frecpe (f64 FPR64:$Rn))), - (FRECPEv1i64 FPR64:$Rn)>; -def : Pat<(v1f64 (int_arm64_neon_frecpe (v1f64 FPR64:$Rn))), - (FRECPEv1i64 FPR64:$Rn)>; - -def : Pat<(f32 
(int_arm64_neon_frecpx (f32 FPR32:$Rn))), - (FRECPXv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frecpx (f64 FPR64:$Rn))), - (FRECPXv1i64 FPR64:$Rn)>; - -def : Pat<(f32 (int_arm64_neon_frsqrte (f32 FPR32:$Rn))), - (FRSQRTEv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frsqrte (f64 FPR64:$Rn))), - (FRSQRTEv1i64 FPR64:$Rn)>; -def : Pat<(v1f64 (int_arm64_neon_frsqrte (v1f64 FPR64:$Rn))), - (FRSQRTEv1i64 FPR64:$Rn)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// Here are the patterns for 8 and 16-bits to float. -// 8-bits -> float. -def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), bsub))>; -def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), bsub))>; -def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDURBi am_unscaled8:$addr), bsub))>; -// 16-bits -> float. -def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), hsub))>; -def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), hsub))>; -def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDURHi am_unscaled16:$addr), hsub))>; -// 32-bits are handled in target specific dag combine: -// performIntToFpCombine. -// 64-bits integer to 32-bits floating point, not possible with -// UCVTF on floating point registers (both source and destination -// must have the same size). - -// Here are the patterns for 8, 16, 32, and 64-bits to double. -// 8-bits -> double. -def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), bsub))>; -def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), bsub))>; -def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURBi am_unscaled8:$addr), bsub))>; -// 16-bits -> double. -def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), hsub))>; -def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), hsub))>; -def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi am_unscaled16:$addr), hsub))>; -// 32-bits -> double. -def : Pat <(f64 (uint_to_fp (i32 (load ro_indexed32:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), ssub))>; -def : Pat <(f64 (uint_to_fp (i32 (load am_indexed32:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), ssub))>; -def : Pat <(f64 (uint_to_fp (i32 (load am_unscaled32:$addr)))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURSi am_unscaled32:$addr), ssub))>; -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. 
- -//===----------------------------------------------------------------------===// -// Advanced SIMD three different-sized vector instructions. -//===----------------------------------------------------------------------===// - -defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_arm64_neon_addhn>; -defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_arm64_neon_subhn>; -defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_arm64_neon_raddhn>; -defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_arm64_neon_rsubhn>; -defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_arm64_neon_pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - int_arm64_neon_sabd>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - int_arm64_neon_sabd>; -defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", - BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; -defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", - BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; -defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_arm64_neon_smull>; -defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", - int_arm64_neon_sqdmull>; -defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", - BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; -defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", - BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - int_arm64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_arm64_neon_uabd>; -defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", - BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; -defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", - BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; -defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_arm64_neon_umull>; -defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", - BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; -defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", - BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; - -// Patterns for 64-bit pmull -def : Pat<(int_arm64_neon_pmull64 V64:$Rn, V64:$Rm), - (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_arm64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - (vector_extract (v2i64 V128:$Rm), (i64 1))), - (PMULLv2i64 V128:$Rn, V128:$Rm)>; - -// CodeGen patterns for addhn and subhn instructions, which can actually be -// written in LLVM IR without too much difficulty. 
- -// ADDHN -def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), - (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 8))))), - (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -// SUBHN -def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), - (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 8))))), - (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -//---------------------------------------------------------------------------- -// AdvSIMD bitwise extract from vector instruction. -//---------------------------------------------------------------------------- - -defm EXT : SIMDBitwiseExtract<"ext">; - -def : Pat<(v4i16 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v8i16 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v2f32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v4i32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v4f32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2f64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; - -// We use EXT to handle extract_subvector to copy the upper 64-bits of a -// 128-bit vector. 
-def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; - - -//---------------------------------------------------------------------------- -// AdvSIMD zip vector -//---------------------------------------------------------------------------- - -defm TRN1 : SIMDZipVector<0b010, "trn1", ARM64trn1>; -defm TRN2 : SIMDZipVector<0b110, "trn2", ARM64trn2>; -defm UZP1 : SIMDZipVector<0b001, "uzp1", ARM64uzp1>; -defm UZP2 : SIMDZipVector<0b101, "uzp2", ARM64uzp2>; -defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>; -defm ZIP2 : SIMDZipVector<0b111, "zip2", ARM64zip2>; - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX instructions -//---------------------------------------------------------------------------- - -defm TBL : SIMDTableLookup< 0, "tbl">; -defm TBX : SIMDTableLookupTied<1, "tbx">; - -def : Pat<(v8i8 (int_arm64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), - (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; -def : Pat<(v16i8 (int_arm64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), - (TBLv16i8One V128:$Ri, V128:$Rn)>; - -def : Pat<(v8i8 (int_arm64_neon_tbx1 (v8i8 V64:$Rd), - (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), - (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; -def : Pat<(v16i8 (int_arm64_neon_tbx1 (v16i8 V128:$Rd), - (v16i8 V128:$Ri), (v16i8 V128:$Rn))), - (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY instruction -//---------------------------------------------------------------------------- - -defm CPY : SIMDScalarCPY<"cpy">; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; -def : Pat<(i64 (int_arm64_neon_saddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(i64 (int_arm64_neon_uaddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_faddv (v2f32 V64:$Rn))), - (FADDPv2i32p V64:$Rn)>; -def : Pat<(f32 (int_arm64_neon_faddv (v4f32 V128:$Rn))), - (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; -def : Pat<(f64 (int_arm64_neon_faddv (v2f64 V128:$Rn))), - (FADDPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fmaxnmv (v2f32 V64:$Rn))), - (FMAXNMPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fmaxnmv (v2f64 V128:$Rn))), - (FMAXNMPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fmaxv (v2f32 V64:$Rn))), - 
(FMAXPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fmaxv (v2f64 V128:$Rn))), - (FMAXPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fminnmv (v2f32 V64:$Rn))), - (FMINNMPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fminnmv (v2f64 V128:$Rn))), - (FMINNMPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fminv (v2f32 V64:$Rn))), - (FMINPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fminv (v2f64 V128:$Rn))), - (FMINPv2i64p V128:$Rn)>; - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; -def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; -def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; -def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; -def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; -def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; -def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; - -def DUPv2i64lane : SIMDDup64FromElement; -def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; -def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; -def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; -def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; -def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; -def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; - -def : Pat<(v2f32 (ARM64dup (f32 FPR32:$Rn))), - (v2f32 (DUPv2i32lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), - (i64 0)))>; -def : Pat<(v4f32 (ARM64dup (f32 FPR32:$Rn))), - (v4f32 (DUPv4i32lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), - (i64 0)))>; -def : Pat<(v2f64 (ARM64dup (f64 FPR64:$Rn))), - (v2f64 (DUPv2i64lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), - (i64 0)))>; - -def : Pat<(v2f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), - (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; -def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), - (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; -def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), - (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; - -// If there's an (ARM64dup (vector_extract ...) ...), we can use a duplane -// instruction even if the types don't match: we just have to remap the lane -// carefully. N.b. this trick only applies to truncations. 
-def VecIndex_x2 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
-}]>;
-def VecIndex_x4 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
-}]>;
-def VecIndex_x8 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
-}]>;
-
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
-                            ValueType Src128VT, ValueType ScalVT,
-                            Instruction DUP, SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
-                                                     imm:$idx)))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
-                                                     imm:$idx)))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats<v8i8,  v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
-defm : DUPWithTruncPats<v8i8,  v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
-defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
-
-defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
-
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
-                               SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
-                                                         imm:$idx))))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
-                                                         imm:$idx))))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,  VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
-
-defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
-
-// SMOV and UMOV definitions, with some extra patterns for convenience
-defm SMOV : SMov;
-defm UMOV : UMov;
-
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
-          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
-
-// Extracting i8 or i16 elements will have the zero-extend transformed to
-// an 'and' mask by type legalization since neither i8 nor i16 are legal types
-// for ARM64. Match these patterns here since UMOV already zeroes out the high
-// bits of the destination register.
-def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), - (i32 0xff)), - (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; -def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), - (i32 0xffff)), - (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; - -defm INS : SIMDIns; - -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; - -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; - -def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), - (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (i32 FPR32:$Rn), ssub))>; -def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), - (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (i32 FPR32:$Rn), ssub))>; -def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), - (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (i64 FPR64:$Rn), dsub))>; - -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; - -def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), - (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), - (EXTRACT_SUBREG - (INSvi32lane - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), - VectorIndexS:$imm, - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), - (i64 0)), - dsub)>; -def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), - (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), - (INSvi32lane - V128:$Rn, VectorIndexS:$imm, - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), - (i64 0))>; -def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), - (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), - (INSvi64lane - V128:$Rn, VectorIndexD:$imm, - (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), - (i64 0))>; - -// Copy an element at a constant index in one vector into a constant indexed -// element of another. 
-// FIXME refactor to a shared class/dev parameterized on vector type, vector -// index type and INS extension -def : Pat<(v16i8 (int_arm64_neon_vcopy_lane - (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), - VectorIndexB:$idx2)), - (v16i8 (INSvi8lane - V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) - )>; -def : Pat<(v8i16 (int_arm64_neon_vcopy_lane - (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), - VectorIndexH:$idx2)), - (v8i16 (INSvi16lane - V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) - )>; -def : Pat<(v4i32 (int_arm64_neon_vcopy_lane - (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), - VectorIndexS:$idx2)), - (v4i32 (INSvi32lane - V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) - )>; -def : Pat<(v2i64 (int_arm64_neon_vcopy_lane - (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), - VectorIndexD:$idx2)), - (v2i64 (INSvi64lane - V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) - )>; - -multiclass Neon_INS_elt_pattern { - def : Pat<(VT128 (vector_insert V128:$src, - (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), - imm:$Immd)), - (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; - - def : Pat<(VT128 (vector_insert V128:$src, - (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), - imm:$Immd)), - (INS V128:$src, imm:$Immd, - (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; - - def : Pat<(VT64 (vector_insert V64:$src, - (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), - imm:$Immd)), - (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), - imm:$Immd, V128:$Rn, imm:$Immn), - dsub)>; - - def : Pat<(VT64 (vector_insert V64:$src, - (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), - imm:$Immd)), - (EXTRACT_SUBREG - (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, - (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), - dsub)>; -} - -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; - - -// Floating point vector extractions are codegen'd as either a sequence of -// subregister extractions, possibly fed by an INS if the lane number is -// anything other than zero. -def : Pat<(vector_extract (v2f64 V128:$Rn), 0), - (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), 0), - (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; -def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), - (f64 (EXTRACT_SUBREG - (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexD:$idx), - dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), - (f32 (EXTRACT_SUBREG - (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexS:$idx), - ssub))>; - -// All concat_vectors operations are canonicalised to act on i64 vectors for -// ARM64. In the general case we need an instruction, which had just as well be -// INS. 
-class ConcatPat<ValueType DstTy, ValueType SrcTy>
-  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
-        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
-                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
-
-def : ConcatPat<v2i64, v1i64>;
-def : ConcatPat<v2f64, v1f64>;
-def : ConcatPat<v4i32, v2i32>;
-def : ConcatPat<v4f32, v2f32>;
-def : ConcatPat<v8i16, v4i16>;
-def : ConcatPat<v16i8, v8i8>;
-
-// If the high lanes are undef, though, we can just ignore them:
-class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
-  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
-        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
-
-def : ConcatUndefPat<v2i64, v1i64>;
-def : ConcatUndefPat<v2f64, v1f64>;
-def : ConcatUndefPat<v4i32, v2i32>;
-def : ConcatUndefPat<v4f32, v2f32>;
-def : ConcatUndefPat<v8i16, v4i16>;
-def : ConcatUndefPat<v16i8, v8i8>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD across lanes instructions
-//----------------------------------------------------------------------------
-
-defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
-defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
-defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
-defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
-defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
-defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
-defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
-defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_arm64_neon_fmaxnmv>;
-defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_arm64_neon_fmaxv>;
-defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_arm64_neon_fminnmv>;
-defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_arm64_neon_fminv>;
-
-multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
-                                          SDPatternOperator intOp> {
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          (i64 0)))>;
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-          (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; -} - -multiclass SIMDAcrossLanesUnsignedIntrinsic { -// If there is a masking operation keeping only what has been actually -// generated, consume it. - def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; - -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; - -} - -multiclass SIMDAcrossLanesSignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -multiclass SIMDAcrossLanesUnsignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, 
"v16i8v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_arm64_neon_saddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_arm64_neon_saddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_arm64_neon_uaddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_arm64_neon_uaddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_arm64_neon_smaxv>; -def : Pat<(i32 (int_arm64_neon_smaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_arm64_neon_sminv>; -def : Pat<(i32 (int_arm64_neon_sminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_arm64_neon_umaxv>; -def : Pat<(i32 (int_arm64_neon_umaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_arm64_neon_uminv>; -def : Pat<(i32 (int_arm64_neon_uminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_arm64_neon_saddlv>; -defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_arm64_neon_uaddlv>; - -// The vaddlv_s32 intrinsic gets mapped to SADDLP. -def : Pat<(i64 (int_arm64_neon_saddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (SADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; -// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
-def : Pat<(i64 (int_arm64_neon_uaddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (UADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; - -//------------------------------------------------------------------------------ -// AdvSIMD modified immediate instructions -//------------------------------------------------------------------------------ - -// AdvSIMD BIC -defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", ARM64bici>; -// AdvSIMD ORR -defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", ARM64orri>; - -def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; - -def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; - -def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -// AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, - "fmov", ".2d", - [(set (v2f64 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, - "fmov", ".2s", - [(set (v2f32 V64:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, - "fmov", ".4s", - [(set (v4f32 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; - -// AdvSIMD MOVI - -// EDIT byte mask: scalar -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", - [(set FPR64:$Rd, simdimmtype10:$imm8)]>; -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 here. -def : Pat<(f64 (ARM64movi_edit imm0_255:$shift)), - (MOVID imm0_255:$shift)>; - -def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; - -def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; - -// EDIT byte mask: 2d - -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 in the pattern -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, - simdimmtype10, - "movi", ".2d", - [(set (v2i64 V128:$Rd), (ARM64movi_edit imm0_255:$imm8))]>; - - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
-// Complexity is added to break a tie with a plain MOVI. -let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - -def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; - -def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; - -def : Pat<(v2f64 (ARM64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4f32 (ARM64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; - -def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : Pat<(v2i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -// Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, - "movi", ".8b", - [(set (v8i8 V64:$Rd), (ARM64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, - "movi", ".16b", - [(set (v16i8 V128:$Rd), (ARM64movi imm0_255:$imm8))]>; - -// AdvSIMD MVNI - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; - -def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.4s 
$Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : Pat<(v2i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let neverHasSideEffects = 1 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; -} - -// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the -// instruction expects the addend first, while the intrinsic expects it last. - -// On the other hand, there are quite a few valid combinatorial options due to -// the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; - -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; - -multiclass FMLSIndexedAfterNegPatterns { - // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. 
- def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar - // (DUPLANE from 64-bit would be trivial). - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 (fneg V128:$Rm)), - VectorIndexD:$idx))), - (FMLSv2i64_indexed - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 (fneg FPR64Op:$Rm))))), - (FMLSv2i64_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexS:$idx)>; -} - -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; - -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; - -def : Pat<(v2f32 (fmul V64:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv2i32_indexed V64:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv4i32_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (ARM64dup (f64 FPR64:$Rm)))), - (FMULv2i64_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), - (i64 0))>; - -defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; -defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL 
: SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_arm64_neon_smull>; -defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_arm64_neon_sqdmull>; -defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_arm64_neon_umull>; - -// A scalar sqdmull with the second operand being a vector lane can be -// handled directly with the indexed instruction encoding. -def : Pat<(int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (vector_extract (v4i32 V128:$Vm), - VectorIndexS:$idx)), - (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; -// Codegen patterns for the above. We don't put these directly on the -// instructions because TableGen's type inference can't handle the truth. -// Having the same base pattern for fp <--> int totally freaks it out. -def : Pat<(int_arm64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), - (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), - (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), - (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), - (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; - -defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", ARM64vshl>; -defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; -defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : 
SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; -defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))>>; -defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, - (ARM64vashr node:$MHS, node:$RHS))>>; -defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))>>; -defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, - (ARM64vlshr node:$MHS, node:$RHS))>>; - -//---------------------------------------------------------------------------- -// AdvSIMD vector shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_arm64_neon_vcvtfp2fxs>; -defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_arm64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", - int_arm64_neon_vcvtfxs2fp>; -defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", - int_arm64_neon_rshrn>; -defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", ARM64vshl>; -defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", - BinOpFrag<(trunc (ARM64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_arm64_neon_vsli>; -def : Pat<(v1i64 (int_arm64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftL64:$imm))), - (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; -defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_arm64_neon_vsri>; -def : Pat<(v1i64 (int_arm64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; -defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))> >; -defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", - BinOpFrag<(ARM64vshl (sext node:$LHS), node:$RHS)>>; - -defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, (ARM64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", - int_arm64_neon_vcvtfxu2fp>; -defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 
0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))> >; -defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", - BinOpFrag<(ARM64vshl (zext node:$LHS), node:$RHS)>>; -defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, (ARM64vlshr node:$MHS, node:$RHS))> >; - -// SHRN patterns for when a logical right shift was used instead of arithmetic -// (the immediate guarantees no sign bits actually end up in the result so it -// doesn't matter). -def : Pat<(v8i8 (trunc (ARM64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), - (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; -def : Pat<(v4i16 (trunc (ARM64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), - (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; -def : Pat<(v2i32 (trunc (ARM64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), - (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; - -def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), - (trunc (ARM64vlshr (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)))), - (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; -def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), - (trunc (ARM64vlshr (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)))), - (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; -def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), - (trunc (ARM64vlshr (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)))), - (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - -// Vector sign and zero extensions are implemented with SSHLL and USSHLL. -// Anyexts are implemented as zexts. -def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Also match an extend from the upper half of a 128 bit source register. 
-def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (SSHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (SSHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (SSHLLv4i32_shift V128:$Rn, (i32 0))>; - -// Vector shift sxtl aliases -def : InstAlias<"sxtl.8h $dst, $src1", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.8h, $src1.8b", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.4s $dst, $src1", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.4s, $src1.4h", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.2d $dst, $src1", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.2d, $src1.2s", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift sxtl2 aliases -def : InstAlias<"sxtl2.8h $dst, $src1", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.8h, $src1.16b", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.4s $dst, $src1", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.4s, $src1.8h", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.2d $dst, $src1", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.2d, $src1.4s", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// Vector shift uxtl aliases -def : InstAlias<"uxtl.8h $dst, $src1", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.8h, $src1.8b", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.4s $dst, $src1", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.4s, $src1.4h", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.2d $dst, $src1", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.2d, $src1.2s", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift uxtl2 aliases -def : InstAlias<"uxtl2.8h $dst, $src1", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.8h, $src1.16b", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.4s $dst, $src1", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.4s, $src1.8h", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.2d $dst, $src1", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.2d, $src1.4s", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// These patterns are more complex because floating point loads do not -// support sign extension. 
-// The sign extension has to be explicitly added and is only supported for -// one step: byte-to-half, half-to-word, word-to-doubleword. -// SCVTF GPR -> FPR is 9 cycles. -// SCVTF FPR -> FPR is 4 cyclces. -// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles. -// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR -// and still being faster. -// However, this is not good for code size. -// 8-bits -> float. 2 sizes step-up. -def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 ro_indexed8:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv8i8_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), - bsub), - 0), - dsub)), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_indexed8:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv8i8_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), - bsub), - 0), - dsub)), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_unscaled8:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv8i8_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURBi am_unscaled8:$addr), - bsub), - 0), - dsub)), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -// 16-bits -> float. 1 size step-up. -def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), - hsub), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), - hsub), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi am_unscaled16:$addr), - hsub), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -// 32-bits to 32-bits are handled in target specific dag combine: -// performIntToFpCombine. -// 64-bits integer to 32-bits floating point, not possible with -// SCVTF on floating point registers (both source and destination -// must have the same size). - -// Here are the patterns for 8, 16, 32, and 64-bits to double. -// 8-bits -> double. 3 size step-up: give up. -// 16-bits -> double. 2 size step. -def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), - hsub), - 0), - dsub)), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), - hsub), - 0), - dsub)), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi am_unscaled16:$addr), - hsub), - 0), - dsub)), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -// 32-bits -> double. 1 size step-up. 
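Making the latency argument above explicit: with the figures quoted there (SCVTF GPR->FPR about 9 cycles, SCVTF FPR->FPR about 4, one SXTL widening step about 2), staying on the FP unit wins whenever at most two widening steps are needed, and loses at three. A short C++ sketch of that arithmetic, using only the numbers quoted in the comment rather than any general cost model:

#include <cassert>

// Latencies quoted in the comment above, in cycles (the load itself excluded).
constexpr int ScvtfGprToFpr = 9; // convert from a GPR source
constexpr int ScvtfFprToFpr = 4; // convert with the source already in an FPR
constexpr int SxtlStep      = 2; // one SSHLL #0 (sxtl) widening step

// Cost of loading straight into the FP unit and widening there.
constexpr int fpUnitRoute(int wideningSteps) {
  return wideningSteps * SxtlStep + ScvtfFprToFpr;
}

static_assert(fpUnitRoute(1) == 6, "16-bit -> float: one step, beats 9");
static_assert(fpUnitRoute(2) == 8, "8-bit -> float: two steps, still beats 9");
static_assert(fpUnitRoute(3) > ScvtfGprToFpr, "8-bit -> double: three steps, give up");

int main() { return 0; }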
-def : Pat <(f64 (sint_to_fp (i32 (load ro_indexed32:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (load am_indexed32:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (load am_unscaled32:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURSi am_unscaled32:$addr), - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. - - -//---------------------------------------------------------------------------- -// AdvSIMD Load-Store Structure -//---------------------------------------------------------------------------- -defm LD1 : SIMDLd1Multiple<"ld1">; -defm LD2 : SIMDLd2Multiple<"ld2">; -defm LD3 : SIMDLd3Multiple<"ld3">; -defm LD4 : SIMDLd4Multiple<"ld4">; - -defm ST1 : SIMDSt1Multiple<"st1">; -defm ST2 : SIMDSt2Multiple<"st2">; -defm ST3 : SIMDSt3Multiple<"st3">; -defm ST4 : SIMDSt4Multiple<"st4">; - -class Ld1Pat - : Pat<(ty (load am_simdnoindex:$vaddr)), (INST am_simdnoindex:$vaddr)>; - -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; - -class St1Pat - : Pat<(store ty:$Vt, am_simdnoindex:$vaddr), - (INST ty:$Vt, am_simdnoindex:$vaddr)>; - -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; - -//--- -// Single-element -//--- - -defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; -defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; -defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; -defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; -let mayLoad = 1, neverHasSideEffects = 1 in { -defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; -defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; -defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; -defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; -defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; -defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; -defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; -defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; -defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; -defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; -defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; -defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; -defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; -defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; -defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; -defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; -} - -def : Pat<(v8i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))), - (LD1Rv8b am_simdnoindex:$vaddr)>; -def : Pat<(v16i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))), - (LD1Rv16b am_simdnoindex:$vaddr)>; -def : Pat<(v4i16 (ARM64dup 
(i32 (extloadi16 am_simdnoindex:$vaddr)))), - (LD1Rv4h am_simdnoindex:$vaddr)>; -def : Pat<(v8i16 (ARM64dup (i32 (extloadi16 am_simdnoindex:$vaddr)))), - (LD1Rv8h am_simdnoindex:$vaddr)>; -def : Pat<(v2i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))), - (LD1Rv2s am_simdnoindex:$vaddr)>; -def : Pat<(v4i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))), - (LD1Rv4s am_simdnoindex:$vaddr)>; -def : Pat<(v2i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))), - (LD1Rv2d am_simdnoindex:$vaddr)>; -def : Pat<(v1i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))), - (LD1Rv1d am_simdnoindex:$vaddr)>; -// Grab the floating point version too -def : Pat<(v2f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))), - (LD1Rv2s am_simdnoindex:$vaddr)>; -def : Pat<(v4f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))), - (LD1Rv4s am_simdnoindex:$vaddr)>; -def : Pat<(v2f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))), - (LD1Rv2d am_simdnoindex:$vaddr)>; -def : Pat<(v1f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))), - (LD1Rv1d am_simdnoindex:$vaddr)>; - -class Ld1Lane128Pat - : Pat<(vector_insert (VTy VecListOne128:$Rd), - (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx), - (LD1 VecListOne128:$Rd, VecIndex:$idx, am_simdnoindex:$vaddr)>; - -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; - -class Ld1Lane64Pat - : Pat<(vector_insert (VTy VecListOne64:$Rd), - (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx), - (EXTRACT_SUBREG - (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), - VecIndex:$idx, am_simdnoindex:$vaddr), - dsub)>; - -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; - - -defm LD1 : SIMDLdSt1SingleAliases<"ld1">; -defm LD2 : SIMDLdSt2SingleAliases<"ld2">; -defm LD3 : SIMDLdSt3SingleAliases<"ld3">; -defm LD4 : SIMDLdSt4SingleAliases<"ld4">; - -// Stores -defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; -defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; -defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; -defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; - -let AddedComplexity = 8 in -class St1Lane128Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr), - (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr)>; - -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; - -let AddedComplexity = 8 in -class St1Lane64Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, am_simdnoindex:$vaddr)>; - -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; - -multiclass St1LanePost64Pat { - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr, offset), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, am_simdnoindex:$vaddr, XZR)>; - - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr, GPR64:$Rm), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, am_simdnoindex:$vaddr, $Rm)>; -} - -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; - 
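The LD1R definitions above match a dup of a scalar load (a load-and-replicate), and the ST1 lane patterns match a scalar store of one extracted lane, so ACLE NEON code such as the following would normally be selected to a single ld1r or st1 (lane) by them. This is only an illustrative consumer of the patterns, assuming the standard arm_neon.h intrinsics:

#include <arm_neon.h>

// Splat one float from memory into all four lanes: matches the
// (ARM64dup (load ...)) patterns and should become one ld1r.
float32x4_t splat_from_memory(const float *p) {
  return vld1q_dup_f32(p);
}

// Store only lane 2 of a vector: matches the scalar_store-of-vector_extract
// patterns and should become one st1 {v.s}[2] style store.
void store_lane2(float *p, float32x4_t v) {
  vst1q_lane_f32(p, v, 2);
}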
-multiclass St1LanePost128Pat { - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr, offset), - (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr, XZR)>; - - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr, GPR64:$Rm), - (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr, $Rm)>; -} - -defm : St1LanePost128Pat; -defm : St1LanePost128Pat; -defm : St1LanePost128Pat; -defm : St1LanePost128Pat; -defm : St1LanePost128Pat; -defm : St1LanePost128Pat; - -let mayStore = 1, neverHasSideEffects = 1 in { -defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; -defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; -defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; -defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>; -defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>; -defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>; -defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>; -defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>; -defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>; -defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>; -defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>; -defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>; -} - -defm ST1 : SIMDLdSt1SingleAliases<"st1">; -defm ST2 : SIMDLdSt2SingleAliases<"st2">; -defm ST3 : SIMDLdSt3SingleAliases<"st3">; -defm ST4 : SIMDLdSt4SingleAliases<"st4">; - -//---------------------------------------------------------------------------- -// Crypto extensions -//---------------------------------------------------------------------------- - -def AESErr : AESTiedInst<0b0100, "aese", int_arm64_crypto_aese>; -def AESDrr : AESTiedInst<0b0101, "aesd", int_arm64_crypto_aesd>; -def AESMCrr : AESInst< 0b0110, "aesmc", int_arm64_crypto_aesmc>; -def AESIMCrr : AESInst< 0b0111, "aesimc", int_arm64_crypto_aesimc>; - -def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_arm64_crypto_sha1c>; -def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_arm64_crypto_sha1p>; -def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_arm64_crypto_sha1m>; -def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_arm64_crypto_sha1su0>; -def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_arm64_crypto_sha256h>; -def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_arm64_crypto_sha256h2>; -def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_arm64_crypto_sha256su1>; - -def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_arm64_crypto_sha1h>; -def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_arm64_crypto_sha1su1>; -def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_arm64_crypto_sha256su0>; - -//---------------------------------------------------------------------------- -// Compiler-pseudos -//---------------------------------------------------------------------------- -// FIXME: Like for X86, these should go in their own separate .td file. - -// Any instruction that defines a 32-bit result leaves the high half of the -// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. -// FIXME: X86 also checks for CMOV here. Do we need something similar? 
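The def32 machinery that follows relies on an architectural guarantee: any instruction that writes a W register zeroes the upper 32 bits of the corresponding X register, so zero-extending such a result needs no instruction at all, only a subregister copy. A tiny C++ model of that rule (XReg and writeW are made-up names for the illustration):

#include <cassert>
#include <cstdint>

// Model of an AArch64 general-purpose register: writing the 32-bit W view
// always clears the upper 32 bits of the 64-bit X view.
struct XReg {
  uint64_t X = 0;
  void writeW(uint32_t Value) { X = Value; } // implicit zero-extension
};

int main() {
  XReg R;
  R.X = 0xdeadbeef00000000ULL;
  R.writeW(0x12345678u);        // e.g. the result of "add w0, w1, w2"
  assert(R.X == 0x12345678u);   // upper half is zero, so the zext is free
  return 0;
}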
-def def32 : PatLeaf<(i32 GPR32:$src), [{ - return N->getOpcode() != ISD::TRUNCATE && - N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && - N->getOpcode() != ISD::CopyFromReg; -}]>; - -// In the case of a 32-bit def that is known to implicitly zero-extend, -// we can use a SUBREG_TO_REG. -def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; - -// For an anyext, we don't care what the high bits are, so we can perform an -// INSERT_SUBREF into an IMPLICIT_DEF. -def : Pat<(i64 (anyext GPR32:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; - -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. -def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; - -// To sign extend, we use a signed bitfield move instruction (SBFM) on the -// containing super-reg. -def : Pat<(i64 (sext GPR32:$src)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i8 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i8 imm0_63:$imm)))>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i16 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i16 imm0_63:$imm)))>; - -def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i32 imm0_63:$imm)))>; - -// sra patterns have an AddedComplexity of 10, so make sure we have a higher -// AddedComplexity for the following patterns since we want to match sext + sra -// patterns before we attempt to match a single sra node. -let AddedComplexity = 20 in { -// We support all sext + sra combinations which preserve at least one bit of the -// original value which is to be sign extended. E.g. we support shifts up to -// bitwidth-1 bits. 
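The comment above describes folding a sign-extension followed by an arithmetic shift right into one SBFM, i.e. a signed bitfield extract of the bits that survive the shift (the patterns follow below). A standalone C++ check of the underlying identity for the i8 case, mirroring the (SBFMWri $Rn, imm, 7) form; signExtendBits is a hypothetical helper written out just for the check:

#include <cassert>
#include <cstdint>

// Sign-extend bits [Lsb, Msb] of Value, the way SBFM/SBFX produces them.
static int32_t signExtendBits(uint32_t Value, unsigned Lsb, unsigned Msb) {
  unsigned Width = Msb - Lsb + 1;
  uint32_t Field = (Value >> Lsb) & ((1u << Width) - 1);
  uint32_t SignBit = 1u << (Width - 1);
  return (int32_t)((Field ^ SignBit) - SignBit); // two's-complement sign extend
}

int main() {
  for (int b = -128; b <= 127; ++b) {
    for (unsigned sh = 0; sh <= 7; ++sh) {
      // sra(sext_inreg(x, i8), sh) ...
      int32_t viaSextThenSra = ((int32_t)(int8_t)b) >> sh;
      // ... equals sign-extending bits [sh, 7], which is what
      // SBFMWri $Rn, sh, 7 computes.
      int32_t viaSbfx = signExtendBits((uint8_t)b, sh, 7);
      assert(viaSextThenSra == viaSbfx);
    }
  }
  return 0;
}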
-def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; - -def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; - -def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 imm0_31:$imm), 31)>; -} // AddedComplexity = 20 - -// To truncate, we can simply extract from a subregister. -def : Pat<(i32 (trunc GPR64sp:$src)), - (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; - -// __builtin_trap() uses the BRK instruction on ARM64. -def : Pat<(trap), (BRK 1)>; - -// Conversions within AdvSIMD types in the same register size are free. -// But because we need a consistent lane ordering, in big endian many -// conversions require one or more REV instructions. -// -// Consider a simple memory load followed by a bitconvert then a store. -// v0 = load v2i32 -// v1 = BITCAST v2i32 v0 to v4i16 -// store v4i16 v2 -// -// In big endian mode every memory access has an implicit byte swap. LDR and -// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that -// is, they treat the vector as a sequence of elements to be byte-swapped. -// The two pairs of instructions are fundamentally incompatible. We've decided -// to use LD1/ST1 only to simplify compiler implementation. -// -// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes -// the original code sequence: -// v0 = load v2i32 -// v1 = REV v2i32 (implicit) -// v2 = BITCAST v2i32 v1 to v4i16 -// v3 = REV v4i16 v2 (implicit) -// store v4i16 v3 -// -// But this is now broken - the value stored is different to the value loaded -// due to lane reordering. To fix this, on every BITCAST we must perform two -// other REVs: -// v0 = load v2i32 -// v1 = REV v2i32 (implicit) -// v2 = REV v2i32 -// v3 = BITCAST v2i32 v2 to v4i16 -// v4 = REV v4i16 -// v5 = REV v4i16 v4 (implicit) -// store v4i16 v5 -// -// This means an extra two instructions, but actually in most cases the two REV -// instructions can be combined into one. For example: -// (REV64_2s (REV64_4h X)) === (REV32_4h X) -// -// There is also no 128-bit REV instruction. This must be synthesized with an -// EXT instruction. -// -// Most bitconverts require some sort of conversion. 
The only exceptions are: -// a) Identity conversions - vNfX <-> vNiX -// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX -// - -let Predicates = [IsLE] in { -def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; - -def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i8 (bitconvert GPR64:$Xn)), - (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v4i16 (bitconvert GPR64:$Xn)), - (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v2i32 (bitconvert GPR64:$Xn)), - (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v2f32 (bitconvert GPR64:$Xn)), - (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; - -def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), - (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), - (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), - (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), - (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -} -def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; - -def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), - (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; -def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), - (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; -def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), - (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; -def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; - -let Predicates = [IsLE] in { -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), - (v1i64 (REV64v2i32 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), - (v1i64 (REV64v4i16 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), - (v1i64 (REV64v8i8 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), - (v1i64 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : 
Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), - (v2i32 (REV32v4i16 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), - (v2i32 (REV32v8i8 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), - (v4i16 (REV32v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), - (v4i16 (REV16v8i8 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), - (v4i16 (REV32v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), - (v8i8 (REV32v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), - (v8i8 (REV16v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), - (v8i8 (REV32v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), - (f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), - (f64 (REV64v4i16 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), - (f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v8i8 
FPR64:$src))), - (f64 (REV64v8i8 FPR64:$src))>; -} -def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), - (v1f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), - (v1f64 (REV64v4i16 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), - (v1f64 (REV64v8i8 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), - (v1f64 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), - (v2f32 (REV32v4i16 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), - (v2f32 (REV32v8i8 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), - (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), - (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), - (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), - (REV64v8i16 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), - (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), - (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), - (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), - (REV64v16i8 FPR128:$src), (i32 8)))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 
FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), - (v2f64 (EXTv16i8 FPR128:$src, - FPR128:$src, (i32 8)))>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), - (v2f64 (REV64v4i32 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), - (v2f64 (REV64v8i16 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), - (v2f64 (REV64v16i8 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), - (v2f64 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), - (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), - (v4f32 (REV32v8i16 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), - (v4f32 (REV32v16i8 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), - (v4f32 (REV64v4i32 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), - (v4f32 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), - (v2i64 (EXTv16i8 FPR128:$src, - FPR128:$src, (i32 8)))>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), - (v2i64 (REV64v4i32 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), - (v2i64 (REV64v8i16 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), - (v2i64 (REV64v16i8 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), - (v2i64 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), - (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), - (i32 8)))>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), - (v4i32 (REV64v4i32 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), - (v4i32 (REV32v8i16 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), - (v4i32 (REV32v16i8 FPR128:$src))>; -def : 
Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), - (v4i32 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), - (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), - (REV64v8i16 FPR128:$src), - (i32 8)))>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), - (v8i16 (REV64v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), - (v8i16 (REV32v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), - (v8i16 (REV16v16i8 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), - (v8i16 (REV64v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), - (v8i16 (REV32v8i16 FPR128:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), - (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), - (REV64v16i8 FPR128:$src), - (i32 8)))>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), - (v16i8 (REV64v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), - (v16i8 (REV32v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), - (v16i8 (REV16v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), - (v16i8 (REV64v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), - (v16i8 (REV32v16i8 FPR128:$src))>; -} - -def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; - -// A 64-bit subvector insert to the first 128-bit vector position -// is a subregister copy that needs no instruction. 
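The big-endian discussion above notes that the pair of REVs inserted around a bitconvert can usually be folded into one, e.g. (REV64_2s (REV64_4h X)) === (REV32_4h X). That identity is easy to confirm on a byte-index model of a 64-bit register; rev() below is a made-up helper that models lane reversal, not backend code:

#include <array>
#include <cassert>

using Bytes = std::array<unsigned char, 8>; // one 64-bit register, byte 0 lowest

// Reverse the order of ElemBytes-sized elements inside each
// ContainerBytes-sized container, which is what REV<container>.<element> does.
static Bytes rev(Bytes In, unsigned ContainerBytes, unsigned ElemBytes) {
  Bytes Out{};
  unsigned ElemsPerContainer = ContainerBytes / ElemBytes;
  for (unsigned i = 0; i < 8; ++i) {
    unsigned Container = i / ContainerBytes;
    unsigned Elem = (i % ContainerBytes) / ElemBytes;
    unsigned Off = i % ElemBytes;
    unsigned NewElem = ElemsPerContainer - 1 - Elem;
    Out[Container * ContainerBytes + NewElem * ElemBytes + Off] = In[i];
  }
  return Out;
}

int main() {
  Bytes X = {0, 1, 2, 3, 4, 5, 6, 7};
  Bytes LHS = rev(rev(X, 8, 2), 8, 4); // REV64.4h, then REV64.2s
  Bytes RHS = rev(X, 4, 2);            // a single REV32.4h
  assert(LHS == RHS);
  return 0;
}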
-def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; - -// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 -// or v2f32. -def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), - (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), - (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), - (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; - // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, - // so we match on v4f32 here, not v2f32. This will also catch adding - // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), - (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; - -// Scalar 64-bit shifts in FPR64 registers. -def : Pat<(i64 (int_arm64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; - -// Tail call return handling. These are all compiler pseudo-instructions, -// so no encoding information or anything like that. -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; -} - -def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), - (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; -def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), - (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; -def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)), - (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; - -include "ARM64InstrAtomics.td" diff --git a/lib/Target/ARM64/ARM64MCInstLower.cpp b/lib/Target/ARM64/ARM64MCInstLower.cpp deleted file mode 100644 index 525f484ca4c5..000000000000 --- a/lib/Target/ARM64/ARM64MCInstLower.cpp +++ /dev/null @@ -1,201 +0,0 @@ -//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains code to lower ARM64 MachineInstrs to their corresponding -// MCInst records. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64MCInstLower.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang, - AsmPrinter &printer) - : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} - -MCSymbol * -ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { - return Printer.getSymbol(MO.getGlobal()); -} - -MCSymbol * -ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { - return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const { - // FIXME: We would like an efficient form for this, so we don't have to do a - // lot of extra uniquing. - MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; - if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_GOTPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; - else - assert(0 && "Unexpected target flags with MO_GOT on GV operand"); - } else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_TLVPPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; - else - llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); - } else { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_PAGEOFF; - } - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const { - uint32_t RefFlags = 0; - - if (MO.getTargetFlags() & ARM64II::MO_GOT) - RefFlags |= ARM64MCExpr::VK_GOT; - else if (MO.getTargetFlags() & ARM64II::MO_TLS) { - TLSModel::Model Model; - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - Model = Printer.TM.getTLSModel(GV); - } else { - assert(MO.isSymbol() && - StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && - "unexpected external TLS symbol"); - Model = TLSModel::GeneralDynamic; - } - switch (Model) { - case TLSModel::InitialExec: - RefFlags |= ARM64MCExpr::VK_GOTTPREL; - break; - case TLSModel::LocalExec: - RefFlags |= ARM64MCExpr::VK_TPREL; - break; - case TLSModel::LocalDynamic: - RefFlags |= ARM64MCExpr::VK_DTPREL; - break; - case TLSModel::GeneralDynamic: - RefFlags |= ARM64MCExpr::VK_TLSDESC; - break; - } - } else { - // No modifier means this is a generic reference, classified as absolute for - // the cases where it matters (:abs_g0: etc). 
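The Darwin and ELF symbol-lowering routines above (the ELF one continues below) share one shape: decode bit-fields of the machine operand's target flags and fold them into a relocation variant kind attached to the MCExpr. The following is a standalone sketch of that decoding pattern only; the flag values and kind names are invented and do not match ARM64II::MO_* or ARM64MCExpr::VK_*.

    #include <cstdio>
    #include <string>

    // Illustrative flag layout only; the real encodings live in ARM64BaseInfo.h.
    enum : unsigned {
      FLAG_FRAGMENT_MASK = 0x7,  // which piece of the address (page, pageoff, ...)
      FLAG_PAGE          = 0x1,
      FLAG_PAGEOFF       = 0x2,
      FLAG_GOT           = 0x8,  // reference goes through the GOT
      FLAG_NC            = 0x10, // "no check": relocation may silently truncate
    };

    // Build a textual "variant kind" the same way the lowering code ORs
    // modifier bits together: base classification plus address-fragment bits.
    static std::string classifyReference(unsigned Flags) {
      std::string Kind = (Flags & FLAG_GOT) ? "GOT" : "ABS";
      switch (Flags & FLAG_FRAGMENT_MASK) {
      case FLAG_PAGE:    Kind += "_PAGE";    break;
      case FLAG_PAGEOFF: Kind += "_PAGEOFF"; break;
      default:           break;             // whole-address reference
      }
      if (Flags & FLAG_NC)
        Kind += "_NC";
      return Kind;
    }

    int main() {
      // e.g. the low 12 bits of a GOT slot address, with no overflow check:
      std::printf("%s\n",
                  classifyReference(FLAG_GOT | FLAG_PAGEOFF | FLAG_NC).c_str());
      return 0;
    }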
- RefFlags |= ARM64MCExpr::VK_ABS; - } - - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefFlags |= ARM64MCExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF) - RefFlags |= ARM64MCExpr::VK_PAGEOFF; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3) - RefFlags |= ARM64MCExpr::VK_G3; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2) - RefFlags |= ARM64MCExpr::VK_G2; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1) - RefFlags |= ARM64MCExpr::VK_G1; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0) - RefFlags |= ARM64MCExpr::VK_G0; - - if (MO.getTargetFlags() & ARM64II::MO_NC) - RefFlags |= ARM64MCExpr::VK_NC; - - const MCExpr *Expr = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - - ARM64MCExpr::VariantKind RefKind; - RefKind = static_cast(RefFlags); - Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx); - - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, - MCSymbol *Sym) const { - if (TargetTriple.isOSDarwin()) - return lowerSymbolOperandDarwin(MO, Sym); - - assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); - return lowerSymbolOperandELF(MO, Sym); -} - -bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO, - MCOperand &MCOp) const { - switch (MO.getType()) { - default: - assert(0 && "unknown operand type"); - case MachineOperand::MO_Register: - // Ignore all implicit register operands. - if (MO.isImplicit()) - return false; - MCOp = MCOperand::CreateReg(MO.getReg()); - break; - case MachineOperand::MO_RegisterMask: - // Regmasks are like implicit defs. - return false; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr( - MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); - break; - case MachineOperand::MO_BlockAddress: - MCOp = LowerSymbolOperand( - MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); - break; - } - return true; -} - -void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) - OutMI.addOperand(MCOp); - } -} diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h deleted file mode 100644 index 0b6f4f1ec646..000000000000 --- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h +++ /dev/null @@ -1,163 +0,0 @@ -//===- ARM64MachineFuctionInfo.h - ARM64 machine function info --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file declares ARM64-specific per-machine-function information. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MACHINEFUNCTIONINFO_H -#define ARM64MACHINEFUNCTIONINFO_H - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" - -namespace llvm { - -/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and -/// contains private ARM64-specific information for each MachineFunction. -class ARM64FunctionInfo : public MachineFunctionInfo { - - /// Number of bytes of arguments this function has on the stack. If the callee - /// is expected to restore the argument stack this should be a multiple of 16, - /// all usable during a tail call. - /// - /// The alternative would forbid tail call optimisation in some cases: if we - /// want to transfer control from a function with 8-bytes of stack-argument - /// space to a function with 16-bytes then misalignment of this value would - /// make a stack adjustment necessary, which could not be undone by the - /// callee. - unsigned BytesInStackArgArea; - - /// The number of bytes to restore to deallocate space for incoming - /// arguments. Canonically 0 in the C calling convention, but non-zero when - /// callee is expected to pop the args. - unsigned ArgumentStackToRestore; - - /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). - bool HasStackFrame; - - /// \brief Amount of stack frame size, not including callee-saved registers. - unsigned LocalStackSize; - - /// \brief Number of TLS accesses using the special (combinable) - /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; - - /// \brief FrameIndex for start of varargs area for arguments passed on the - /// stack. - int VarArgsStackIndex; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// general purpose registers. - int VarArgsGPRIndex; - - /// \brief Size of the varargs area for arguments passed in general purpose - /// registers. - unsigned VarArgsGPRSize; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// floating-point registers. - int VarArgsFPRIndex; - - /// \brief Size of the varargs area for arguments passed in floating-point - /// registers. 
- unsigned VarArgsFPRSize; - -public: - ARM64FunctionInfo() - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} - - explicit ARM64FunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { - (void)MF; - } - - unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } - void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } - - unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } - void setArgumentStackToRestore(unsigned bytes) { - ArgumentStackToRestore = bytes; - } - - bool hasStackFrame() const { return HasStackFrame; } - void setHasStackFrame(bool s) { HasStackFrame = s; } - - void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } - unsigned getLocalStackSize() const { return LocalStackSize; } - - void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } - unsigned getNumLocalDynamicTLSAccesses() const { - return NumLocalDynamicTLSAccesses; - } - - int getVarArgsStackIndex() const { return VarArgsStackIndex; } - void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } - - int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } - void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } - - unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } - void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } - - int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } - void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } - - unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } - void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } - - typedef SmallPtrSet SetOfInstructions; - - const SetOfInstructions &getLOHRelated() const { return LOHRelated; } - - // Shortcuts for LOH related types. - class MILOHDirective { - MCLOHType Kind; - - /// Arguments of this directive. Order matters. - SmallVector Args; - - public: - typedef SmallVectorImpl LOHArgs; - - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) - : Kind(Kind), Args(Args.begin(), Args.end()) { - assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); - } - - MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } - }; - - typedef MILOHDirective::LOHArgs MILOHArgs; - typedef SmallVector MILOHContainer; - - const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } - - /// Add a LOH directive of this @p Kind and this @p Args. - void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { - LOHContainerSet.push_back(MILOHDirective(Kind, Args)); - LOHRelated.insert(Args.begin(), Args.end()); - } - -private: - // Hold the lists of LOHs. 
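The VarArgsGPRIndex/VarArgsGPRSize and VarArgsFPRIndex/VarArgsFPRSize fields above record where the unnamed-argument register save areas land; the LOH bookkeeping members continue below. As a rough, hedged illustration of the sizing those fields track (the actual computation lives in the calling-convention lowering, not in this header; the counts assume the usual AAPCS64 eight integer and eight FP/SIMD argument registers):

    #include <cstdio>

    // Any of the eight integer argument registers (x0-x7) and eight FP/SIMD
    // registers (q0-q7) not consumed by named arguments get spilled so va_arg
    // can find them. Register counts and sizes here are illustrative.
    struct VarArgAreaSizes {
      unsigned GPRSaveBytes;
      unsigned FPRSaveBytes;
    };

    static VarArgAreaSizes computeSaveAreas(unsigned NamedGPRsUsed,
                                            unsigned NamedFPRsUsed) {
      VarArgAreaSizes S;
      S.GPRSaveBytes = (8 - NamedGPRsUsed) * 8;  // remaining x-regs, 8 bytes each
      S.FPRSaveBytes = (8 - NamedFPRsUsed) * 16; // remaining q-regs, 16 bytes each
      return S;
    }

    int main() {
      // e.g. int printf(const char *fmt, ...): one named GPR arg, no FP args.
      VarArgAreaSizes S = computeSaveAreas(1, 0);
      std::printf("GPR save area: %u bytes, FPR save area: %u bytes\n",
                  S.GPRSaveBytes, S.FPRSaveBytes); // 56 and 128
      return 0;
    }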
- MILOHContainer LOHContainerSet; - SetOfInstructions LOHRelated; -}; -} // End llvm namespace - -#endif // ARM64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/ARM64/ARM64RegisterInfo.cpp b/lib/Target/ARM64/ARM64RegisterInfo.cpp deleted file mode 100644 index d3c647bd90b2..000000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.cpp +++ /dev/null @@ -1,400 +0,0 @@ -//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64RegisterInfo.h" -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetOptions.h" - -using namespace llvm; - -#define GET_REGINFO_TARGET_DESC -#include "ARM64GenRegisterInfo.inc" - -ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii, - const ARM64Subtarget *sti) - : ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {} - -const MCPhysReg * -ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_SaveList; - else - return CSR_ARM64_AAPCS_SaveList; -} - -const uint32_t * -ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - if (CC == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_RegMask; - else - return CSR_ARM64_AAPCS_RegMask; -} - -const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const { - if (STI->isTargetDarwin()) - return CSR_ARM64_TLS_Darwin_RegMask; - - assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); - return CSR_ARM64_TLS_ELF_RegMask; -} - -const uint32_t * -ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { - // This should return a register mask that is the same as that returned by - // getCallPreservedMask but that additionally preserves the register used for - // the first i64 argument (which must also be the register used to return a - // single i64 return value) - // - // In case that the calling convention does not use the same register for - // both, the function should return NULL (does not currently apply) - return CSR_ARM64_AAPCS_ThisReturn_RegMask; -} - -BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - // FIXME: avoid re-calculating this every time. 
- BitVector Reserved(getNumRegs()); - Reserved.set(ARM64::SP); - Reserved.set(ARM64::XZR); - Reserved.set(ARM64::WSP); - Reserved.set(ARM64::WZR); - - if (TFI->hasFP(MF) || STI->isTargetDarwin()) { - Reserved.set(ARM64::FP); - Reserved.set(ARM64::W29); - } - - if (STI->isTargetDarwin()) { - Reserved.set(ARM64::X18); // Platform register - Reserved.set(ARM64::W18); - } - - if (hasBasePointer(MF)) { - Reserved.set(ARM64::X19); - Reserved.set(ARM64::W19); - } - - return Reserved; -} - -bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF, - unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (Reg) { - default: - break; - case ARM64::SP: - case ARM64::XZR: - case ARM64::WSP: - case ARM64::WZR: - return true; - case ARM64::X18: - case ARM64::W18: - return STI->isTargetDarwin(); - case ARM64::FP: - case ARM64::W29: - return TFI->hasFP(MF) || STI->isTargetDarwin(); - case ARM64::W19: - case ARM64::X19: - return hasBasePointer(MF); - } - - return false; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - return &ARM64::GPR64RegClass; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (RC == &ARM64::CCRRegClass) - return nullptr; // Can't copy NZCV. - return RC; -} - -unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; } - -bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // In the presence of variable sized objects, if the fixed stack size is - // large enough that referencing from the FP won't result in things being - // in range relatively often, we can use a base pointer to allow access - // from the other direction like the SP normally works. - if (MFI->hasVarSizedObjects()) { - // Conservatively estimate whether the negative offset from the frame - // pointer will be sufficient to reach. If a function has a smallish - // frame, it's less likely to have lots of spills and callee saved - // space, so it's all more likely to be within range of the frame pointer. - // If it's wrong, we'll materialize the constant and still get to the - // object; it's just suboptimal. Negative offsets use the unscaled - // load/store instructions, which have a 9-bit signed immediate. - if (MFI->getLocalFrameSize() < 256) - return false; - return true; - } - - return false; -} - -unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - return TFI->hasFP(MF) ? ARM64::FP : ARM64::SP; -} - -bool -ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return true; -} - -bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF) - const { - return true; -} - -bool -ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // ARM64FrameLowering::resolveFrameIndexReference() can always fall back - // to the stack pointer, so only put the emergency spill slot next to the - // FP when there's no better way to access it (SP or base pointer). 
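A condensed restatement of the hasBasePointer heuristic above, before the scavenging-slot logic continues below: FP-relative access to locals uses negative, unscaled offsets with a 9-bit signed immediate, so a base pointer is only worth reserving when variable-sized objects coexist with a local area too large for that reach. wantsBasePointer is an invented name for this sketch.

    #include <cstdio>

    // With variable-sized objects on the stack, locals are reached from the
    // frame pointer at negative offsets, limited to the [-256, 255] range of
    // the unscaled load/store forms.
    static bool wantsBasePointer(bool HasVarSizedObjects, int LocalFrameSize) {
      if (!HasVarSizedObjects)
        return false;               // SP-relative addressing still works
      return LocalFrameSize >= 256; // locals may be out of reach of FP - 256
    }

    int main() {
      std::printf("small frame: %d\n", wantsBasePointer(true, 128));  // 0
      std::printf("large frame: %d\n", wantsBasePointer(true, 4096)); // 1
      return 0;
    }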
- return MFI->hasVarSizedObjects() && !hasBasePointer(MF); -} - -bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) - const { - return true; -} - -bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Only consider eliminating leaf frames. - if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && - MFI->adjustsStack())) - return true; - return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); -} - -/// needsFrameBaseReg - Returns true if the instruction's frame index -/// reference would be better served by a base register other than FP -/// or SP. Used by LocalStackFrameAllocation to determine which frame index -/// references it should create new base registers for. -bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, - int64_t Offset) const { - for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) - assert(i < MI->getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - - // It's the load/store FI references that cause issues, as it can be difficult - // to materialize the offset if it won't fit in the literal field. Estimate - // based on the size of the local frame and some conservative assumptions - // about the rest of the stack frame (note, this is pre-regalloc, so - // we don't know everything for certain yet) whether this offset is likely - // to be out of range of the immediate. Return true if so. - - // We only generate virtual base registers for loads and stores, so - // return false for everything else. - if (!MI->mayLoad() && !MI->mayStore()) - return false; - - // Without a virtual base register, if the function has variable sized - // objects, all fixed-size local references will be via the frame pointer, - // Approximate the offset and see if it's legal for the instruction. - // Note that the incoming offset is based on the SP value at function entry, - // so it'll be negative. - MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Estimate an offset from the frame pointer. - // Conservatively assume all GPR callee-saved registers get pushed. - // FP, LR, X19-X28, D8-D15. 64-bits each. - int64_t FPOffset = Offset - 16 * 20; - // Estimate an offset from the stack pointer. - // The incoming offset is relating to the SP at the start of the function, - // but when we access the local it'll be relative to the SP after local - // allocation, so adjust our SP-relative offset by that allocation size. - Offset += MFI->getLocalFrameSize(); - // Assume that we'll have at least some spill slots allocated. - // FIXME: This is a total SWAG number. We should run some statistics - // and pick a real one. - Offset += 128; // 128 bytes of spill slots - - // If there is a frame pointer, try using it. - // The FP is only available if there is no dynamic realignment. We - // don't know for sure yet whether we'll need that, so we guess based - // on whether there are any local variables that would trigger it. - if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) - return false; - - // If we can reference via the stack pointer or base pointer, try that. - // FIXME: This (and the code that resolves the references) can be improved - // to only disallow SP relative references in the live range of - // the VLA(s). In practice, it's unclear how much difference that - // would make, but it may be worth doing. 
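needsFrameBaseReg above works from two conservative guesses rather than exact frame layout, because it runs before register allocation. A small worked example of those estimates, with invented input values:

    #include <cstdint>
    #include <cstdio>

    // Worked example of the estimates in needsFrameBaseReg. IncomingOffset is
    // the frame-index offset relative to SP at function entry (so negative).
    int main() {
      int64_t IncomingOffset = -40;   // hypothetical frame-index offset
      int64_t LocalFrameSize = 1024;  // hypothetical local allocation size

      // FP-relative guess: assume every GPR/FP callee-save gets pushed
      // (FP, LR, X19-X28, D8-D15; the code budgets 16 bytes for each of 20).
      int64_t FPOffset = IncomingOffset - 16 * 20;

      // SP-relative guess: the local area sits below the entry SP, plus a
      // 128-byte allowance for spill slots.
      int64_t SPOffset = IncomingOffset + LocalFrameSize + 128;

      std::printf("estimated FP-relative offset: %lld\n", (long long)FPOffset);
      std::printf("estimated SP-relative offset: %lld\n", (long long)SPOffset);
      // If neither estimate fits the instruction's immediate field, the
      // function returns true and the local stack allocation pass creates a
      // virtual base register instead.
      return 0;
    }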
- if (isFrameOffsetLegal(MI, Offset)) - return false; - - // The offset likely isn't legal; we want to allocate a virtual base register. - return true; -} - -bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const { - assert(Offset <= INT_MAX && "Offset too big to fit in int."); - assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; - return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal; -} - -/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx -/// at the beginning of the basic block. -void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, - int FrameIdx, - int64_t Offset) const { - MachineBasicBlock::iterator Ins = MBB->begin(); - DebugLoc DL; // Defaults to "unknown" - if (Ins != MBB->end()) - DL = Ins->getDebugLoc(); - - const MCInstrDesc &MCID = TII->get(ARM64::ADDXri); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const MachineFunction &MF = *MBB->getParent(); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - - BuildMI(*MBB, Ins, DL, MCID, BaseReg) - .addFrameIndex(FrameIdx) - .addImm(Offset) - .addImm(Shifter); -} - -void ARM64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets - unsigned i = 0; - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII); - assert(Done && "Unable to resolve frame index!"); - (void)Done; -} - -void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - - MachineInstr &MI = *II; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - const ARM64FrameLowering *TFI = static_cast( - MF.getTarget().getFrameLowering()); - - int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned FrameReg; - int Offset; - - // Special handling of dbg_value, stackmap and patchpoint instructions. - if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); - return; - } - - // Modify MI as necessary to handle as much of 'Offset' as possible - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); - if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) - return; - - assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && - "Emergency spill slot is out of reach"); - - // If we get here, the immediate doesn't fit into the instruction. We folded - // as much as possible above. Handle the rest, providing a register that is - // SP+LargeImm. 
- unsigned ScratchReg = - MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass); - emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); -} - -namespace llvm { - -unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (RC->getID()) { - default: - return 0; - case ARM64::GPR32RegClassID: - case ARM64::GPR32spRegClassID: - case ARM64::GPR32allRegClassID: - case ARM64::GPR64spRegClassID: - case ARM64::GPR64allRegClassID: - case ARM64::GPR64RegClassID: - case ARM64::GPR32commonRegClassID: - case ARM64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP - - STI->isTargetDarwin() // X18 reserved as platform register - - hasBasePointer(MF); // X19 - case ARM64::FPR8RegClassID: - case ARM64::FPR16RegClassID: - case ARM64::FPR32RegClassID: - case ARM64::FPR64RegClassID: - case ARM64::FPR128RegClassID: - return 32; - - case ARM64::DDRegClassID: - case ARM64::DDDRegClassID: - case ARM64::DDDDRegClassID: - case ARM64::QQRegClassID: - case ARM64::QQQRegClassID: - case ARM64::QQQQRegClassID: - return 32; - - case ARM64::FPR128_loRegClassID: - return 16; - } -} - -} // namespace llvm diff --git a/lib/Target/ARM64/ARM64RegisterInfo.h b/lib/Target/ARM64/ARM64RegisterInfo.h deleted file mode 100644 index 7691fadbcc8a..000000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.h +++ /dev/null @@ -1,101 +0,0 @@ -//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the MRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64REGISTERINFO_H -#define LLVM_TARGET_ARM64REGISTERINFO_H - -#define GET_REGINFO_HEADER -#include "ARM64GenRegisterInfo.inc" - -namespace llvm { - -class ARM64InstrInfo; -class ARM64Subtarget; -class MachineFunction; -class RegScavenger; -class TargetRegisterClass; - -struct ARM64RegisterInfo : public ARM64GenRegisterInfo { -private: - const ARM64InstrInfo *TII; - const ARM64Subtarget *STI; - -public: - ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti); - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; - - /// Code Generation virtual methods... - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; - - unsigned getCSRFirstUseCost() const override { - // The cost will be compared against BlockFrequency where entry has the - // value of 1 << 14. A value of 5 will choose to spill or split really - // cold path instead of using a callee-saved register. - return 5; - } - - // Calls involved in thread-local variable lookup save more registers than - // normal calls, so they need a different mask to represent this. 
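Jumping back to getRegPressureLimit at the end of the .cpp above, before the header declaration continues below: the GPR limit is simply 31 allocatable registers minus whatever this function and target reserve. A compact restatement (gprPressureLimit is an invented name):

    #include <cstdio>

    // Start from 31 (the 32 register numbers minus the shared XZR/SP encoding)
    // and subtract each register the target keeps out of allocation.
    static unsigned gprPressureLimit(bool HasFP, bool IsDarwin, bool HasBasePtr) {
      unsigned Limit = 32 - 1;        // XZR/SP never allocatable
      if (HasFP || IsDarwin) --Limit; // x29 kept as the frame pointer
      if (IsDarwin)          --Limit; // x18 reserved as the platform register
      if (HasBasePtr)        --Limit; // x19 used as the base pointer
      return Limit;
    }

    int main() {
      // e.g. a Darwin function with both a frame pointer and a base pointer:
      std::printf("GPR pressure limit: %u\n",
                  gprPressureLimit(true, true, true)); // 28
      return 0;
    }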
- const uint32_t *getTLSCallPreservedMask() const; - - /// getThisReturnPreservedMask - Returns a call preserved mask specific to the - /// case that 'returned' is on an i64 first argument if the calling convention - /// is one that can (partially) model this attribute with a preserved mask - /// (i.e. it is a calling convention that uses the same register for the first - /// i64 argument and an i64 return value) - /// - /// Should return NULL in the case that the calling convention does not have - /// this property - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; - - BitVector getReservedRegs(const MachineFunction &MF) const override; - const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - - bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; - - bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, - int FrameIdx, - int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS = nullptr) const override; - bool cannotEliminateFrame(const MachineFunction &MF) const; - - bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; - bool hasBasePointer(const MachineFunction &MF) const; - unsigned getBaseRegister() const; - - // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; -}; - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64REGISTERINFO_H diff --git a/lib/Target/ARM64/ARM64RegisterInfo.td b/lib/Target/ARM64/ARM64RegisterInfo.td deleted file mode 100644 index fb82598ff7a3..000000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.td +++ /dev/null @@ -1,583 +0,0 @@ -//===- ARM64RegisterInfo.td - Describe the ARM64 Regisers --*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - - -class ARM64Reg enc, string n, list subregs = [], - list altNames = []> - : Register { - let HWEncoding = enc; - let Namespace = "ARM64"; - let SubRegs = subregs; -} - -let Namespace = "ARM64" in { - def sub_32 : SubRegIndex<32>; - - def bsub : SubRegIndex<8>; - def hsub : SubRegIndex<16>; - def ssub : SubRegIndex<32>; - def dsub : SubRegIndex<32>; - def qhisub : SubRegIndex<64>; - def qsub : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def dsub0 : SubRegIndex<64>; - def dsub1 : SubRegIndex<64>; - def dsub2 : SubRegIndex<64>; - def dsub3 : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def qsub0 : SubRegIndex<128>; - def qsub1 : SubRegIndex<128>; - def qsub2 : SubRegIndex<128>; - def qsub3 : SubRegIndex<128>; -} - -let Namespace = "ARM64" in { - def vreg : RegAltNameIndex; - def vlist1 : RegAltNameIndex; -} - -//===----------------------------------------------------------------------===// -// Registers -//===----------------------------------------------------------------------===// -def W0 : ARM64Reg<0, "w0" >, DwarfRegNum<[0]>; -def W1 : ARM64Reg<1, "w1" >, DwarfRegNum<[1]>; -def W2 : ARM64Reg<2, "w2" >, DwarfRegNum<[2]>; -def W3 : ARM64Reg<3, "w3" >, DwarfRegNum<[3]>; -def W4 : ARM64Reg<4, "w4" >, DwarfRegNum<[4]>; -def W5 : ARM64Reg<5, "w5" >, DwarfRegNum<[5]>; -def W6 : ARM64Reg<6, "w6" >, DwarfRegNum<[6]>; -def W7 : ARM64Reg<7, "w7" >, DwarfRegNum<[7]>; -def W8 : ARM64Reg<8, "w8" >, DwarfRegNum<[8]>; -def W9 : ARM64Reg<9, "w9" >, DwarfRegNum<[9]>; -def W10 : ARM64Reg<10, "w10">, DwarfRegNum<[10]>; -def W11 : ARM64Reg<11, "w11">, DwarfRegNum<[11]>; -def W12 : ARM64Reg<12, "w12">, DwarfRegNum<[12]>; -def W13 : ARM64Reg<13, "w13">, DwarfRegNum<[13]>; -def W14 : ARM64Reg<14, "w14">, DwarfRegNum<[14]>; -def W15 : ARM64Reg<15, "w15">, DwarfRegNum<[15]>; -def W16 : ARM64Reg<16, "w16">, DwarfRegNum<[16]>; -def W17 : ARM64Reg<17, "w17">, DwarfRegNum<[17]>; -def W18 : ARM64Reg<18, "w18">, DwarfRegNum<[18]>; -def W19 : ARM64Reg<19, "w19">, DwarfRegNum<[19]>; -def W20 : ARM64Reg<20, "w20">, DwarfRegNum<[20]>; -def W21 : ARM64Reg<21, "w21">, DwarfRegNum<[21]>; -def W22 : ARM64Reg<22, "w22">, DwarfRegNum<[22]>; -def W23 : ARM64Reg<23, "w23">, DwarfRegNum<[23]>; -def W24 : ARM64Reg<24, "w24">, DwarfRegNum<[24]>; -def W25 : ARM64Reg<25, "w25">, DwarfRegNum<[25]>; -def W26 : ARM64Reg<26, "w26">, DwarfRegNum<[26]>; -def W27 : ARM64Reg<27, "w27">, DwarfRegNum<[27]>; -def W28 : ARM64Reg<28, "w28">, DwarfRegNum<[28]>; -def W29 : ARM64Reg<29, "w29">, DwarfRegNum<[29]>; -def W30 : ARM64Reg<30, "w30">, DwarfRegNum<[30]>; -def WSP : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>; -def WZR : ARM64Reg<31, "wzr">, DwarfRegAlias; - -let SubRegIndices = [sub_32] in { -def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias; -def X1 : ARM64Reg<1, "x1", [W1]>, DwarfRegAlias; -def X2 : ARM64Reg<2, "x2", [W2]>, DwarfRegAlias; -def X3 : ARM64Reg<3, "x3", [W3]>, DwarfRegAlias; -def X4 : ARM64Reg<4, "x4", [W4]>, DwarfRegAlias; -def X5 : ARM64Reg<5, "x5", [W5]>, DwarfRegAlias; -def X6 : ARM64Reg<6, "x6", [W6]>, DwarfRegAlias; -def X7 : ARM64Reg<7, "x7", [W7]>, DwarfRegAlias; -def X8 : ARM64Reg<8, "x8", [W8]>, DwarfRegAlias; -def X9 : ARM64Reg<9, "x9", [W9]>, DwarfRegAlias; -def X10 : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias; -def X11 : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias; -def X12 : 
ARM64Reg<12, "x12", [W12]>, DwarfRegAlias; -def X13 : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias; -def X14 : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias; -def X15 : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias; -def X16 : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias; -def X17 : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias; -def X18 : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias; -def X19 : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias; -def X20 : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias; -def X21 : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias; -def X22 : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias; -def X23 : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias; -def X24 : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias; -def X25 : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias; -def X26 : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias; -def X27 : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias; -def X28 : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias; -def FP : ARM64Reg<29, "x29", [W29]>, DwarfRegAlias; -def LR : ARM64Reg<30, "x30", [W30]>, DwarfRegAlias; -def SP : ARM64Reg<31, "sp", [WSP]>, DwarfRegAlias; -def XZR : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias; -} - -// Condition code register. -def NZCV : ARM64Reg<0, "nzcv">; - -// GPR register classes with the intersections of GPR32/GPR32sp and -// GPR64/GPR64sp for use by the coalescer. -def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> { - let AltOrders = [(rotl GPR32common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64common : RegisterClass<"ARM64", [i64], 64, - (add (sequence "X%u", 0, 28), FP, LR)> { - let AltOrders = [(rotl GPR64common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -// GPR register classes which exclude SP/WSP. -def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> { - let AltOrders = [(rotl GPR32, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> { - let AltOrders = [(rotl GPR64, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -// GPR register classes which include SP/WSP. -def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> { - let AltOrders = [(rotl GPR32sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> { - let AltOrders = [(rotl GPR64sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -def GPR32sponly : RegisterClass<"ARM64", [i32], 32, (add WSP)>; -def GPR64sponly : RegisterClass<"ARM64", [i64], 64, (add SP)>; - -// GPR register classes which include WZR/XZR AND SP/WSP. This is not a -// constraint used by any instructions, it is used as a common super-class. -def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>; -def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>; - -// For tail calls, we can't use callee-saved registers, as they are restored -// to the saved value before the tail call, which would clobber a call address. -// This is for indirect tail calls to store the address of the destination. -def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21, - X22, X23, X24, X25, X26, - X27, X28)>; - -// GPR register classes for post increment amount of vector load/store that -// has alternate printing when Rm=31 and prints a constant immediate value -// equal to the total number of bytes transferred. - -// FIXME: TableGen *should* be able to do these itself now. 
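The AltOrders entries above request an allocation order rotated left by eight registers; AltOrderSelect then decides at run time whether that alternate order is used. The following is a quick model of the list transformation itself, nothing more:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    // What "(rotl GPR32common, 8)" does to the allocation order: the same
    // registers, but the list starts at w8 and wraps around to w0..w7 last.
    int main() {
      std::vector<std::string> Order;
      for (int i = 0; i <= 30; ++i)
        Order.push_back("w" + std::to_string(i));

      std::rotate(Order.begin(), Order.begin() + 8, Order.end());

      for (const std::string &R : Order)
        std::printf("%s ", R.c_str()); // w8 w9 ... w30 w0 w1 ... w7
      std::printf("\n");
      return 0;
    }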
There appears to be -// a bug in counting how many operands a Post-indexed MCInst should have which -// means the aliases don't trigger. -def GPR64pi1 : RegisterOperand">; -def GPR64pi2 : RegisterOperand">; -def GPR64pi3 : RegisterOperand">; -def GPR64pi4 : RegisterOperand">; -def GPR64pi6 : RegisterOperand">; -def GPR64pi8 : RegisterOperand">; -def GPR64pi12 : RegisterOperand">; -def GPR64pi16 : RegisterOperand">; -def GPR64pi24 : RegisterOperand">; -def GPR64pi32 : RegisterOperand">; -def GPR64pi48 : RegisterOperand">; -def GPR64pi64 : RegisterOperand">; - -// Condition code regclass. -def CCR : RegisterClass<"ARM64", [i32], 32, (add NZCV)> { - let CopyCost = -1; // Don't allow copying of status registers. - - // CCR is not allocatable. - let isAllocatable = 0; -} - -//===----------------------------------------------------------------------===// -// Floating Point Scalar Registers -//===----------------------------------------------------------------------===// - -def B0 : ARM64Reg<0, "b0">, DwarfRegNum<[64]>; -def B1 : ARM64Reg<1, "b1">, DwarfRegNum<[65]>; -def B2 : ARM64Reg<2, "b2">, DwarfRegNum<[66]>; -def B3 : ARM64Reg<3, "b3">, DwarfRegNum<[67]>; -def B4 : ARM64Reg<4, "b4">, DwarfRegNum<[68]>; -def B5 : ARM64Reg<5, "b5">, DwarfRegNum<[69]>; -def B6 : ARM64Reg<6, "b6">, DwarfRegNum<[70]>; -def B7 : ARM64Reg<7, "b7">, DwarfRegNum<[71]>; -def B8 : ARM64Reg<8, "b8">, DwarfRegNum<[72]>; -def B9 : ARM64Reg<9, "b9">, DwarfRegNum<[73]>; -def B10 : ARM64Reg<10, "b10">, DwarfRegNum<[74]>; -def B11 : ARM64Reg<11, "b11">, DwarfRegNum<[75]>; -def B12 : ARM64Reg<12, "b12">, DwarfRegNum<[76]>; -def B13 : ARM64Reg<13, "b13">, DwarfRegNum<[77]>; -def B14 : ARM64Reg<14, "b14">, DwarfRegNum<[78]>; -def B15 : ARM64Reg<15, "b15">, DwarfRegNum<[79]>; -def B16 : ARM64Reg<16, "b16">, DwarfRegNum<[80]>; -def B17 : ARM64Reg<17, "b17">, DwarfRegNum<[81]>; -def B18 : ARM64Reg<18, "b18">, DwarfRegNum<[82]>; -def B19 : ARM64Reg<19, "b19">, DwarfRegNum<[83]>; -def B20 : ARM64Reg<20, "b20">, DwarfRegNum<[84]>; -def B21 : ARM64Reg<21, "b21">, DwarfRegNum<[85]>; -def B22 : ARM64Reg<22, "b22">, DwarfRegNum<[86]>; -def B23 : ARM64Reg<23, "b23">, DwarfRegNum<[87]>; -def B24 : ARM64Reg<24, "b24">, DwarfRegNum<[88]>; -def B25 : ARM64Reg<25, "b25">, DwarfRegNum<[89]>; -def B26 : ARM64Reg<26, "b26">, DwarfRegNum<[90]>; -def B27 : ARM64Reg<27, "b27">, DwarfRegNum<[91]>; -def B28 : ARM64Reg<28, "b28">, DwarfRegNum<[92]>; -def B29 : ARM64Reg<29, "b29">, DwarfRegNum<[93]>; -def B30 : ARM64Reg<30, "b30">, DwarfRegNum<[94]>; -def B31 : ARM64Reg<31, "b31">, DwarfRegNum<[95]>; - -let SubRegIndices = [bsub] in { -def H0 : ARM64Reg<0, "h0", [B0]>, DwarfRegAlias; -def H1 : ARM64Reg<1, "h1", [B1]>, DwarfRegAlias; -def H2 : ARM64Reg<2, "h2", [B2]>, DwarfRegAlias; -def H3 : ARM64Reg<3, "h3", [B3]>, DwarfRegAlias; -def H4 : ARM64Reg<4, "h4", [B4]>, DwarfRegAlias; -def H5 : ARM64Reg<5, "h5", [B5]>, DwarfRegAlias; -def H6 : ARM64Reg<6, "h6", [B6]>, DwarfRegAlias; -def H7 : ARM64Reg<7, "h7", [B7]>, DwarfRegAlias; -def H8 : ARM64Reg<8, "h8", [B8]>, DwarfRegAlias; -def H9 : ARM64Reg<9, "h9", [B9]>, DwarfRegAlias; -def H10 : ARM64Reg<10, "h10", [B10]>, DwarfRegAlias; -def H11 : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias; -def H12 : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias; -def H13 : ARM64Reg<13, "h13", [B13]>, DwarfRegAlias; -def H14 : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias; -def H15 : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias; -def H16 : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias; -def H17 : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias; -def 
H18 : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias; -def H19 : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias; -def H20 : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias; -def H21 : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias; -def H22 : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias; -def H23 : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias; -def H24 : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias; -def H25 : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias; -def H26 : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias; -def H27 : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias; -def H28 : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias; -def H29 : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias; -def H30 : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias; -def H31 : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias; -} - -let SubRegIndices = [hsub] in { -def S0 : ARM64Reg<0, "s0", [H0]>, DwarfRegAlias; -def S1 : ARM64Reg<1, "s1", [H1]>, DwarfRegAlias; -def S2 : ARM64Reg<2, "s2", [H2]>, DwarfRegAlias; -def S3 : ARM64Reg<3, "s3", [H3]>, DwarfRegAlias; -def S4 : ARM64Reg<4, "s4", [H4]>, DwarfRegAlias; -def S5 : ARM64Reg<5, "s5", [H5]>, DwarfRegAlias; -def S6 : ARM64Reg<6, "s6", [H6]>, DwarfRegAlias; -def S7 : ARM64Reg<7, "s7", [H7]>, DwarfRegAlias; -def S8 : ARM64Reg<8, "s8", [H8]>, DwarfRegAlias; -def S9 : ARM64Reg<9, "s9", [H9]>, DwarfRegAlias; -def S10 : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias; -def S11 : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias; -def S12 : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias; -def S13 : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias; -def S14 : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias; -def S15 : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias; -def S16 : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias; -def S17 : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias; -def S18 : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias; -def S19 : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias; -def S20 : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias; -def S21 : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias; -def S22 : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias; -def S23 : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias; -def S24 : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias; -def S25 : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias; -def S26 : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias; -def S27 : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias; -def S28 : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias; -def S29 : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias; -def S30 : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias; -def S31 : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias; -} - -let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { -def D0 : ARM64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; -def D1 : ARM64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; -def D2 : ARM64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; -def D3 : ARM64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; -def D4 : ARM64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; -def D5 : ARM64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; -def D6 : ARM64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias; -def D7 : ARM64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; -def D8 : ARM64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; -def D9 : ARM64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias; -def D10 : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; -def D11 : ARM64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; -def D12 : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; -def D13 : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; -def D14 : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; -def D15 : ARM64Reg<15, "d15", [S15], ["v15", 
""]>, DwarfRegAlias; -def D16 : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; -def D17 : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; -def D18 : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; -def D19 : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; -def D20 : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; -def D21 : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; -def D22 : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; -def D23 : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; -def D24 : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; -def D25 : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; -def D26 : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; -def D27 : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; -def D28 : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; -def D29 : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; -def D30 : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; -def D31 : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; -} - -let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { -def Q0 : ARM64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; -def Q1 : ARM64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; -def Q2 : ARM64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; -def Q3 : ARM64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; -def Q4 : ARM64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; -def Q5 : ARM64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; -def Q6 : ARM64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; -def Q7 : ARM64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; -def Q8 : ARM64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; -def Q9 : ARM64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; -def Q10 : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; -def Q11 : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; -def Q12 : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; -def Q13 : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; -def Q14 : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; -def Q15 : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; -def Q16 : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; -def Q17 : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; -def Q18 : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; -def Q19 : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; -def Q20 : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; -def Q21 : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; -def Q22 : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; -def Q23 : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; -def Q24 : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; -def Q25 : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias; -def Q26 : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; -def Q27 : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; -def Q28 : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; -def Q29 : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; -def Q30 : ARM64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias; -def Q31 : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; -} - -def FPR8 : RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> { - let Size = 8; -} -def FPR16 : RegisterClass<"ARM64", [f16], 16, (sequence "H%u", 0, 31)> { - let Size = 16; -} -def FPR32 : RegisterClass<"ARM64", [f32, i32], 32,(sequence "S%u", 0, 31)>; -def FPR64 : 
RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64], - 64, (sequence "D%u", 0, 31)>; -// We don't (yet) have an f128 legal type, so don't use that here. We -// normalize 128-bit vectors to v2f64 for arg passing and such, so use -// that here. -def FPR128 : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], - 128, (sequence "Q%u", 0, 31)>; - -// The lower 16 vector registers. Some instructions can only take registers -// in this range. -def FPR128_lo : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc FPR128, 16)>; - -// Pairs, triples, and quads of 64-bit vector registers. -def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; -def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2)]>; -def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2), (rotl FPR64, 3)]>; -def DD : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> { - let Size = 128; -} -def DDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> { - let Size = 196; -} -def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> { - let Size = 256; -} - -// Pairs, triples, and quads of 128-bit vector registers. -def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; -def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2)]>; -def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2), (rotl FPR128, 3)]>; -def QQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> { - let Size = 256; -} -def QQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> { - let Size = 384; -} -def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> { - let Size = 512; -} - - -// Vector operand versions of the FP registers. Alternate name printing and -// assmebler matching. -def VectorReg64AsmOperand : AsmOperandClass { - let Name = "VectorReg64"; - let PredicateMethod = "isVectorReg"; -} -def VectorReg128AsmOperand : AsmOperandClass { - let Name = "VectorReg128"; - let PredicateMethod = "isVectorReg"; -} - -def V64 : RegisterOperand { - let ParserMatchClass = VectorReg64AsmOperand; -} - -def V128 : RegisterOperand { - let ParserMatchClass = VectorReg128AsmOperand; -} - -def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } -def V128_lo : RegisterOperand { - let ParserMatchClass = VectorRegLoAsmOperand; -} - -class TypedVecListAsmOperand - : AsmOperandClass { - let Name = "TypedVectorList" # count # "_" # lanes # kind; - - let PredicateMethod - = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; - let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; -} - -class TypedVecListRegOperand - : RegisterOperand">; - -multiclass VectorList { - // With implicit types (probably on instruction instead). E.g. 
{ v0, v1 } - def _64AsmOperand : AsmOperandClass { - let Name = NAME # "64"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList64Operands<" # count # ">"; - } - - def "64" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_64AsmOperand"); - } - - def _128AsmOperand : AsmOperandClass { - let Name = NAME # "128"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList128Operands<" # count # ">"; - } - - def "128" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_128AsmOperand"); - } - - // 64-bit register lists with explicit type. - - // { v0.8b, v1.8b } - def _8bAsmOperand : TypedVecListAsmOperand; - def "8b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); - } - - // { v0.4h, v1.4h } - def _4hAsmOperand : TypedVecListAsmOperand; - def "4h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); - } - - // { v0.2s, v1.2s } - def _2sAsmOperand : TypedVecListAsmOperand; - def "2s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); - } - - // { v0.1d, v1.1d } - def _1dAsmOperand : TypedVecListAsmOperand; - def "1d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); - } - - // 128-bit register lists with explicit type - - // { v0.16b, v1.16b } - def _16bAsmOperand : TypedVecListAsmOperand; - def "16b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); - } - - // { v0.8h, v1.8h } - def _8hAsmOperand : TypedVecListAsmOperand; - def "8h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); - } - - // { v0.4s, v1.4s } - def _4sAsmOperand : TypedVecListAsmOperand; - def "4s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); - } - - // { v0.2d, v1.2d } - def _2dAsmOperand : TypedVecListAsmOperand; - def "2d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); - } - - // { v0.b, v1.b } - def _bAsmOperand : TypedVecListAsmOperand; - def "b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_bAsmOperand"); - } - - // { v0.h, v1.h } - def _hAsmOperand : TypedVecListAsmOperand; - def "h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_hAsmOperand"); - } - - // { v0.s, v1.s } - def _sAsmOperand : TypedVecListAsmOperand; - def "s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_sAsmOperand"); - } - - // { v0.d, v1.d } - def _dAsmOperand : TypedVecListAsmOperand; - def "d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_dAsmOperand"); - } - - -} - -defm VecListOne : VectorList<1, FPR64, FPR128>; -defm VecListTwo : VectorList<2, DD, QQ>; -defm VecListThree : VectorList<3, DDD, QQQ>; -defm VecListFour : VectorList<4, DDDD, QQQQ>; - - -// Register operand versions of the scalar FP registers. -def FPR16Op : RegisterOperand; -def FPR32Op : RegisterOperand; -def FPR64Op : RegisterOperand; -def FPR128Op : RegisterOperand; diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td deleted file mode 100644 index 3a4194173a8e..000000000000 --- a/lib/Target/ARM64/ARM64Schedule.td +++ /dev/null @@ -1,104 +0,0 @@ -//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
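Before the rest of the removed ARM64Schedule.td below: the VectorList multiclass above stamps out one register operand per (list length, arrangement) pair purely by string concatenation with TableGen's '#' operator. A throwaway sketch that prints the operand names that scheme produces:

    #include <cstdio>

    int main() {
      const char *Counts[] = {"One", "Two", "Three", "Four"};
      const char *Kinds[]  = {"64", "128",            // implicitly typed lists
                              "8b", "4h", "2s", "1d", // 64-bit arrangements
                              "16b", "8h", "4s", "2d",// 128-bit arrangements
                              "b", "h", "s", "d"};    // single-lane forms
      // Same cross product the multiclass builds as NAME # count # kind.
      for (const char *C : Counts)
        for (const char *K : Kinds)
          std::printf("VecList%s%s\n", C, K);
      return 0;
    }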
-// -//===----------------------------------------------------------------------===// - -// Define TII for use in SchedVariant Predicates. -// const MachineInstr *MI and const TargetSchedModel *SchedModel -// are defined by default. -def : PredicateProlog<[{ - const ARM64InstrInfo *TII = - static_cast(SchedModel->getInstrInfo()); - (void)TII; -}]>; - -// ARM64 Scheduler Definitions - -def WriteImm : SchedWrite; // MOVN, MOVZ -// TODO: Provide variants for MOV32/64imm Pseudos that dynamically -// select the correct sequence of WriteImms. - -def WriteI : SchedWrite; // ALU -def WriteISReg : SchedWrite; // ALU of Shifted-Reg -def WriteIEReg : SchedWrite; // ALU of Extended-Reg -def ReadI : SchedRead; // ALU -def ReadISReg : SchedRead; // ALU of Shifted-Reg -def ReadIEReg : SchedRead; // ALU of Extended-Reg -def WriteExtr : SchedWrite; // EXTR shifts a reg pair -def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair -def WriteIS : SchedWrite; // Shift/Scale -def WriteID32 : SchedWrite; // 32-bit Divide -def WriteID64 : SchedWrite; // 64-bit Divide -def ReadID : SchedRead; // 32/64-bit Divide -def WriteIM32 : SchedWrite; // 32-bit Multiply -def WriteIM64 : SchedWrite; // 64-bit Multiply -def ReadIM : SchedRead; // 32/64-bit Multiply -def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate -def WriteBr : SchedWrite; // Branch -def WriteBrReg : SchedWrite; // Indirect Branch - -def WriteLD : SchedWrite; // Load from base addr plus immediate offset -def WriteST : SchedWrite; // Store to base addr plus immediate offset -def WriteSTP : SchedWrite; // Store a register pair. -def WriteAdr : SchedWrite; // Address pre/post increment. - -def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). -def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). -def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. - -// Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; - -// Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; - -// ScaledIdxPred is true if a WriteLDIdx operand will be -// scaled. Subtargets can use this to dynamically select resources and -// latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; - -// Serialized two-level address load. -// EXAMPLE: LOADGot -def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; - -// Serialized two-level address lookup. -// EXAMPLE: MOVaddr... -def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; - -// The second register of a load-pair. -// LDP,LDPSW,LDNP,LDXP,LDAXP -def WriteLDHi : SchedWrite; - -// Store-exclusive is a store followed by a dependent load. -def WriteSTX : WriteSequence<[WriteST, WriteLD]>; - -def WriteSys : SchedWrite; // Long, variable latency system ops. -def WriteBarrier : SchedWrite; // Memory barrier. -def WriteHint : SchedWrite; // Hint instruction. - -def WriteF : SchedWrite; // General floating-point ops. -def WriteFCmp : SchedWrite; // Floating-point compare. -def WriteFCvt : SchedWrite; // Float conversion. -def WriteFCopy : SchedWrite; // Float-int register copy. -def WriteFImm : SchedWrite; // Floating-point immediate. -def WriteFMul : SchedWrite; // Floating-point multiply. -def WriteFDiv : SchedWrite; // Floating-point division. - -def WriteV : SchedWrite; // Vector ops. -def WriteVLD : SchedWrite; // Vector loads. 
-def WriteVST : SchedWrite; // Vector stores. - -// Read the unwritten lanes of the VLD's destination registers. -def ReadVLD : SchedRead; - -// Sequential vector load and shuffle. -def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; -def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; - -// Store a shuffled vector. -def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; -def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp deleted file mode 100644 index f8a2527616c0..000000000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp +++ /dev/null @@ -1,58 +0,0 @@ -//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64SelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-selectiondag-info" - -ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) {} - -ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {} - -SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const { - // Check to see if there is a specialized entry-point for memory zeroing. - ConstantSDNode *V = dyn_cast(Src); - ConstantSDNode *SizeValue = dyn_cast(Size); - const char *bzeroEntry = - (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; - // For small size (< 256), it is not beneficial to use bzero - // instead of memset. - if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { - const ARM64TargetLowering &TLI = *static_cast( - DAG.getTarget().getTargetLowering()); - - EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Dst; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) - .setDiscardResult(); - std::pair CallResult = TLI.LowerCallTo(CLI); - return CallResult.second; - } - return SDValue(); -} diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.h b/lib/Target/ARM64/ARM64SelectionDAGInfo.h deleted file mode 100644 index 770775fc02dc..000000000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.h +++ /dev/null @@ -1,37 +0,0 @@ -//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64 subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SELECTIONDAGINFO_H -#define ARM64SELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - -public: - explicit ARM64SelectionDAGInfo(const TargetMachine &TM); - ~ARM64SelectionDAGInfo(); - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const override; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64Subtarget.cpp b/lib/Target/ARM64/ARM64Subtarget.cpp deleted file mode 100644 index 528cfc97cbf7..000000000000 --- a/lib/Target/ARM64/ARM64Subtarget.cpp +++ /dev/null @@ -1,107 +0,0 @@ -//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64 specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-subtarget" - -#define GET_SUBTARGETINFO_CTOR -#define GET_SUBTARGETINFO_TARGET_DESC -#include "ARM64GenSubtargetInfo.inc" - -ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian) - : ARM64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - CPUString(CPU), TargetTriple(TT), IsLittleEndian(LittleEndian) { - // Determine default and user-specified characteristics - - if (CPUString.empty()) - CPUString = "generic"; - - ParseSubtargetFeatures(CPUString, FS); -} - -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. -unsigned char -ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; - - // MachO large model always goes via a GOT, simply to get a single 8-byte - // absolute relocation on all global addresses. - if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) - return ARM64II::MO_GOT; - - // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). Therefore they must use the - // GOT. 
- if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) - return ARM64II::MO_GOT; - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return (isDecl || GV->isWeakForLinker()) ? ARM64II::MO_GOT - : ARM64II::MO_NO_FLAG; - else - // No need to go through the GOT for local symbols on ELF. - return GV->hasLocalLinkage() ? ARM64II::MO_NO_FLAG : ARM64II::MO_GOT; - } - - return ARM64II::MO_NO_FLAG; -} - -/// This function returns the name of a function which has an interface -/// like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over -/// memset with zero passed as the second argument. Otherwise it -/// returns null. -const char *ARM64Subtarget::getBZeroEntry() const { - // Prefer bzero on Darwin only. - if(isTargetDarwin()) - return "bzero"; - - return nullptr; -} - -void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { - // LNT run (at least on Cyclone) showed reasonably significant gains for - // bi-directional scheduling. 253.perlbmk. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.h b/lib/Target/ARM64/ARM64Subtarget.h deleted file mode 100644 index 88b9c2e7aa3b..000000000000 --- a/lib/Target/ARM64/ARM64Subtarget.h +++ /dev/null @@ -1,108 +0,0 @@ -//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SUBTARGET_H -#define ARM64SUBTARGET_H - -#include "llvm/Target/TargetSubtargetInfo.h" -#include "ARM64RegisterInfo.h" -#include - -#define GET_SUBTARGETINFO_HEADER -#include "ARM64GenSubtargetInfo.inc" - -namespace llvm { -class GlobalValue; -class StringRef; - -class ARM64Subtarget : public ARM64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; - - /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; - - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - - // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; - - // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; - - /// CPUString - String name of used CPU. - std::string CPUString; - - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; - - /// IsLittleEndian - Is the target little endian? - bool IsLittleEndian; - -public: - /// This constructor initializes the data members to match that - /// of the specified triple. 
- ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian); - - bool enableMachineScheduler() const override { return true; } - - bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - - bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } - - bool hasFPARMv8() const { return HasFPARMv8; } - bool hasNEON() const { return HasNEON; } - bool hasCrypto() const { return HasCrypto; } - bool hasCRC() const { return HasCRC; } - - bool isLittleEndian() const { return IsLittleEndian; } - - bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } - - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - - bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - - bool isCyclone() const { return CPUString == "cyclone"; } - - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size - /// that still makes it profitable to inline the call. - unsigned getMaxInlineSizeThreshold() const { return 64; } - - /// ParseSubtargetFeatures - Parses features string setting specified - /// subtarget options. Definition of function is auto generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - /// ClassifyGlobalReference - Find the target operand flags that describe - /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; - - /// This function returns the name of a function which has an interface - /// like the non-standard bzero function, if such a function exists on - /// the current subtarget and it is considered prefereable over - /// memset with zero passed as the second argument. Otherwise it - /// returns null. - const char *getBZeroEntry() const; - - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const override; -}; -} // End llvm namespace - -#endif // ARM64SUBTARGET_H diff --git a/lib/Target/ARM64/ARM64TargetMachine.cpp b/lib/Target/ARM64/ARM64TargetMachine.cpp deleted file mode 100644 index f5c187ceb278..000000000000 --- a/lib/Target/ARM64/ARM64TargetMachine.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "llvm/PassManager.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Scalar.h" -using namespace llvm; - -static cl::opt -EnableCCMP("arm64-ccmp", cl::desc("Enable the CCMP formation pass"), - cl::init(true), cl::Hidden); - -static cl::opt -EnableEarlyIfConvert("arm64-early-ifcvt", cl::desc("Enable the early if " - "converter pass"), cl::init(true), cl::Hidden); - -static cl::opt -EnableStPairSuppress("arm64-stp-suppress", cl::desc("Suppress STP for ARM64"), - cl::init(true), cl::Hidden); - -static cl::opt -EnableAdvSIMDScalar("arm64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" - " integer instructions"), cl::init(false), cl::Hidden); - -static cl::opt -EnablePromoteConstant("arm64-promote-const", cl::desc("Enable the promote " - "constant pass"), cl::init(true), cl::Hidden); - -static cl::opt -EnableCollectLOH("arm64-collect-loh", cl::desc("Enable the pass that emits the" - " linker optimization hints (LOH)"), cl::init(true), - cl::Hidden); - -static cl::opt -EnableDeadRegisterElimination("arm64-dead-def-elimination", cl::Hidden, - cl::desc("Enable the pass that removes dead" - " definitons and replaces stores to" - " them with stores to the zero" - " register"), - cl::init(true)); - -static cl::opt -EnableLoadStoreOpt("arm64-load-store-opt", cl::desc("Enable the load/store pair" - " optimization pass"), cl::init(true), cl::Hidden); - -extern "C" void LLVMInitializeARM64Target() { - // Register the target. - RegisterTargetMachine X(TheARM64leTarget); - RegisterTargetMachine Y(TheARM64beTarget); -} - -/// TargetMachine ctor - Create an ARM64 architecture model. -/// -ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, LittleEndian), - // This nested ternary is horrible, but DL needs to be properly initialized - // before TLInfo is constructed. - DL(Subtarget.isTargetMachO() ? - "e-m:o-i64:64-i128:128-n32:64-S128" : - (LittleEndian ? - "e-m:e-i64:64-i128:128-n32:64-S128" : - "E-m:e-i64:64-i128:128-n32:64-S128")), - InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), - TSInfo(*this) { - initAsmInfo(); -} - -void ARM64leTargetMachine::anchor() { } - -ARM64leTargetMachine:: -ARM64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARM64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ARM64beTargetMachine::anchor() { } - -ARM64beTargetMachine:: -ARM64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARM64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} - -namespace { -/// ARM64 Code Generator Pass Configuration Options. 
-class ARM64PassConfig : public TargetPassConfig { -public: - ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - ARM64TargetMachine &getARM64TargetMachine() const { - return getTM(); - } - - bool addPreISel() override; - bool addInstSelector() override; - bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; -}; -} // namespace - -void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our ARM64 pass. This - // allows the ARM64 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createARM64TargetTransformInfoPass(this)); -} - -TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARM64PassConfig(this, PM); -} - -// Pass Pipeline Configuration -bool ARM64PassConfig::addPreISel() { - // Run promote constant before global merge, so that the promoted constants - // get a chance to be merged - if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) - addPass(createARM64PromoteConstantPass()); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createGlobalMergePass(TM)); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createARM64AddressTypePromotionPass()); - - // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg - // ourselves. - addPass(createAtomicExpandLoadLinkedPass(TM)); - - return false; -} - -bool ARM64PassConfig::addInstSelector() { - addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel())); - - // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many - // references to _TLS_MODULE_BASE_ as possible. - if (TM->getSubtarget().isTargetELF() && - getOptLevel() != CodeGenOpt::None) - addPass(createARM64CleanupLocalDynamicTLSPass()); - - return false; -} - -bool ARM64PassConfig::addILPOpts() { - if (EnableCCMP) - addPass(createARM64ConditionalCompares()); - if (EnableEarlyIfConvert) - addPass(&EarlyIfConverterID); - if (EnableStPairSuppress) - addPass(createARM64StorePairSuppressPass()); - return true; -} - -bool ARM64PassConfig::addPreRegAlloc() { - // Use AdvSIMD scalar instructions whenever profitable. - if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) - addPass(createARM64AdvSIMDScalar()); - return true; -} - -bool ARM64PassConfig::addPostRegAlloc() { - // Change dead register definitions to refer to the zero register. - if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) - addPass(createARM64DeadRegisterDefinitions()); - return true; -} - -bool ARM64PassConfig::addPreSched2() { - // Expand some pseudo instructions to allow proper scheduling. - addPass(createARM64ExpandPseudoPass()); - // Use load/store pair instructions when possible. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createARM64LoadStoreOptimizationPass()); - return true; -} - -bool ARM64PassConfig::addPreEmitPass() { - // Relax conditional branch instructions if they're otherwise out of - // range of their destination. 
- addPass(createARM64BranchRelaxation()); - if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && - TM->getSubtarget().isTargetMachO()) - addPass(createARM64CollectLOHPass()); - return true; -} diff --git a/lib/Target/ARM64/ARM64TargetMachine.h b/lib/Target/ARM64/ARM64TargetMachine.h deleted file mode 100644 index 730ffcaaf6d5..000000000000 --- a/lib/Target/ARM64/ARM64TargetMachine.h +++ /dev/null @@ -1,92 +0,0 @@ -//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetMachine. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETMACHINE_H -#define ARM64TARGETMACHINE_H - -#include "ARM64InstrInfo.h" -#include "ARM64ISelLowering.h" -#include "ARM64Subtarget.h" -#include "ARM64FrameLowering.h" -#include "ARM64SelectionDAGInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/MC/MCStreamer.h" - -namespace llvm { - -class ARM64TargetMachine : public LLVMTargetMachine { -protected: - ARM64Subtarget Subtarget; - -private: - const DataLayout DL; - ARM64InstrInfo InstrInfo; - ARM64TargetLowering TLInfo; - ARM64FrameLowering FrameLowering; - ARM64SelectionDAGInfo TSInfo; - -public: - ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL, - bool IsLittleEndian); - - const ARM64Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const ARM64TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - const DataLayout *getDataLayout() const override { return &DL; } - const ARM64FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const ARM64InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const ARM64RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - const ARM64SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - - // Pass Pipeline Configuration - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - - /// \brief Register ARM64 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; -}; - -// ARM64leTargetMachine - ARM64 little endian target machine. -// -class ARM64leTargetMachine : public ARM64TargetMachine { - virtual void anchor(); -public: - ARM64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -// ARM64beTargetMachine - ARM64 big endian target machine. 
-// -class ARM64beTargetMachine : public ARM64TargetMachine { - virtual void anchor(); -public: - ARM64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.cpp b/lib/Target/ARM64/ARM64TargetObjectFile.cpp deleted file mode 100644 index cde01e515dc4..000000000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetObjectFile.h" -#include "ARM64TargetMachine.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Dwarf.h" -using namespace llvm; -using namespace dwarf; - -void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference( - const GlobalValue *GV, unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, MachineModuleInfo *MMI, - MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { - const MCSymbol *Sym = TM.getSymbol(GV, Mang); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); - MCSymbol *PCSym = getContext().CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); - return MCBinaryExpr::CreateSub(Res, PC, getContext()); - } - - return TargetLoweringObjectFileMachO::getTTypeGlobalReference( - GV, Encoding, Mang, TM, MMI, Streamer); -} - -MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol( - const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, - MachineModuleInfo *MMI) const { - return TM.getSymbol(GV, Mang); -} diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.h b/lib/Target/ARM64/ARM64TargetObjectFile.h deleted file mode 100644 index 62446f94f179..000000000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ARM64TargetObjectFile.h - ARM64 Object Info -*- C++ -------------*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H -#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H - -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -namespace llvm { -class ARM64TargetMachine; - -/// This implementation is used for AArch64 ELF targets (Linux in particular). 
-class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; -}; - -/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. -class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { -public: - const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, - unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI, - MCStreamer &Streamer) const override; - - MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI) const override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp deleted file mode 100644 index ac7142f3febd..000000000000 --- a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp +++ /dev/null @@ -1,463 +0,0 @@ -//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements a TargetTransformInfo analysis pass specific to the -/// ARM64 target machine. It uses the target's detailed information to provide -/// more precise answers to certain TTI queries, while letting the target -/// independent and default TTI implementations handle the rest. -/// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -#include -using namespace llvm; - -#define DEBUG_TYPE "arm64tti" - -// Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeARM64TTIPass(PassRegistry &); -} - -namespace { - -class ARM64TTI final : public ImmutablePass, public TargetTransformInfo { - const ARM64TargetMachine *TM; - const ARM64Subtarget *ST; - const ARM64TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - ARM64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - ARM64TTI(const ARM64TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { - initializeARM64TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - unsigned getIntImmCost(int64_t Val) const; - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 32; - return 0; - } - return 31; - } - - unsigned getRegisterBitWidth(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 128; - return 0; - } - return 64; - } - - unsigned getMaximumUnrollFactor() const override { return 2; } - - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const - override; - - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const - override; - - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; - - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; - - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const - override; - - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti", - "ARM64 Target Transform Info", true, true, false) -char ARM64TTI::ID = 0; - -ImmutablePass * -llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) { - return new ARM64TTI(TM); -} - -/// \brief Calculate the cost of materializing a 64-bit value. This helper -/// method might only calculate a fraction of a larger immediate. Therefore it -/// is valid to return a cost of ZERO. -unsigned ARM64TTI::getIntImmCost(int64_t Val) const { - // Check if the immediate can be encoded within an instruction. - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, 64)) - return 0; - - if (Val < 0) - Val = ~Val; - - // Calculate how many moves we will need to materialize this constant. - unsigned LZ = countLeadingZeros((uint64_t)Val); - return (64 - LZ + 15) / 16; -} - -/// \brief Calculate the cost of materializing the given constant. -unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return ~0U; - - // Sign-extend all constants to a multiple of 64-bit. - APInt ImmVal = Imm; - if (BitSize & 0x3f) - ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); - - // Split the constant into 64-bit chunks and calculate the cost for each - // chunk. - unsigned Cost = 0; - for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { - APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); - int64_t Val = Tmp.getSExtValue(); - Cost += getIntImmCost(Val); - } - // We need at least one instruction to materialze the constant. 
- return std::max(1U, Cost); -} - -unsigned ARM64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - // There is no cost model for constants with a bit size of 0. Return TCC_Free - // here, so that constant hoisting will ignore this constant. - if (BitSize == 0) - return TCC_Free; - - unsigned ImmIdx = ~0U; - switch (Opcode) { - default: - return TCC_Free; - case Instruction::GetElementPtr: - // Always hoist the base address of a GetElementPtr. - if (Idx == 0) - return 2 * TCC_Basic; - return TCC_Free; - case Instruction::Store: - ImmIdx = 0; - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::ICmp: - ImmIdx = 1; - break; - // Always return TCC_Free for the shift value of a shift instruction. - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - if (Idx == 1) - return TCC_Free; - break; - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::IntToPtr: - case Instruction::PtrToInt: - case Instruction::BitCast: - case Instruction::PHI: - case Instruction::Call: - case Instruction::Select: - case Instruction::Ret: - case Instruction::Load: - break; - } - - if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast(TCC_Free) : Cost; - } - return ARM64TTI::getIntImmCost(Imm, Ty); -} - -unsigned ARM64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - // There is no cost model for constants with a bit size of 0. Return TCC_Free - // here, so that constant hoisting will ignore this constant. - if (BitSize == 0) - return TCC_Free; - - switch (IID) { - default: - return TCC_Free; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast(TCC_Free) : Cost; - } - break; - case Intrinsic::experimental_stackmap: - if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; - break; - case Intrinsic::experimental_patchpoint_void: - case Intrinsic::experimental_patchpoint_i64: - if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; - break; - } - return ARM64TTI::getIntImmCost(Imm, Ty); -} - -ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - if (TyWidth == 32 || TyWidth == 64) - return PSK_FastHardware; - // TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount. 
- return PSK_Software; -} - -unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); - - if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); - - static const TypeConversionCostTblEntry ConversionTbl[] = { - // LowerVectorINT_TO_FP: - { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - // LowerVectorFP_TO_INT - { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 }, - }; - - int Idx = ConvertCostTableLookup( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; - - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); -} - -unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { - assert(Val->isVectorTy() && "This must be a vector type"); - - if (Index != -1U) { - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(Val); - - // This type is legalized to a scalar type. - if (!LT.second.isVector()) - return 0; - - // The type may be split. Normalize the index to the new type. - unsigned Width = LT.second.getVectorNumElements(); - Index = Index % Width; - - // The element at index zero is already inside the vector. - if (Index == 0) - return 0; - } - - // All other insert/extracts cost this much. - return 2; -} - -unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(Ty); - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - - switch (ISD) { - default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); - case ISD::ADD: - case ISD::MUL: - case ISD::XOR: - case ISD::OR: - case ISD::AND: - // These nodes are marked as 'custom' for combining purposes only. - // We know that they are legal. See LowerAdd in ISelLowering. - return 1 * LT.first; - } -} - -unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { - // Address computations in vectorized code with non-consecutive addresses will - // likely result in more instructions compared to scalar code where the - // computation can more often be merged into the index mode. The resulting - // extra micro-ops can significantly decrease throughput. 
- unsigned NumVectorInstToHideOverhead = 10; - - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; - - // In many cases the address computation is not merged into the instruction - // addressing mode. - return 1; -} - -unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. - if (ValTy->isVectorTy() && ISD == ISD::SELECT) { - // We would need this many instructions to hide the scalarization happening. - unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry - VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } - }; - - EVT SelCondTy = TLI->getValueType(CondTy); - EVT SelValTy = TLI->getValueType(ValTy); - if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; - } - } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); -} - -unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - std::pair LT = TLI->getTypeLegalizationCost(Src); - - if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because the negative impact that has shown in - // practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there - // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; - - return LT.first * 2 * AmortizationCost; - } - - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { - // We scalarize the loads/stores because there is not v.4b register and we - // have to promote the elements to v.4h. - unsigned NumVecElts = Src->getVectorNumElements(); - unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; - // We generate 2 instructions per vector element. - return NumVectorizableInstsToAmortize * NumVecElts * 2; - } - - return LT.first; -} diff --git a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp deleted file mode 100644 index cc301f60ede8..000000000000 --- a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp +++ /dev/null @@ -1,4430 +0,0 @@ -//===-- ARM64AsmParser.cpp - Parse ARM64 assembly to MCInst instructions --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include -using namespace llvm; - -namespace { - -class ARM64Operand; - -class ARM64AsmParser : public MCTargetAsmParser { -public: - typedef SmallVectorImpl OperandVector; - -private: - StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; - MCAsmParser &Parser; - - MCAsmParser &getParser() const { return Parser; } - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - - SMLoc getLoc() const { return Parser.getTok().getLoc(); } - - bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); - ARM64CC::CondCode parseCondCodeString(StringRef Cond); - bool parseCondCode(OperandVector &Operands, bool invertCondCode); - int tryParseRegister(); - int tryMatchVectorRegister(StringRef &Kind, bool expected); - bool parseRegister(OperandVector &Operands); - bool parseMemory(OperandVector &Operands); - bool parseSymbolicImmVal(const MCExpr *&ImmVal); - bool parseVectorList(OperandVector &Operands); - bool parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode); - - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } - bool showMatchError(SMLoc Loc, unsigned ErrCode); - - bool parseDirectiveWord(unsigned Size, SMLoc L); - bool parseDirectiveTLSDescCall(SMLoc L); - - bool parseDirectiveLOH(StringRef LOH, SMLoc L); - - bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) override; -/// @name Auto-generated Match Functions -/// { - -#define GET_ASSEMBLER_HEADER -#include "ARM64GenAsmMatcher.inc" - - /// } - - OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); - OperandMatchResultTy tryParseNoIndexMemory(OperandVector &Operands); - OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); - OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); - OperandMatchResultTy tryParseSysReg(OperandVector &Operands); - OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); - OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); - OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); - OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); - OperandMatchResultTy tryParseFPImm(OperandVector &Operands); - OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); - bool tryParseVectorRegister(OperandVector &Operands); - -public: - enum ARM64MatchResultTy { - Match_InvalidSuffix = 
FIRST_TARGET_MATCH_RESULT_TY, -#define GET_OPERAND_DIAGNOSTIC_TYPES -#include "ARM64GenAsmMatcher.inc" - }; - ARM64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { - MCAsmParserExtension::Initialize(_Parser); - - // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, - unsigned Kind) override; - - static bool classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - int64_t &Addend); -}; -} // end anonymous namespace - -namespace { - -/// ARM64Operand - Instances of this class represent a parsed ARM64 machine -/// instruction. -class ARM64Operand : public MCParsedAsmOperand { -public: - enum MemIdxKindTy { - ImmediateOffset, // pre-indexed, no writeback - RegisterOffset // register offset, with optional extend - }; - -private: - enum KindTy { - k_Immediate, - k_ShiftedImm, - k_CondCode, - k_Memory, - k_Register, - k_VectorList, - k_VectorIndex, - k_Token, - k_SysReg, - k_SysCR, - k_Prefetch, - k_ShiftExtend, - k_FPImm, - k_Barrier - } Kind; - - SMLoc StartLoc, EndLoc, OffsetLoc; - - struct TokOp { - const char *Data; - unsigned Length; - bool IsSuffix; // Is the operand actually a suffix on the mnemonic. - }; - - struct RegOp { - unsigned RegNum; - bool isVector; - }; - - struct VectorListOp { - unsigned RegNum; - unsigned Count; - unsigned NumElements; - unsigned ElementKind; - }; - - struct VectorIndexOp { - unsigned Val; - }; - - struct ImmOp { - const MCExpr *Val; - }; - - struct ShiftedImmOp { - const MCExpr *Val; - unsigned ShiftAmount; - }; - - struct CondCodeOp { - ARM64CC::CondCode Code; - }; - - struct FPImmOp { - unsigned Val; // Encoded 8-bit representation. - }; - - struct BarrierOp { - unsigned Val; // Not the enum since not all values have names. - }; - - struct SysRegOp { - const char *Data; - unsigned Length; - uint64_t FeatureBits; // We need to pass through information about which - // core we are compiling for so that the SysReg - // Mappers can appropriately conditionalize. - }; - - struct SysCRImmOp { - unsigned Val; - }; - - struct PrefetchOp { - unsigned Val; - }; - - struct ShiftExtendOp { - ARM64_AM::ShiftExtendType Type; - unsigned Amount; - }; - - struct ExtendOp { - unsigned Val; - }; - - // This is for all forms of ARM64 address expressions - struct MemOp { - unsigned BaseRegNum, OffsetRegNum; - ARM64_AM::ShiftExtendType ExtType; - unsigned ShiftVal; - bool ExplicitShift; - const MCExpr *OffsetImm; - MemIdxKindTy Mode; - }; - - union { - struct TokOp Tok; - struct RegOp Reg; - struct VectorListOp VectorList; - struct VectorIndexOp VectorIndex; - struct ImmOp Imm; - struct ShiftedImmOp ShiftedImm; - struct CondCodeOp CondCode; - struct FPImmOp FPImm; - struct BarrierOp Barrier; - struct SysRegOp SysReg; - struct SysCRImmOp SysCRImm; - struct PrefetchOp Prefetch; - struct ShiftExtendOp ShiftExtend; - struct MemOp Mem; - }; - - // Keep the MCContext around as the MCExprs may need manipulated during - // the add<>Operands() calls. 
- MCContext &Ctx; - - ARM64Operand(KindTy K, MCContext &_Ctx) - : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} - -public: - ARM64Operand(const ARM64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_Token: - Tok = o.Tok; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_ShiftedImm: - ShiftedImm = o.ShiftedImm; - break; - case k_CondCode: - CondCode = o.CondCode; - break; - case k_FPImm: - FPImm = o.FPImm; - break; - case k_Barrier: - Barrier = o.Barrier; - break; - case k_Register: - Reg = o.Reg; - break; - case k_VectorList: - VectorList = o.VectorList; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - case k_SysReg: - SysReg = o.SysReg; - break; - case k_SysCR: - SysCRImm = o.SysCRImm; - break; - case k_Prefetch: - Prefetch = o.Prefetch; - break; - case k_Memory: - Mem = o.Mem; - break; - case k_ShiftExtend: - ShiftExtend = o.ShiftExtend; - break; - } - } - - /// getStartLoc - Get the location of the first token of this operand. - SMLoc getStartLoc() const override { return StartLoc; } - /// getEndLoc - Get the location of the last token of this operand. - SMLoc getEndLoc() const override { return EndLoc; } - /// getOffsetLoc - Get the location of the offset of this memory operand. - SMLoc getOffsetLoc() const { return OffsetLoc; } - - StringRef getToken() const { - assert(Kind == k_Token && "Invalid access!"); - return StringRef(Tok.Data, Tok.Length); - } - - bool isTokenSuffix() const { - assert(Kind == k_Token && "Invalid access!"); - return Tok.IsSuffix; - } - - const MCExpr *getImm() const { - assert(Kind == k_Immediate && "Invalid access!"); - return Imm.Val; - } - - const MCExpr *getShiftedImmVal() const { - assert(Kind == k_ShiftedImm && "Invalid access!"); - return ShiftedImm.Val; - } - - unsigned getShiftedImmShift() const { - assert(Kind == k_ShiftedImm && "Invalid access!"); - return ShiftedImm.ShiftAmount; - } - - ARM64CC::CondCode getCondCode() const { - assert(Kind == k_CondCode && "Invalid access!"); - return CondCode.Code; - } - - unsigned getFPImm() const { - assert(Kind == k_FPImm && "Invalid access!"); - return FPImm.Val; - } - - unsigned getBarrier() const { - assert(Kind == k_Barrier && "Invalid access!"); - return Barrier.Val; - } - - unsigned getReg() const override { - assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; - } - - unsigned getVectorListStart() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; - } - - unsigned getVectorListCount() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.Count; - } - - unsigned getVectorIndex() const { - assert(Kind == k_VectorIndex && "Invalid access!"); - return VectorIndex.Val; - } - - StringRef getSysReg() const { - assert(Kind == k_SysReg && "Invalid access!"); - return StringRef(SysReg.Data, SysReg.Length); - } - - uint64_t getSysRegFeatureBits() const { - assert(Kind == k_SysReg && "Invalid access!"); - return SysReg.FeatureBits; - } - - unsigned getSysCR() const { - assert(Kind == k_SysCR && "Invalid access!"); - return SysCRImm.Val; - } - - unsigned getPrefetch() const { - assert(Kind == k_Prefetch && "Invalid access!"); - return Prefetch.Val; - } - - ARM64_AM::ShiftExtendType getShiftExtendType() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.Type; - } - - unsigned getShiftExtendAmount() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return 
ShiftExtend.Amount; - } - - bool isImm() const override { return Kind == k_Immediate; } - bool isSImm9() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val < 256); - } - bool isSImm7s4() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val <= 252 && (Val & 3) == 0); - } - bool isSImm7s8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -512 && Val <= 504 && (Val & 7) == 0); - } - bool isSImm7s16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); - } - bool isImm0_7() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 8); - } - bool isImm1_8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 9); - } - bool isImm0_15() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 16); - } - bool isImm1_16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 17); - } - bool isImm0_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 32); - } - bool isImm1_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 32); - } - bool isImm1_32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 33); - } - bool isImm0_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 64); - } - bool isImm1_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 64); - } - bool isImm1_64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 65); - } - bool isImm0_127() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 128); - } - bool isImm0_255() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 256); - } - bool isImm0_65535() const { - if (!isImm()) - return false; - const 
MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 65536); - } - bool isImm32_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 32 && Val < 64); - } - bool isLogicalImm32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 32); - } - bool isLogicalImm64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 64); - } - bool isShiftedImm() const { return Kind == k_ShiftedImm; } - bool isAddSubImm() const { - if (!isShiftedImm() && !isImm()) - return false; - - const MCExpr *Expr; - - // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. - if (isShiftedImm()) { - unsigned Shift = ShiftedImm.ShiftAmount; - Expr = ShiftedImm.Val; - if (Shift != 0 && Shift != 12) - return false; - } else { - Expr = getImm(); - } - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, - DarwinRefKind, Addend)) { - return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF - || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF - || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) - || ELFRefKind == ARM64MCExpr::VK_LO12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_HI12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC - || ELFRefKind == ARM64MCExpr::VK_TPREL_HI12 - || ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 - || ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC - || ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(Expr); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; - } - bool isCondCode() const { return Kind == k_CondCode; } - bool isSIMDImmType10() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isAdvSIMDModImmType10(MCE->getValue()); - } - bool isBranchTarget26() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); - } - bool isPCRelLabel19() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); - } - bool isBranchTarget14() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); - } - - bool isMovWSymbol(ArrayRef AllowedModifiers) const { - if (!isImm()) - return false; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!ARM64AsmParser::classifySymbolRef(getImm(), ELFRefKind, DarwinRefKind, - Addend)) { - return false; - } - if (DarwinRefKind != MCSymbolRefExpr::VK_None) - return false; - - for (unsigned i = 0; i != 
AllowedModifiers.size(); ++i) { - if (ELFRefKind == AllowedModifiers[i]) - return Addend == 0; - } - - return false; - } - - bool isMovZSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2, - ARM64MCExpr::VK_ABS_G2_S, - ARM64MCExpr::VK_TPREL_G2, - ARM64MCExpr::VK_DTPREL_G2 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G1, - ARM64MCExpr::VK_ABS_G1_S, - ARM64MCExpr::VK_GOTTPREL_G1, - ARM64MCExpr::VK_TPREL_G1, - ARM64MCExpr::VK_DTPREL_G1, }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G0, - ARM64MCExpr::VK_ABS_G0_S, - ARM64MCExpr::VK_TPREL_G0, - ARM64MCExpr::VK_DTPREL_G0 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2_NC }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G1_NC, ARM64MCExpr::VK_TPREL_G1_NC, - ARM64MCExpr::VK_DTPREL_G1_NC - }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G0_NC, ARM64MCExpr::VK_GOTTPREL_G0_NC, - ARM64MCExpr::VK_TPREL_G0_NC, ARM64MCExpr::VK_DTPREL_G0_NC - }; - return isMovWSymbol(Variants); - } - - template - bool isMOVZMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); - - if (RegWidth == 32) - Value &= 0xffffffffULL; - - // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". - if (Value == 0 && Shift != 0) - return false; - - return (Value & ~(0xffffULL << Shift)) == 0; - } - - template - bool isMOVNMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); - - // MOVZ takes precedence over MOVN. 
- for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) - if ((Value & ~(0xffffULL << MOVZShift)) == 0) - return false; - - Value = ~Value; - if (RegWidth == 32) - Value &= 0xffffffffULL; - - return (Value & ~(0xffffULL << Shift)) == 0; - } - - bool isFPImm() const { return Kind == k_FPImm; } - bool isBarrier() const { return Kind == k_Barrier; } - bool isSysReg() const { return Kind == k_SysReg; } - bool isMRSSystemRegister() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - auto Mapper = ARM64SysReg::MRSMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isMSRSystemRegister() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - auto Mapper = ARM64SysReg::MSRMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isSystemPStateField() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - ARM64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isReg() const override { return Kind == k_Register && !Reg.isVector; } - bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } - bool isVectorRegLo() const { - return Kind == k_Register && Reg.isVector && - ARM64MCRegisterClasses[ARM64::FPR128_loRegClassID].contains(Reg.RegNum); - } - bool isGPR32as64() const { - return Kind == k_Register && !Reg.isVector && - ARM64MCRegisterClasses[ARM64::GPR64RegClassID].contains(Reg.RegNum); - } - - /// Is this a vector list with the type implicit (presumably attached to the - /// instruction itself)? - template bool isImplicitlyTypedVectorList() const { - return Kind == k_VectorList && VectorList.Count == NumRegs && - !VectorList.ElementKind; - } - - template - bool isTypedVectorList() const { - if (Kind != k_VectorList) - return false; - if (VectorList.Count != NumRegs) - return false; - if (VectorList.ElementKind != ElementKind) - return false; - return VectorList.NumElements == NumElements; - } - - bool isVectorIndex1() const { - return Kind == k_VectorIndex && VectorIndex.Val == 1; - } - bool isVectorIndexB() const { - return Kind == k_VectorIndex && VectorIndex.Val < 16; - } - bool isVectorIndexH() const { - return Kind == k_VectorIndex && VectorIndex.Val < 8; - } - bool isVectorIndexS() const { - return Kind == k_VectorIndex && VectorIndex.Val < 4; - } - bool isVectorIndexD() const { - return Kind == k_VectorIndex && VectorIndex.Val < 2; - } - bool isToken() const override { return Kind == k_Token; } - bool isTokenEqual(StringRef Str) const { - return Kind == k_Token && getToken() == Str; - } - bool isMem() const override { return Kind == k_Memory; } - bool isSysCR() const { return Kind == k_SysCR; } - bool isPrefetch() const { return Kind == k_Prefetch; } - bool isShiftExtend() const { return Kind == k_ShiftExtend; } - bool isShifter() const { - if (!isShiftExtend()) - return false; - - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR || - ST == ARM64_AM::ROR || ST == ARM64_AM::MSL); - } - bool isExtend() const { - if (!isShiftExtend()) - return false; - - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTB || ET == ARM64_AM::SXTB || - ET == ARM64_AM::UXTH || ET == ARM64_AM::SXTH || - ET == ARM64_AM::UXTW || ET == ARM64_AM::SXTW || - ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX || - ET == ARM64_AM::LSL) && - getShiftExtendAmount() <= 4; - } 
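The isMOVZMovAlias / isMOVNMovAlias predicates above implement the rule that "mov Rd, #imm" is accepted as a MOVZ alias when the constant is a single 16-bit chunk at a shift of 0, 16, 32, or 48, and as a MOVN alias only when no MOVZ form exists and the complemented value fits instead. Below is a minimal standalone sketch of that check; fitsMOVZ/fitsMOVN are illustrative names, not part of the backend.

#include <cstdint>

// Can the constant be written as a single MOVZ, i.e. one 16-bit field at the
// given shift?  A 32-bit destination only looks at the low 32 bits, and
// "#0, lsl #N" with N != 0 is rejected because plain "lsl #0" takes
// precedence (mirrors isMOVZMovAlias above).
static bool fitsMOVZ(uint64_t Value, unsigned Shift, unsigned RegWidth) {
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  if (Value == 0 && Shift != 0)
    return false;
  return (Value & ~(0xffffULL << Shift)) == 0;
}

// MOVN covers the complemented pattern, but only when no MOVZ encoding
// exists, since MOVZ takes precedence (mirrors isMOVNMovAlias above).
static bool fitsMOVN(uint64_t Value, unsigned Shift, unsigned RegWidth) {
  for (unsigned MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
    if ((Value & ~(0xffffULL << MOVZShift)) == 0)
      return false;
  Value = ~Value;
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  return (Value & ~(0xffffULL << Shift)) == 0;
}

For example, fitsMOVZ(0x12340000, 16, 64) holds, while 0xffffffffffff1234 has no MOVZ form but fitsMOVN(0xffffffffffff1234, 0, 64) holds, matching "movn x0, #0xedcb".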
-
-  bool isExtend64() const {
-    if (!isExtend())
-      return false;
-    // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
-    ARM64_AM::ShiftExtendType ET = getShiftExtendType();
-    return ET != ARM64_AM::UXTX && ET != ARM64_AM::SXTX;
-  }
-  bool isExtendLSL64() const {
-    if (!isExtend())
-      return false;
-    ARM64_AM::ShiftExtendType ET = getShiftExtendType();
-    return (ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX || ET == ARM64_AM::LSL) &&
-           getShiftExtendAmount() <= 4;
-  }
-
-  template <unsigned width>
-  bool isArithmeticShifter() const {
-    if (!isShifter())
-      return false;
-
-    // An arithmetic shifter is LSL, LSR, or ASR.
-    ARM64_AM::ShiftExtendType ST = getShiftExtendType();
-    return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR ||
-            ST == ARM64_AM::ASR) && getShiftExtendAmount() < width;
-  }
-
-  template <unsigned width>
-  bool isLogicalShifter() const {
-    if (!isShifter())
-      return false;
-
-    // A logical shifter is LSL, LSR, ASR or ROR.
-    ARM64_AM::ShiftExtendType ST = getShiftExtendType();
-    return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR ||
-            ST == ARM64_AM::ROR) &&
-           getShiftExtendAmount() < width;
-  }
-
-  bool isMovImm32Shifter() const {
-    if (!isShifter())
-      return false;
-
-    // A 32-bit MOVi shifter is LSL of 0 or 16.
-    ARM64_AM::ShiftExtendType ST = getShiftExtendType();
-    if (ST != ARM64_AM::LSL)
-      return false;
-    uint64_t Val = getShiftExtendAmount();
-    return (Val == 0 || Val == 16);
-  }
-
-  bool isMovImm64Shifter() const {
-    if (!isShifter())
-      return false;
-
-    // A 64-bit MOVi shifter is LSL of 0, 16, 32, or 48.
-    ARM64_AM::ShiftExtendType ST = getShiftExtendType();
-    if (ST != ARM64_AM::LSL)
-      return false;
-    uint64_t Val = getShiftExtendAmount();
-    return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
-  }
-
-  bool isLogicalVecShifter() const {
-    if (!isShifter())
-      return false;
-
-    // A logical vector shifter is a left shift by 0, 8, 16, or 24.
-    unsigned Shift = getShiftExtendAmount();
-    return getShiftExtendType() == ARM64_AM::LSL &&
-           (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
-  }
-
-  bool isLogicalVecHalfWordShifter() const {
-    if (!isLogicalVecShifter())
-      return false;
-
-    // A logical vector half-word shifter is a left shift by 0 or 8.
-    unsigned Shift = getShiftExtendAmount();
-    return getShiftExtendType() == ARM64_AM::LSL && (Shift == 0 || Shift == 8);
-  }
-
-  bool isMoveVecShifter() const {
-    if (!isShiftExtend())
-      return false;
-
-    // A move vector shifter is an MSL shift of 8 or 16.
-    unsigned Shift = getShiftExtendAmount();
-    return getShiftExtendType() == ARM64_AM::MSL && (Shift == 8 || Shift == 16);
-  }
-
-  bool isMemoryRegisterOffset8() const {
-    return isMem() && Mem.Mode == RegisterOffset && Mem.ShiftVal == 0;
-  }
-
-  bool isMemoryRegisterOffset16() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 1);
-  }
-
-  bool isMemoryRegisterOffset32() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 2);
-  }
-
-  bool isMemoryRegisterOffset64() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 3);
-  }
-
-  bool isMemoryRegisterOffset128() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 4);
-  }
-
-  bool isMemoryUnscaled() const {
-    if (!isMem())
-      return false;
-    if (Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    // Make sure the immediate value is valid.
- const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - if (!CE) - return false; - // The offset must fit in a signed 9-bit unscaled immediate. - int64_t Value = CE->getValue(); - return (Value >= -256 && Value < 256); - } - // Fallback unscaled operands are for aliases of LDR/STR that fall back - // to LDUR/STUR when the offset is not legal for the former but is for - // the latter. As such, in addition to checking for being a legal unscaled - // address, also check that it is not a legal scaled address. This avoids - // ambiguity in the matcher. - bool isMemoryUnscaledFB8() const { - return isMemoryUnscaled() && !isMemoryIndexed8(); - } - bool isMemoryUnscaledFB16() const { - return isMemoryUnscaled() && !isMemoryIndexed16(); - } - bool isMemoryUnscaledFB32() const { - return isMemoryUnscaled() && !isMemoryIndexed32(); - } - bool isMemoryUnscaledFB64() const { - return isMemoryUnscaled() && !isMemoryIndexed64(); - } - bool isMemoryUnscaledFB128() const { - return isMemoryUnscaled() && !isMemoryIndexed128(); - } - bool isMemoryIndexed(unsigned Scale) const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - // Make sure the immediate value is valid. - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - - if (CE) { - // The offset must be a positive multiple of the scale and in range of - // encoding with a 12-bit immediate. - int64_t Value = CE->getValue(); - return (Value >= 0 && (Value % Scale) == 0 && Value <= (4095 * Scale)); - } - - // If it's not a constant, check for some expressions we know. - const MCExpr *Expr = Mem.OffsetImm; - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, - Addend)) { - // If we don't understand the expression, assume the best and - // let the fixup and relocation code deal with it. - return true; - } - - if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_GOT_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_GOTTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) { - // Note that we don't range-check the addend. It's adjusted modulo page - // size when converted, so there is no "out of range" condition when using - // @pageoff. - return Addend >= 0 && (Addend % Scale) == 0; - } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { - // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. - return Addend == 0; - } - - return false; - } - bool isMemoryIndexed128() const { return isMemoryIndexed(16); } - bool isMemoryIndexed64() const { return isMemoryIndexed(8); } - bool isMemoryIndexed32() const { return isMemoryIndexed(4); } - bool isMemoryIndexed16() const { return isMemoryIndexed(2); } - bool isMemoryIndexed8() const { return isMemoryIndexed(1); } - bool isMemoryNoIndex() const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - - // Make sure the immediate value is valid. Only zero is allowed. 
- const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - if (!CE || CE->getValue() != 0) - return false; - return true; - } - bool isMemorySIMDNoIndex() const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - return Mem.OffsetImm == nullptr; - } - bool isMemoryIndexedSImm9() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return Value >= -256 && Value <= 255; - } - bool isMemoryIndexed32SImm7() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return ((Value % 4) == 0) && Value >= -256 && Value <= 252; - } - bool isMemoryIndexed64SImm7() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return ((Value % 8) == 0) && Value >= -512 && Value <= 504; - } - bool isMemoryIndexed128SImm7() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return ((Value % 16) == 0) && Value >= -1024 && Value <= 1008; - } - - bool isAdrpLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - if (!isImm()) - return false; - - if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (4096 * (1LL << (21 - 1))); - int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); - return (Val % 4096) == 0 && Val >= Min && Val <= Max; - } - - return true; - } - - bool isAdrLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - if (!isImm()) - return false; - - if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (1LL << (21 - 1)); - int64_t Max = ((1LL << (21 - 1)) - 1); - return Val >= Min && Val <= Max; - } - - return true; - } - - void addExpr(MCInst &Inst, const MCExpr *Expr) const { - // Add as immediates when possible. Null MCExpr = 0. 
- if (!Expr) - Inst.addOperand(MCOperand::CreateImm(0)); - else if (const MCConstantExpr *CE = dyn_cast(Expr)) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - else - Inst.addOperand(MCOperand::CreateExpr(Expr)); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addGPR32as64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::GPR64RegClassID].contains(getReg())); - - const MCRegisterInfo *RI = Ctx.getRegisterInfo(); - uint32_t Reg = RI->getRegClass(ARM64::GPR32RegClassID).getRegister( - RI->getEncodingValue(getReg())); - - Inst.addOperand(MCOperand::CreateReg(Reg)); - } - - void addVectorReg64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::FPR128RegClassID].contains(getReg())); - Inst.addOperand(MCOperand::CreateReg(ARM64::D0 + getReg() - ARM64::Q0)); - } - - void addVectorReg128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::FPR128RegClassID].contains(getReg())); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - template - void addVectorList64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::D0, ARM64::D0_D1, - ARM64::D0_D1_D2, ARM64::D0_D1_D2_D3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - template - void addVectorList128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::Q0, ARM64::Q0_Q1, - ARM64::Q0_Q1_Q2, ARM64::Q0_Q1_Q2_Q3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // If this is a pageoff symrefexpr with an addend, adjust the addend - // to be only the page-offset portion. Otherwise, just add the expr - // as-is. 
- addExpr(Inst, getImm()); - } - - void addAddSubImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - if (isShiftedImm()) { - addExpr(Inst, getShiftedImmVal()); - Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); - } else { - addExpr(Inst, getImm()); - Inst.addOperand(MCOperand::CreateImm(0)); - } - } - - void addCondCodeOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getCondCode())); - } - - void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - addExpr(Inst, getImm()); - else - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); - } - - void addAdrLabelOperands(MCInst &Inst, unsigned N) const { - addImmOperands(Inst, N); - } - - void addSImm9Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addSImm7s4Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); - } - - void addSImm7s8Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); - } - - void addSImm7s16Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); - } - - void addImm0_7Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_8Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_15Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_16Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_31Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_31Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of 
operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_32Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_127Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_255Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_65535Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm32_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 32); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 64); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid immediate operand!"); - uint64_t encoding = ARM64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addBranchTarget26Operands(MCInst &Inst, unsigned N) 
const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addFPImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getFPImm())); - } - - void addBarrierOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getBarrier())); - } - - void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - auto Mapper = ARM64SysReg::MRSMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - auto Mapper = ARM64SysReg::MSRMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - uint32_t Bits = ARM64PState::PStateMapper().fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addSysCROperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getSysCR())); - } - - void addPrefetchOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getPrefetch())); - } - - void addShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - unsigned Imm = - ARM64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addExtendOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - if (ET == 
ARM64_AM::LSL) ET = ARM64_AM::UXTW; - unsigned Imm = ARM64_AM::getArithExtendImm(ET, getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addExtend64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - if (ET == ARM64_AM::LSL) ET = ARM64_AM::UXTX; - unsigned Imm = ARM64_AM::getArithExtendImm(ET, getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - template - void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); - } - - template - void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); - } - - void addMemoryRegisterOffsetOperands(MCInst &Inst, unsigned N, bool DoShift) { - assert(N == 3 && "Invalid number of operands!"); - - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - Inst.addOperand(MCOperand::CreateReg(getXRegFromWReg(Mem.OffsetRegNum))); - unsigned ExtendImm = ARM64_AM::getMemExtendImm(Mem.ExtType, DoShift); - Inst.addOperand(MCOperand::CreateImm(ExtendImm)); - } - - void addMemoryRegisterOffset8Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ExplicitShift); - } - - void addMemoryRegisterOffset16Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 1); - } - - void addMemoryRegisterOffset32Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 2); - } - - void addMemoryRegisterOffset64Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 3); - } - - void addMemoryRegisterOffset128Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 4); - } - - void addMemoryIndexedOperands(MCInst &Inst, unsigned N, - unsigned Scale) const { - // Add the base register operand. - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - - if (!Mem.OffsetImm) { - // There isn't an offset. - Inst.addOperand(MCOperand::CreateImm(0)); - return; - } - - // Add the offset operand. - if (const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm)) { - assert(CE->getValue() % Scale == 0 && - "Offset operand must be multiple of the scale!"); - - // The MCInst offset operand doesn't include the low bits (like the - // instruction encoding). - Inst.addOperand(MCOperand::CreateImm(CE->getValue() / Scale)); - } - - // If this is a pageoff symrefexpr with an addend, the linker will - // do the scaling of the addend. - // - // Otherwise we don't know what this is, so just add the scaling divide to - // the expression and let the MC fixup evaluation code deal with it. 
- const MCExpr *Expr = Mem.OffsetImm; - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (Scale > 1 && - (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, - Addend) || - (Addend != 0 && DarwinRefKind != MCSymbolRefExpr::VK_PAGEOFF))) { - Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(Scale, Ctx), - Ctx); - } - - Inst.addOperand(MCOperand::CreateExpr(Expr)); - } - - void addMemoryUnscaledOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryUnscaled() && "Invalid number of operands!"); - // Add the base register operand. - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - - // Add the offset operand. - if (!Mem.OffsetImm) - Inst.addOperand(MCOperand::CreateImm(0)); - else { - // Only constant offsets supported. - const MCConstantExpr *CE = cast(Mem.OffsetImm); - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - } - } - - void addMemoryIndexed128Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed128() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 16); - } - - void addMemoryIndexed64Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed64() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 8); - } - - void addMemoryIndexed32Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed32() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 4); - } - - void addMemoryIndexed16Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed16() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 2); - } - - void addMemoryIndexed8Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed8() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 1); - } - - void addMemoryNoIndexOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && isMemoryNoIndex() && "Invalid number of operands!"); - // Add the base register operand (the offset is always zero, so ignore it). - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - } - - void addMemorySIMDNoIndexOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && isMemorySIMDNoIndex() && "Invalid number of operands!"); - // Add the base register operand (the offset is always zero, so ignore it). - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - } - - void addMemoryWritebackIndexedOperands(MCInst &Inst, unsigned N, - unsigned Scale) const { - assert(N == 2 && "Invalid number of operands!"); - - // Add the base register operand. - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - - // Add the offset operand. 
- int64_t Offset = 0; - if (Mem.OffsetImm) { - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - assert(CE && "Non-constant indexed offset operand!"); - Offset = CE->getValue(); - } - - if (Scale != 1) { - assert(Offset % Scale == 0 && - "Offset operand must be a multiple of the scale!"); - Offset /= Scale; - } - - Inst.addOperand(MCOperand::CreateImm(Offset)); - } - - void addMemoryIndexedSImm9Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 1); - } - - void addMemoryIndexed32SImm7Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 4); - } - - void addMemoryIndexed64SImm7Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 8); - } - - void addMemoryIndexed128SImm7Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 16); - } - - void print(raw_ostream &OS) const override; - - static ARM64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Token, Ctx); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - Op->Tok.IsSuffix = IsSuffix; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, - SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Register, Ctx); - Op->Reg.RegNum = RegNum; - Op->Reg.isVector = isVector; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorList(unsigned RegNum, unsigned Count, - unsigned NumElements, char ElementKind, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; - Op->VectorList.Count = Count; - Op->VectorList.NumElements = NumElements; - Op->VectorList.ElementKind = ElementKind; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorIndex, Ctx); - Op->VectorIndex.Val = Idx; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Immediate, Ctx); - Op->Imm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateShiftedImm(const MCExpr *Val, unsigned ShiftAmount, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_ShiftedImm, Ctx); - Op->ShiftedImm .Val = Val; - Op->ShiftedImm.ShiftAmount = ShiftAmount; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateCondCode(ARM64CC::CondCode Code, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_CondCode, Ctx); - Op->CondCode.Code = Code; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_FPImm, Ctx); - Op->FPImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Barrier, Ctx); - Op->Barrier.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateSysReg(StringRef Str, SMLoc S, - uint64_t FeatureBits, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysReg, Ctx); - Op->SysReg.Data = Str.data(); - 
Op->SysReg.Length = Str.size(); - Op->SysReg.FeatureBits = FeatureBits; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateMem(unsigned BaseRegNum, const MCExpr *Off, - SMLoc S, SMLoc E, SMLoc OffsetLoc, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx); - Op->Mem.BaseRegNum = BaseRegNum; - Op->Mem.OffsetRegNum = 0; - Op->Mem.OffsetImm = Off; - Op->Mem.ExtType = ARM64_AM::UXTX; - Op->Mem.ShiftVal = 0; - Op->Mem.ExplicitShift = false; - Op->Mem.Mode = ImmediateOffset; - Op->OffsetLoc = OffsetLoc; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateRegOffsetMem(unsigned BaseReg, unsigned OffsetReg, - ARM64_AM::ShiftExtendType ExtType, - unsigned ShiftVal, bool ExplicitShift, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx); - Op->Mem.BaseRegNum = BaseReg; - Op->Mem.OffsetRegNum = OffsetReg; - Op->Mem.OffsetImm = nullptr; - Op->Mem.ExtType = ExtType; - Op->Mem.ShiftVal = ShiftVal; - Op->Mem.ExplicitShift = ExplicitShift; - Op->Mem.Mode = RegisterOffset; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysCR, Ctx); - Op->SysCRImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Prefetch, Ctx); - Op->Prefetch.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateShiftExtend(ARM64_AM::ShiftExtendType ShOp, unsigned Val, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_ShiftExtend, Ctx); - Op->ShiftExtend.Type = ShOp; - Op->ShiftExtend.Amount = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } -}; - -} // end anonymous namespace. 
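Several of the memory predicates and add*Operands methods in the operand class above enforce the two immediate-offset encodings that LDR/STR-style instructions support: a scaled unsigned 12-bit form and an unscaled signed 9-bit form (LDUR/STUR), with the isMemoryUnscaledFB* "fallback" predicates keeping the matcher unambiguous between them. A minimal sketch of those two range checks, assuming Scale is the access size in bytes; the helper names are illustrative only.

#include <cstdint>

// Scaled form: unsigned byte offset, multiple of the access size, encoded as
// Offset / Scale in a 12-bit field (see isMemoryIndexed and
// addMemoryIndexedOperands above).
static bool fitsScaledOffset(int64_t Offset, unsigned Scale) {
  return Offset >= 0 && Offset % Scale == 0 &&
         Offset <= 4095 * (int64_t)Scale;
}

// Unscaled form: any signed 9-bit byte offset (see isMemoryUnscaled above).
// The fallback predicates accept an offset only when it is legal here and
// not legal in the scaled form, so LDR aliases fall back to LDUR/STUR
// without making the matcher ambiguous.
static bool fitsUnscaledOffset(int64_t Offset) {
  return Offset >= -256 && Offset < 256;
}

For an 8-byte access, #264 takes the scaled form (encoded as 33), #-8 falls back to the unscaled form, and #257 is rejected by both.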
- -void ARM64Operand::print(raw_ostream &OS) const { - switch (Kind) { - case k_FPImm: - OS << ""; - break; - case k_Barrier: { - bool Valid; - StringRef Name = ARM64DB::DBarrierMapper().toString(getBarrier(), Valid); - if (Valid) - OS << ""; - else - OS << ""; - break; - } - case k_Immediate: - getImm()->print(OS); - break; - case k_ShiftedImm: { - unsigned Shift = getShiftedImmShift(); - OS << "print(OS); - OS << ", lsl #" << ARM64_AM::getShiftValue(Shift) << ">"; - break; - } - case k_CondCode: - OS << ""; - break; - case k_Memory: - OS << ""; - break; - case k_Register: - OS << ""; - break; - case k_VectorList: { - OS << ""; - break; - } - case k_VectorIndex: - OS << ""; - break; - case k_SysReg: - OS << "'; - break; - case k_Token: - OS << "'" << getToken() << "'"; - break; - case k_SysCR: - OS << "c" << getSysCR(); - break; - case k_Prefetch: { - bool Valid; - StringRef Name = ARM64PRFM::PRFMMapper().toString(getPrefetch(), Valid); - if (Valid) - OS << ""; - else - OS << ""; - break; - } - case k_ShiftExtend: { - OS << "<" << ARM64_AM::getShiftExtendName(getShiftExtendType()) << " #" - << getShiftExtendAmount() << ">"; - break; - } - } -} - -/// @name Auto-generated Match Functions -/// { - -static unsigned MatchRegisterName(StringRef Name); - -/// } - -static unsigned matchVectorRegName(StringRef Name) { - return StringSwitch(Name) - .Case("v0", ARM64::Q0) - .Case("v1", ARM64::Q1) - .Case("v2", ARM64::Q2) - .Case("v3", ARM64::Q3) - .Case("v4", ARM64::Q4) - .Case("v5", ARM64::Q5) - .Case("v6", ARM64::Q6) - .Case("v7", ARM64::Q7) - .Case("v8", ARM64::Q8) - .Case("v9", ARM64::Q9) - .Case("v10", ARM64::Q10) - .Case("v11", ARM64::Q11) - .Case("v12", ARM64::Q12) - .Case("v13", ARM64::Q13) - .Case("v14", ARM64::Q14) - .Case("v15", ARM64::Q15) - .Case("v16", ARM64::Q16) - .Case("v17", ARM64::Q17) - .Case("v18", ARM64::Q18) - .Case("v19", ARM64::Q19) - .Case("v20", ARM64::Q20) - .Case("v21", ARM64::Q21) - .Case("v22", ARM64::Q22) - .Case("v23", ARM64::Q23) - .Case("v24", ARM64::Q24) - .Case("v25", ARM64::Q25) - .Case("v26", ARM64::Q26) - .Case("v27", ARM64::Q27) - .Case("v28", ARM64::Q28) - .Case("v29", ARM64::Q29) - .Case("v30", ARM64::Q30) - .Case("v31", ARM64::Q31) - .Default(0); -} - -static bool isValidVectorKind(StringRef Name) { - return StringSwitch(Name.lower()) - .Case(".8b", true) - .Case(".16b", true) - .Case(".4h", true) - .Case(".8h", true) - .Case(".2s", true) - .Case(".4s", true) - .Case(".1d", true) - .Case(".2d", true) - .Case(".1q", true) - // Accept the width neutral ones, too, for verbose syntax. If those - // aren't used in the right places, the token operand won't match so - // all will work out. - .Case(".b", true) - .Case(".h", true) - .Case(".s", true) - .Case(".d", true) - .Default(false); -} - -static void parseValidVectorKind(StringRef Name, unsigned &NumElements, - char &ElementKind) { - assert(isValidVectorKind(Name)); - - ElementKind = Name.lower()[Name.size() - 1]; - NumElements = 0; - - if (Name.size() == 2) - return; - - // Parse the lane count - Name = Name.drop_front(); - while (isdigit(Name.front())) { - NumElements = 10 * NumElements + (Name.front() - '0'); - Name = Name.drop_front(); - } -} - -bool ARM64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) { - StartLoc = getLoc(); - RegNo = tryParseRegister(); - EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); - return (RegNo == (unsigned)-1); -} - -/// tryParseRegister - Try to parse a register name. 
The token must be an -/// Identifier when called, and if it is a register name the token is eaten and -/// the register is added to the operand list. -int ARM64AsmParser::tryParseRegister() { - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); - // Also handle a few aliases of registers. - if (RegNum == 0) - RegNum = StringSwitch(lowerCase) - .Case("fp", ARM64::FP) - .Case("lr", ARM64::LR) - .Case("x31", ARM64::XZR) - .Case("w31", ARM64::WZR) - .Default(0); - - if (RegNum == 0) - return -1; - - Parser.Lex(); // Eat identifier token. - return RegNum; -} - -/// tryMatchVectorRegister - Try to parse a vector register name with optional -/// kind specifier. If it is a register specifier, eat the token and return it. -int ARM64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { - if (Parser.getTok().isNot(AsmToken::Identifier)) { - TokError("vector register expected"); - return -1; - } - - StringRef Name = Parser.getTok().getString(); - // If there is a kind specifier, it's separated from the register name by - // a '.'. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchVectorRegName(Head); - if (RegNum) { - if (Next != StringRef::npos) { - Kind = Name.slice(Next, StringRef::npos); - if (!isValidVectorKind(Kind)) { - TokError("invalid vector kind qualifier"); - return -1; - } - } - Parser.Lex(); // Eat the register token. - return RegNum; - } - - if (expected) - TokError("vector register expected"); - return -1; -} - -/// tryParseSysCROperand - Try to parse a system instruction CR operand name. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysCROperand(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - StringRef Tok = Parser.getTok().getIdentifier(); - if (Tok[0] != 'c' && Tok[0] != 'C') { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - uint32_t CRNum; - bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); - if (BadNum || CRNum > 15) { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); - return MatchOperand_Success; -} - -/// tryParsePrefetch - Try to parse a prefetch operand. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParsePrefetch(OperandVector &Operands) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - // Either an identifier for named values or a 5-bit immediate. - bool Hash = Tok.is(AsmToken::Hash); - if (Hash || Tok.is(AsmToken::Integer)) { - if (Hash) - Parser.Lex(); // Eat hash token. 
- const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for prefetch operand"); - return MatchOperand_ParseFail; - } - unsigned prfop = MCE->getValue(); - if (prfop > 31) { - TokError("prefetch operand out of range, [0,31] expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - bool Valid; - unsigned prfop = ARM64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); - if (!Valid) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; -} - -/// tryParseAdrpLabel - Parse and validate a source label for the ADRP -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - } - - if (parseSymbolicImmVal(Expr)) - return MatchOperand_ParseFail; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - if (DarwinRefKind == MCSymbolRefExpr::VK_None && - ELFRefKind == ARM64MCExpr::VK_INVALID) { - // No modifier was specified at all; this is the syntax for an ELF basic - // ADRP relocation (unfortunately). - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_ABS_PAGE, getContext()); - } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && - Addend != 0) { - Error(S, "gotpage label reference not allowed an addend"); - return MatchOperand_ParseFail; - } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && - DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && - DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && - ELFRefKind != ARM64MCExpr::VK_GOT_PAGE && - ELFRefKind != ARM64MCExpr::VK_GOTTPREL_PAGE && - ELFRefKind != ARM64MCExpr::VK_TLSDESC_PAGE) { - // The operand must be an @page or @gotpage qualified symbolref. - Error(S, "page or gotpage label reference expected"); - return MatchOperand_ParseFail; - } - } - - // We have either a label reference possibly with addend or an immediate. The - // addend is a raw value here. The linker will adjust it to only reference the - // page. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseAdrLabel - Parse and validate a source label for the ADR -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - } - - if (getParser().parseExpression(Expr)) - return MatchOperand_ParseFail; - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseFPImm - A floating point immediate expression operand. 
-ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseFPImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - bool Hash = false; - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat '#' - Hash = true; - } - - // Handle negation, as that still comes through as a separate token. - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - Parser.Lex(); - } - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - int Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - Parser.Lex(); // Eat the token. - // Check for out of range values. As an exception, we let Zero through, - // as we handle that special case in post-processing before matching in - // order to use the zero register for it. - if (Val == -1 && !RealVal.isZero()) { - TokError("expected compatible register or floating-point constant"); - return MatchOperand_ParseFail; - } - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - if (Tok.is(AsmToken::Integer)) { - int64_t Val; - if (!isNegative && Tok.getString().startswith("0x")) { - Val = Tok.getIntVal(); - if (Val > 255 || Val < 0) { - TokError("encoded floating point value out of range"); - return MatchOperand_ParseFail; - } - } else { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - } - Parser.Lex(); // Eat the token. - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - - if (!Hash) - return MatchOperand_NoMatch; - - TokError("invalid floating point immediate"); - return MatchOperand_ParseFail; -} - -/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAddSubImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat '#' - else if (Parser.getTok().isNot(AsmToken::Integer)) - // Operand should start from # or should be integer, emit error otherwise. - return MatchOperand_NoMatch; - - const MCExpr *Imm; - if (parseSymbolicImmVal(Imm)) - return MatchOperand_ParseFail; - else if (Parser.getTok().isNot(AsmToken::Comma)) { - uint64_t ShiftAmount = 0; - const MCConstantExpr *MCE = dyn_cast(Imm); - if (MCE) { - int64_t Val = MCE->getValue(); - if (Val > 0xfff && (Val & 0xfff) == 0) { - Imm = MCConstantExpr::Create(Val >> 12, getContext()); - ShiftAmount = 12; - } - } - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, - getContext())); - return MatchOperand_Success; - } - - // Eat ',' - Parser.Lex(); - - // The optional operand must be "lsl #N" where N is non-negative. 
- if (!Parser.getTok().is(AsmToken::Identifier) || - !Parser.getTok().getIdentifier().equals_lower("lsl")) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - - // Eat 'lsl' - Parser.Lex(); - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); - } - - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - - int64_t ShiftAmount = Parser.getTok().getIntVal(); - - if (ShiftAmount < 0) { - Error(Parser.getTok().getLoc(), "positive shift amount required"); - return MatchOperand_ParseFail; - } - Parser.Lex(); // Eat the number - - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateShiftedImm(Imm, ShiftAmount, - S, E, getContext())); - return MatchOperand_Success; -} - -/// parseCondCodeString - Parse a Condition Code string. -ARM64CC::CondCode ARM64AsmParser::parseCondCodeString(StringRef Cond) { - ARM64CC::CondCode CC = StringSwitch(Cond.lower()) - .Case("eq", ARM64CC::EQ) - .Case("ne", ARM64CC::NE) - .Case("cs", ARM64CC::HS) - .Case("hs", ARM64CC::HS) - .Case("cc", ARM64CC::LO) - .Case("lo", ARM64CC::LO) - .Case("mi", ARM64CC::MI) - .Case("pl", ARM64CC::PL) - .Case("vs", ARM64CC::VS) - .Case("vc", ARM64CC::VC) - .Case("hi", ARM64CC::HI) - .Case("ls", ARM64CC::LS) - .Case("ge", ARM64CC::GE) - .Case("lt", ARM64CC::LT) - .Case("gt", ARM64CC::GT) - .Case("le", ARM64CC::LE) - .Case("al", ARM64CC::AL) - .Case("nv", ARM64CC::NV) - .Default(ARM64CC::Invalid); - return CC; -} - -/// parseCondCode - Parse a Condition Code operand. -bool ARM64AsmParser::parseCondCode(OperandVector &Operands, - bool invertCondCode) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - StringRef Cond = Tok.getString(); - ARM64CC::CondCode CC = parseCondCodeString(Cond); - if (CC == ARM64CC::Invalid) - return TokError("invalid condition code"); - Parser.Lex(); // Eat identifier token. - - if (invertCondCode) - CC = ARM64CC::getInvertedCondCode(ARM64CC::CondCode(CC)); - - Operands.push_back( - ARM64Operand::CreateCondCode(CC, S, getLoc(), getContext())); - return false; -} - -/// tryParseOptionalShift - Some operands take an optional shift argument. Parse -/// them if present. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - std::string LowerID = Tok.getString().lower(); - ARM64_AM::ShiftExtendType ShOp = - StringSwitch(LowerID) - .Case("lsl", ARM64_AM::LSL) - .Case("lsr", ARM64_AM::LSR) - .Case("asr", ARM64_AM::ASR) - .Case("ror", ARM64_AM::ROR) - .Case("msl", ARM64_AM::MSL) - .Case("uxtb", ARM64_AM::UXTB) - .Case("uxth", ARM64_AM::UXTH) - .Case("uxtw", ARM64_AM::UXTW) - .Case("uxtx", ARM64_AM::UXTX) - .Case("sxtb", ARM64_AM::SXTB) - .Case("sxth", ARM64_AM::SXTH) - .Case("sxtw", ARM64_AM::SXTW) - .Case("sxtx", ARM64_AM::SXTX) - .Default(ARM64_AM::InvalidShiftExtend); - - if (ShOp == ARM64_AM::InvalidShiftExtend) - return MatchOperand_NoMatch; - - SMLoc S = Tok.getLoc(); - Parser.Lex(); - - bool Hash = getLexer().is(AsmToken::Hash); - if (!Hash && getLexer().isNot(AsmToken::Integer)) { - if (ShOp == ARM64_AM::LSL || ShOp == ARM64_AM::LSR || - ShOp == ARM64_AM::ASR || ShOp == ARM64_AM::ROR || - ShOp == ARM64_AM::MSL) { - // We expect a number here. 
- TokError("expected #imm after shift specifier"); - return MatchOperand_ParseFail; - } - - // "extend" type operatoins don't need an immediate, #0 is implicit. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateShiftExtend(ShOp, 0, S, E, getContext())); - return MatchOperand_Success; - } - - if (Hash) - Parser.Lex(); // Eat the '#'. - - // Make sure we do actually have a number - if (!Parser.getTok().is(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), - "expected integer shift amount"); - return MatchOperand_ParseFail; - } - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("expected #imm after shift specifier"); - return MatchOperand_ParseFail; - } - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateShiftExtend(ShOp, MCE->getValue(), S, - E, getContext())); - return MatchOperand_Success; -} - -/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for -/// the SYS instruction. Parse them specially so that we create a SYS MCInst. -bool ARM64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - if (Name.find('.') != StringRef::npos) - return TokError("invalid operand"); - - Mnemonic = Name; - Operands.push_back( - ARM64Operand::CreateToken("sys", false, NameLoc, getContext())); - - const AsmToken &Tok = Parser.getTok(); - StringRef Op = Tok.getString(); - SMLoc S = Tok.getLoc(); - - const MCExpr *Expr = nullptr; - -#define SYS_ALIAS(op1, Cn, Cm, op2) \ - do { \ - Expr = MCConstantExpr::Create(op1, getContext()); \ - Operands.push_back( \ - ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - Operands.push_back( \ - ARM64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ - Operands.push_back( \ - ARM64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ - Expr = MCConstantExpr::Create(op2, getContext()); \ - Operands.push_back( \ - ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (0) - - if (Mnemonic == "ic") { - if (!Op.compare_lower("ialluis")) { - // SYS #0, C7, C1, #0 - SYS_ALIAS(0, 7, 1, 0); - } else if (!Op.compare_lower("iallu")) { - // SYS #0, C7, C5, #0 - SYS_ALIAS(0, 7, 5, 0); - } else if (!Op.compare_lower("ivau")) { - // SYS #3, C7, C5, #1 - SYS_ALIAS(3, 7, 5, 1); - } else { - return TokError("invalid operand for IC instruction"); - } - } else if (Mnemonic == "dc") { - if (!Op.compare_lower("zva")) { - // SYS #3, C7, C4, #1 - SYS_ALIAS(3, 7, 4, 1); - } else if (!Op.compare_lower("ivac")) { - // SYS #3, C7, C6, #1 - SYS_ALIAS(0, 7, 6, 1); - } else if (!Op.compare_lower("isw")) { - // SYS #0, C7, C6, #2 - SYS_ALIAS(0, 7, 6, 2); - } else if (!Op.compare_lower("cvac")) { - // SYS #3, C7, C10, #1 - SYS_ALIAS(3, 7, 10, 1); - } else if (!Op.compare_lower("csw")) { - // SYS #0, C7, C10, #2 - SYS_ALIAS(0, 7, 10, 2); - } else if (!Op.compare_lower("cvau")) { - // SYS #3, C7, C11, #1 - SYS_ALIAS(3, 7, 11, 1); - } else if (!Op.compare_lower("civac")) { - // SYS #3, C7, C14, #1 - SYS_ALIAS(3, 7, 14, 1); - } else if (!Op.compare_lower("cisw")) { - // SYS #0, C7, C14, #2 - SYS_ALIAS(0, 7, 14, 2); - } else { - return TokError("invalid operand for DC instruction"); - } - } else if (Mnemonic == "at") { - if (!Op.compare_lower("s1e1r")) { - // SYS #0, C7, C8, #0 - SYS_ALIAS(0, 7, 8, 0); - } else if (!Op.compare_lower("s1e2r")) { - // SYS #4, C7, C8, #0 - SYS_ALIAS(4, 
7, 8, 0); - } else if (!Op.compare_lower("s1e3r")) { - // SYS #6, C7, C8, #0 - SYS_ALIAS(6, 7, 8, 0); - } else if (!Op.compare_lower("s1e1w")) { - // SYS #0, C7, C8, #1 - SYS_ALIAS(0, 7, 8, 1); - } else if (!Op.compare_lower("s1e2w")) { - // SYS #4, C7, C8, #1 - SYS_ALIAS(4, 7, 8, 1); - } else if (!Op.compare_lower("s1e3w")) { - // SYS #6, C7, C8, #1 - SYS_ALIAS(6, 7, 8, 1); - } else if (!Op.compare_lower("s1e0r")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 2); - } else if (!Op.compare_lower("s1e0w")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 3); - } else if (!Op.compare_lower("s12e1r")) { - // SYS #4, C7, C8, #4 - SYS_ALIAS(4, 7, 8, 4); - } else if (!Op.compare_lower("s12e1w")) { - // SYS #4, C7, C8, #5 - SYS_ALIAS(4, 7, 8, 5); - } else if (!Op.compare_lower("s12e0r")) { - // SYS #4, C7, C8, #6 - SYS_ALIAS(4, 7, 8, 6); - } else if (!Op.compare_lower("s12e0w")) { - // SYS #4, C7, C8, #7 - SYS_ALIAS(4, 7, 8, 7); - } else { - return TokError("invalid operand for AT instruction"); - } - } else if (Mnemonic == "tlbi") { - if (!Op.compare_lower("vmalle1is")) { - // SYS #0, C8, C3, #0 - SYS_ALIAS(0, 8, 3, 0); - } else if (!Op.compare_lower("alle2is")) { - // SYS #4, C8, C3, #0 - SYS_ALIAS(4, 8, 3, 0); - } else if (!Op.compare_lower("alle3is")) { - // SYS #6, C8, C3, #0 - SYS_ALIAS(6, 8, 3, 0); - } else if (!Op.compare_lower("vae1is")) { - // SYS #0, C8, C3, #1 - SYS_ALIAS(0, 8, 3, 1); - } else if (!Op.compare_lower("vae2is")) { - // SYS #4, C8, C3, #1 - SYS_ALIAS(4, 8, 3, 1); - } else if (!Op.compare_lower("vae3is")) { - // SYS #6, C8, C3, #1 - SYS_ALIAS(6, 8, 3, 1); - } else if (!Op.compare_lower("aside1is")) { - // SYS #0, C8, C3, #2 - SYS_ALIAS(0, 8, 3, 2); - } else if (!Op.compare_lower("vaae1is")) { - // SYS #0, C8, C3, #3 - SYS_ALIAS(0, 8, 3, 3); - } else if (!Op.compare_lower("alle1is")) { - // SYS #4, C8, C3, #4 - SYS_ALIAS(4, 8, 3, 4); - } else if (!Op.compare_lower("vale1is")) { - // SYS #0, C8, C3, #5 - SYS_ALIAS(0, 8, 3, 5); - } else if (!Op.compare_lower("vaale1is")) { - // SYS #0, C8, C3, #7 - SYS_ALIAS(0, 8, 3, 7); - } else if (!Op.compare_lower("vmalle1")) { - // SYS #0, C8, C7, #0 - SYS_ALIAS(0, 8, 7, 0); - } else if (!Op.compare_lower("alle2")) { - // SYS #4, C8, C7, #0 - SYS_ALIAS(4, 8, 7, 0); - } else if (!Op.compare_lower("vale2is")) { - // SYS #4, C8, C3, #5 - SYS_ALIAS(4, 8, 3, 5); - } else if (!Op.compare_lower("vale3is")) { - // SYS #6, C8, C3, #5 - SYS_ALIAS(6, 8, 3, 5); - } else if (!Op.compare_lower("alle3")) { - // SYS #6, C8, C7, #0 - SYS_ALIAS(6, 8, 7, 0); - } else if (!Op.compare_lower("vae1")) { - // SYS #0, C8, C7, #1 - SYS_ALIAS(0, 8, 7, 1); - } else if (!Op.compare_lower("vae2")) { - // SYS #4, C8, C7, #1 - SYS_ALIAS(4, 8, 7, 1); - } else if (!Op.compare_lower("vae3")) { - // SYS #6, C8, C7, #1 - SYS_ALIAS(6, 8, 7, 1); - } else if (!Op.compare_lower("aside1")) { - // SYS #0, C8, C7, #2 - SYS_ALIAS(0, 8, 7, 2); - } else if (!Op.compare_lower("vaae1")) { - // SYS #0, C8, C7, #3 - SYS_ALIAS(0, 8, 7, 3); - } else if (!Op.compare_lower("alle1")) { - // SYS #4, C8, C7, #4 - SYS_ALIAS(4, 8, 7, 4); - } else if (!Op.compare_lower("vale1")) { - // SYS #0, C8, C7, #5 - SYS_ALIAS(0, 8, 7, 5); - } else if (!Op.compare_lower("vale2")) { - // SYS #4, C8, C7, #5 - SYS_ALIAS(4, 8, 7, 5); - } else if (!Op.compare_lower("vale3")) { - // SYS #6, C8, C7, #5 - SYS_ALIAS(6, 8, 7, 5); - } else if (!Op.compare_lower("vaale1")) { - // SYS #0, C8, C7, #7 - SYS_ALIAS(0, 8, 7, 7); - } else if (!Op.compare_lower("ipas2e1")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 4, 1); - } 
else if (!Op.compare_lower("ipas2le1")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 4, 5); - } else if (!Op.compare_lower("ipas2e1is")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 0, 1); - } else if (!Op.compare_lower("ipas2le1is")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 0, 5); - } else if (!Op.compare_lower("vmalls12e1")) { - // SYS #4, C8, C7, #6 - SYS_ALIAS(4, 8, 7, 6); - } else if (!Op.compare_lower("vmalls12e1is")) { - // SYS #4, C8, C3, #6 - SYS_ALIAS(4, 8, 3, 6); - } else { - return TokError("invalid operand for TLBI instruction"); - } - } - -#undef SYS_ALIAS - - Parser.Lex(); // Eat operand. - - bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); - bool HasRegister = false; - - // Check for the optional register operand. - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat comma. - - if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) - return TokError("expected register operand"); - - HasRegister = true; - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - Parser.eatToEndOfStatement(); - return TokError("unexpected token in argument list"); - } - - if (ExpectRegister && !HasRegister) { - return TokError("specified " + Mnemonic + " op requires a register"); - } - else if (!ExpectRegister && HasRegister) { - return TokError("specified " + Mnemonic + " op does not use a register"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - // Can be either a #imm style literal or an option name - bool Hash = Tok.is(AsmToken::Hash); - if (Hash || Tok.is(AsmToken::Integer)) { - // Immediate operand. - if (Hash) - Parser.Lex(); // Eat the '#' - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - Error(ExprLoc, "immediate value expected for barrier operand"); - return MatchOperand_ParseFail; - } - if (MCE->getValue() < 0 || MCE->getValue() > 15) { - Error(ExprLoc, "barrier operand out of range"); - return MatchOperand_ParseFail; - } - Operands.push_back( - ARM64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("invalid operand for instruction"); - return MatchOperand_ParseFail; - } - - bool Valid; - unsigned Opt = ARM64DB::DBarrierMapper().fromString(Tok.getString(), Valid); - if (!Valid) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != ARM64DB::SY) { - TokError("'sy' or #imm operand expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreateBarrier(Opt, getLoc(), getContext())); - Parser.Lex(); // Consume the option - - return MatchOperand_Success; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysReg(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - if (Tok.isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - Operands.push_back(ARM64Operand::CreateSysReg(Tok.getString(), getLoc(), - STI.getFeatureBits(), getContext())); - Parser.Lex(); // Eat identifier - - return MatchOperand_Success; -} - -/// tryParseVectorRegister - Parse a vector register operand. 
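// [Editor's note] Illustrative sketch, not part of the patch. parseSysAlias
// above expands IC/DC/AT/TLBI mnemonics into a plain SYS instruction with
// (op1, Cn, Cm, op2) operands; e.g. "ic ivau, x0" becomes
// "sys #3, c7, c5, #1, x0", and ops whose name contains "all" take no
// register. The tiny table below repeats a few of the IC mappings hard-coded
// above; the struct and function names are hypothetical.
#include <cassert>
#include <cstring>

struct SysAlias { const char *Name; unsigned Op1, Cn, Cm, Op2; bool NeedsReg; };

static const SysAlias ICAliases[] = {
  {"ialluis", 0, 7, 1, 0, false}, // SYS #0, C7, C1, #0
  {"iallu",   0, 7, 5, 0, false}, // SYS #0, C7, C5, #0
  {"ivau",    3, 7, 5, 1, true},  // SYS #3, C7, C5, #1, Xt
};

static const SysAlias *lookupIC(const char *Op) {
  for (const SysAlias &A : ICAliases)
    if (std::strcmp(A.Name, Op) == 0)
      return &A;
  return nullptr; // "invalid operand for IC instruction"
}

int main() {
  const SysAlias *A = lookupIC("ivau");
  assert(A && A->Op1 == 3 && A->Cn == 7 && A->Cm == 5 && A->Op2 == 1);
  assert(A->NeedsReg); // name does not contain "all", so a register follows
  return 0;
}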
-bool ARM64AsmParser::tryParseVectorRegister(OperandVector &Operands) { - if (Parser.getTok().isNot(AsmToken::Identifier)) - return true; - - SMLoc S = getLoc(); - // Check for a vector register specifier first. - StringRef Kind; - int64_t Reg = tryMatchVectorRegister(Kind, false); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); - // If there was an explicit qualifier, that goes on as a literal text - // operand. - if (!Kind.empty()) - Operands.push_back(ARM64Operand::CreateToken(Kind, false, S, getContext())); - - // If there is an index specifier following the register, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - - return false; -} - -/// parseRegister - Parse a non-vector register operand. -bool ARM64AsmParser::parseRegister(OperandVector &Operands) { - SMLoc S = getLoc(); - // Try for a vector register. - if (!tryParseVectorRegister(Operands)) - return false; - - // Try for a scalar register. - int64_t Reg = tryParseRegister(); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); - - // A small number of instructions (FMOVXDhighr, for example) have "[1]" - // as a string token in the instruction itself. - if (getLexer().getKind() == AsmToken::LBrac) { - SMLoc LBracS = getLoc(); - Parser.Lex(); - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - SMLoc IntS = getLoc(); - int64_t Val = Tok.getIntVal(); - if (Val == 1) { - Parser.Lex(); - if (getLexer().getKind() == AsmToken::RBrac) { - SMLoc RBracS = getLoc(); - Parser.Lex(); - Operands.push_back( - ARM64Operand::CreateToken("[", false, LBracS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("1", false, IntS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("]", false, RBracS, getContext())); - return false; - } - } - } - } - - return false; -} - -/// tryParseNoIndexMemory - Custom parser method for memory operands that -/// do not allow base regisrer writeback modes, -/// or those that handle writeback separately from -/// the memory operand (like the AdvSIMD ldX/stX -/// instructions. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseNoIndexMemory(OperandVector &Operands) { - if (Parser.getTok().isNot(AsmToken::LBrac)) - return MatchOperand_NoMatch; - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const AsmToken &BaseRegTok = Parser.getTok(); - if (BaseRegTok.isNot(AsmToken::Identifier)) { - Error(BaseRegTok.getLoc(), "register expected"); - return MatchOperand_ParseFail; - } - - int64_t Reg = tryParseRegister(); - if (Reg == -1) { - Error(BaseRegTok.getLoc(), "register expected"); - return MatchOperand_ParseFail; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat right bracket token. 
- - Operands.push_back(ARM64Operand::CreateMem(Reg, nullptr, S, E, E, getContext())); - return MatchOperand_Success; -} - -/// parseMemory - Parse a memory operand for a basic load/store instruction. -bool ARM64AsmParser::parseMemory(OperandVector &Operands) { - assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a Left Bracket"); - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const AsmToken &BaseRegTok = Parser.getTok(); - SMLoc BaseRegLoc = BaseRegTok.getLoc(); - if (BaseRegTok.isNot(AsmToken::Identifier)) - return Error(BaseRegLoc, "register expected"); - - int64_t Reg = tryParseRegister(); - if (Reg == -1) - return Error(BaseRegLoc, "register expected"); - - if (!ARM64MCRegisterClasses[ARM64::GPR64spRegClassID].contains(Reg)) - return Error(BaseRegLoc, "invalid operand for instruction"); - - // If there is an offset expression, parse it. - const MCExpr *OffsetExpr = nullptr; - SMLoc OffsetLoc; - if (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - OffsetLoc = getLoc(); - - // Register offset - const AsmToken &OffsetRegTok = Parser.getTok(); - int Reg2 = OffsetRegTok.is(AsmToken::Identifier) ? tryParseRegister() : -1; - if (Reg2 != -1) { - // Default shift is LSL, with an omitted shift. We use the third bit of - // the extend value to indicate presence/omission of the immediate offset. - ARM64_AM::ShiftExtendType ExtOp = ARM64_AM::UXTX; - int64_t ShiftVal = 0; - bool ExplicitShift = false; - - if (Parser.getTok().is(AsmToken::Comma)) { - // Embedded extend operand. - Parser.Lex(); // Eat the comma - - SMLoc ExtLoc = getLoc(); - const AsmToken &Tok = Parser.getTok(); - ExtOp = StringSwitch(Tok.getString().lower()) - .Case("uxtw", ARM64_AM::UXTW) - .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX - .Case("sxtw", ARM64_AM::SXTW) - .Case("sxtx", ARM64_AM::SXTX) - .Default(ARM64_AM::InvalidShiftExtend); - if (ExtOp == ARM64_AM::InvalidShiftExtend) - return Error(ExtLoc, "expected valid extend operation"); - - Parser.Lex(); // Eat the extend op. - - // A 32-bit offset register is only valid for [SU]/XTW extend - // operators. - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains(Reg2)) { - if (ExtOp != ARM64_AM::UXTW && - ExtOp != ARM64_AM::SXTW) - return Error(ExtLoc, "32-bit general purpose offset register " - "requires sxtw or uxtw extend"); - } else if (!ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Reg2)) - return Error(OffsetLoc, - "64-bit general purpose offset register expected"); - - bool Hash = getLexer().is(AsmToken::Hash); - if (getLexer().is(AsmToken::RBrac)) { - // No immediate operand. - if (ExtOp == ARM64_AM::UXTX) - return Error(ExtLoc, "LSL extend requires immediate operand"); - } else if (Hash || getLexer().is(AsmToken::Integer)) { - // Immediate operand. - if (Hash) - Parser.Lex(); // Eat the '#' - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return true; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) - return TokError("immediate value expected for extend operand"); - - ExplicitShift = true; - ShiftVal = MCE->getValue(); - if (ShiftVal < 0 || ShiftVal > 4) - return Error(ExprLoc, "immediate operand out of range"); - } else - return Error(getLoc(), "expected immediate operand"); - } - - if (Parser.getTok().isNot(AsmToken::RBrac)) - return Error(getLoc(), "']' expected"); - - Parser.Lex(); // Eat right bracket token. 
- - SMLoc E = getLoc(); - Operands.push_back(ARM64Operand::CreateRegOffsetMem( - Reg, Reg2, ExtOp, ShiftVal, ExplicitShift, S, E, getContext())); - return false; - - // Immediate expressions. - } else if (Parser.getTok().is(AsmToken::Hash) || - Parser.getTok().is(AsmToken::Colon) || - Parser.getTok().is(AsmToken::Integer)) { - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat hash token. - - if (parseSymbolicImmVal(OffsetExpr)) - return true; - } else { - // FIXME: We really should make sure that we're dealing with a LDR/STR - // instruction that can legally have a symbolic expression here. - // Symbol reference. - if (Parser.getTok().isNot(AsmToken::Identifier) && - Parser.getTok().isNot(AsmToken::String)) - return Error(getLoc(), "identifier or immediate expression expected"); - if (getParser().parseExpression(OffsetExpr)) - return true; - // If this is a plain ref, Make sure a legal variant kind was specified. - // Otherwise, it's a more complicated expression and we have to just - // assume it's OK and let the relocation stuff puke if it's not. - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (classifySymbolRef(OffsetExpr, ELFRefKind, DarwinRefKind, Addend) && - Addend == 0) { - assert(ELFRefKind == ARM64MCExpr::VK_INVALID && - "ELF symbol modifiers not supported here yet"); - - switch (DarwinRefKind) { - default: - return Error(getLoc(), "expected @pageoff or @gotpageoff modifier"); - case MCSymbolRefExpr::VK_GOTPAGEOFF: - case MCSymbolRefExpr::VK_PAGEOFF: - case MCSymbolRefExpr::VK_TLVPPAGEOFF: - // These are what we're expecting. - break; - } - } - } - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) - return Error(E, "']' expected"); - - Parser.Lex(); // Eat right bracket token. - - // Create the memory operand. - Operands.push_back( - ARM64Operand::CreateMem(Reg, OffsetExpr, S, E, OffsetLoc, getContext())); - - // Check for a '!', indicating pre-indexed addressing with writeback. - if (Parser.getTok().is(AsmToken::Exclaim)) { - // There needs to have been an immediate or wback doesn't make sense. - if (!OffsetExpr) - return Error(E, "missing offset for pre-indexed addressing"); - // Pre-indexed with writeback must have a constant expression for the - // offset. FIXME: Theoretically, we'd like to allow fixups so long - // as they don't require a relocation. - if (!isa(OffsetExpr)) - return Error(OffsetLoc, "constant immediate expression expected"); - - // Create the Token operand for the '!'. - Operands.push_back(ARM64Operand::CreateToken( - "!", false, Parser.getTok().getLoc(), getContext())); - Parser.Lex(); // Eat the '!' token. 
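// [Editor's note] Illustrative sketch, not part of the patch. It restates the
// register-offset rules enforced by parseMemory above: the extend must be one
// of uxtw/lsl/sxtw/sxtx, a 32-bit (Wn) offset register is only legal with
// uxtw or sxtw, "lsl" requires an explicit immediate, and any immediate must
// lie in [0, 4]. Function and enum names here are hypothetical.
#include <cassert>
#include <string>

enum class Ext { UXTW, LSL, SXTW, SXTX, Invalid };

static Ext parseExt(const std::string &S) {
  if (S == "uxtw") return Ext::UXTW;
  if (S == "lsl")  return Ext::LSL;   // alias for UXTX in the parser above
  if (S == "sxtw") return Ext::SXTW;
  if (S == "sxtx") return Ext::SXTX;
  return Ext::Invalid;
}

static bool isValidRegOffset(bool OffsetRegIs32Bit, Ext E, bool HasImm,
                             long Imm) {
  if (E == Ext::Invalid) return false;
  if (OffsetRegIs32Bit && E != Ext::UXTW && E != Ext::SXTW)
    return false;                           // "requires sxtw or uxtw extend"
  if (!HasImm && E == Ext::LSL) return false; // "LSL extend requires immediate"
  if (HasImm && (Imm < 0 || Imm > 4)) return false; // "out of range"
  return true;
}

int main() {
  assert(isValidRegOffset(true,  parseExt("sxtw"), true, 2));   // [x0, w1, sxtw #2]
  assert(!isValidRegOffset(true, parseExt("lsl"),  true, 3));   // W offset needs (s|u)xtw
  assert(!isValidRegOffset(false, parseExt("lsl"), false, 0));  // lsl needs #imm
  return 0;
}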
- } - - return false; -} - -bool ARM64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { - bool HasELFModifier = false; - ARM64MCExpr::VariantKind RefKind; - - if (Parser.getTok().is(AsmToken::Colon)) { - Parser.Lex(); // Eat ':" - HasELFModifier = true; - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - std::string LowerCase = Parser.getTok().getIdentifier().lower(); - RefKind = StringSwitch(LowerCase) - .Case("lo12", ARM64MCExpr::VK_LO12) - .Case("abs_g3", ARM64MCExpr::VK_ABS_G3) - .Case("abs_g2", ARM64MCExpr::VK_ABS_G2) - .Case("abs_g2_s", ARM64MCExpr::VK_ABS_G2_S) - .Case("abs_g2_nc", ARM64MCExpr::VK_ABS_G2_NC) - .Case("abs_g1", ARM64MCExpr::VK_ABS_G1) - .Case("abs_g1_s", ARM64MCExpr::VK_ABS_G1_S) - .Case("abs_g1_nc", ARM64MCExpr::VK_ABS_G1_NC) - .Case("abs_g0", ARM64MCExpr::VK_ABS_G0) - .Case("abs_g0_s", ARM64MCExpr::VK_ABS_G0_S) - .Case("abs_g0_nc", ARM64MCExpr::VK_ABS_G0_NC) - .Case("dtprel_g2", ARM64MCExpr::VK_DTPREL_G2) - .Case("dtprel_g1", ARM64MCExpr::VK_DTPREL_G1) - .Case("dtprel_g1_nc", ARM64MCExpr::VK_DTPREL_G1_NC) - .Case("dtprel_g0", ARM64MCExpr::VK_DTPREL_G0) - .Case("dtprel_g0_nc", ARM64MCExpr::VK_DTPREL_G0_NC) - .Case("dtprel_hi12", ARM64MCExpr::VK_DTPREL_HI12) - .Case("dtprel_lo12", ARM64MCExpr::VK_DTPREL_LO12) - .Case("dtprel_lo12_nc", ARM64MCExpr::VK_DTPREL_LO12_NC) - .Case("tprel_g2", ARM64MCExpr::VK_TPREL_G2) - .Case("tprel_g1", ARM64MCExpr::VK_TPREL_G1) - .Case("tprel_g1_nc", ARM64MCExpr::VK_TPREL_G1_NC) - .Case("tprel_g0", ARM64MCExpr::VK_TPREL_G0) - .Case("tprel_g0_nc", ARM64MCExpr::VK_TPREL_G0_NC) - .Case("tprel_hi12", ARM64MCExpr::VK_TPREL_HI12) - .Case("tprel_lo12", ARM64MCExpr::VK_TPREL_LO12) - .Case("tprel_lo12_nc", ARM64MCExpr::VK_TPREL_LO12_NC) - .Case("tlsdesc_lo12", ARM64MCExpr::VK_TLSDESC_LO12) - .Case("got", ARM64MCExpr::VK_GOT_PAGE) - .Case("got_lo12", ARM64MCExpr::VK_GOT_LO12) - .Case("gottprel", ARM64MCExpr::VK_GOTTPREL_PAGE) - .Case("gottprel_lo12", ARM64MCExpr::VK_GOTTPREL_LO12_NC) - .Case("gottprel_g1", ARM64MCExpr::VK_GOTTPREL_G1) - .Case("gottprel_g0_nc", ARM64MCExpr::VK_GOTTPREL_G0_NC) - .Case("tlsdesc", ARM64MCExpr::VK_TLSDESC_PAGE) - .Default(ARM64MCExpr::VK_INVALID); - - if (RefKind == ARM64MCExpr::VK_INVALID) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - Parser.Lex(); // Eat identifier - - if (Parser.getTok().isNot(AsmToken::Colon)) { - Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); - return true; - } - Parser.Lex(); // Eat ':' - } - - if (getParser().parseExpression(ImmVal)) - return true; - - if (HasELFModifier) - ImmVal = ARM64MCExpr::Create(ImmVal, RefKind, getContext()); - - return false; -} - -/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. -bool ARM64AsmParser::parseVectorList(OperandVector &Operands) { - assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - StringRef Kind; - int64_t FirstReg = tryMatchVectorRegister(Kind, true); - if (FirstReg == -1) - return true; - int64_t PrevReg = FirstReg; - unsigned Count = 1; - - if (Parser.getTok().is(AsmToken::Minus)) { - Parser.Lex(); // Eat the minus. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind, true); - if (Reg == -1) - return true; - // Any Kind suffices must match on all regs in the list. 
- if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); - - if (Space == 0 || Space > 3) { - return Error(Loc, "invalid number of vectors"); - } - - Count += Space; - } - else { - while (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma token. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind, true); - if (Reg == -1) - return true; - // Any Kind suffices must match on all regs in the list. - if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - // Registers must be incremental (with wraparound at 31) - if (getContext().getRegisterInfo()->getEncodingValue(Reg) != - (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) - return Error(Loc, "registers must be sequential"); - - PrevReg = Reg; - ++Count; - } - } - - if (Parser.getTok().isNot(AsmToken::RCurly)) - return Error(getLoc(), "'}' expected"); - Parser.Lex(); // Eat the '}' token. - - if (Count > 4) - return Error(S, "invalid number of vectors"); - - unsigned NumElements = 0; - char ElementKind = 0; - if (!Kind.empty()) - parseValidVectorKind(Kind, NumElements, ElementKind); - - Operands.push_back(ARM64Operand::CreateVectorList( - FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); - - // If there is an index specifier following the list, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - return false; -} - -/// parseOperand - Parse a arm instruction operand. For now this parses the -/// operand regardless of the mnemonic. -bool ARM64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode) { - // Check if the current operand has a custom associated parser, if so, try to - // custom parse the operand, or fallback to the general approach. - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); - if (ResTy == MatchOperand_Success) - return false; - // If there wasn't a custom match, try the generic matcher below. Otherwise, - // there was a match, but an error occurred, in which case, just return that - // the operand parsing failed. - if (ResTy == MatchOperand_ParseFail) - return true; - - // Nothing custom, so do general case parsing. - SMLoc S, E; - switch (getLexer().getKind()) { - default: { - SMLoc S = getLoc(); - const MCExpr *Expr; - if (parseSymbolicImmVal(Expr)) - return Error(S, "invalid operand"); - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - return false; - } - case AsmToken::LBrac: - return parseMemory(Operands); - case AsmToken::LCurly: - return parseVectorList(Operands); - case AsmToken::Identifier: { - // If we're expecting a Condition Code operand, then just parse that. - if (isCondCode) - return parseCondCode(Operands, invertCondCode); - - // If it's a register name, parse it. 
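// [Editor's note] Illustrative sketch, not part of the patch. parseVectorList
// above accepts either a range ("{v30.4s-v1.4s}") or an explicit list
// ("{v0.8b, v1.8b}"); register numbers may wrap around at 31, the element
// suffix must match on every register, and at most four registers are allowed.
// This helper works on plain 0-31 vector register indices, a simplification
// made for the demo (the parser itself works with MC register encodings).
#include <cassert>

// Number of registers covered by the range Vfirst..Vlast, with wraparound.
static int rangeCount(unsigned First, unsigned Last) {
  unsigned Space = (First < Last) ? (Last - First) : (Last + 32 - First);
  if (Space == 0 || Space > 3)
    return -1;               // "invalid number of vectors"
  return int(Space) + 1;     // the first register plus the span
}

int main() {
  assert(rangeCount(0, 3)  == 4);  // {v0.4s-v3.4s}  -> 4 registers
  assert(rangeCount(30, 1) == 4);  // {v30.4s-v1.4s} -> v30, v31, v0, v1
  assert(rangeCount(5, 5)  == -1); // degenerate span is rejected
  assert(rangeCount(0, 4)  == -1); // more than 4 registers is rejected
  return 0;
}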
- if (!parseRegister(Operands)) - return false; - - // This could be an optional "shift" or "extend" operand. - OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); - // We can only continue if no tokens were eaten. - if (GotShift != MatchOperand_NoMatch) - return GotShift; - - // This was not a register so parse other operands that start with an - // identifier (like labels) as expressions and create them as immediates. - const MCExpr *IdVal; - S = getLoc(); - if (getParser().parseExpression(IdVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(IdVal, S, E, getContext())); - return false; - } - case AsmToken::Integer: - case AsmToken::Real: - case AsmToken::Hash: { - // #42 -> immediate. - S = getLoc(); - if (getLexer().is(AsmToken::Hash)) - Parser.Lex(); - - // Parse a negative sign - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - // We need to consume this token only when we have a Real, otherwise - // we let parseSymbolicImmVal take care of it - if (Parser.getLexer().peekTok().is(AsmToken::Real)) - Parser.Lex(); - } - - // The only Real that should come through here is a literal #0.0 for - // the fcmp[e] r, #0.0 instructions. They expect raw token operands, - // so convert the value. - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && - Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && - Mnemonic != "fcmlt") - return TokError("unexpected floating point literal"); - else if (IntVal != 0 || isNegative) - return TokError("expected floating-point constant #0.0"); - Parser.Lex(); // Eat the token. - - Operands.push_back( - ARM64Operand::CreateToken("#0", false, S, getContext())); - Operands.push_back( - ARM64Operand::CreateToken(".0", false, S, getContext())); - return false; - } - - const MCExpr *ImmVal; - if (parseSymbolicImmVal(ImmVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(ImmVal, S, E, getContext())); - return false; - } - } -} - -/// ParseInstruction - Parse an ARM64 instruction mnemonic followed by its -/// operands. -bool ARM64AsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - Name = StringSwitch(Name.lower()) - .Case("beq", "b.eq") - .Case("bne", "b.ne") - .Case("bhs", "b.hs") - .Case("bcs", "b.cs") - .Case("blo", "b.lo") - .Case("bcc", "b.cc") - .Case("bmi", "b.mi") - .Case("bpl", "b.pl") - .Case("bvs", "b.vs") - .Case("bvc", "b.vc") - .Case("bhi", "b.hi") - .Case("bls", "b.ls") - .Case("bge", "b.ge") - .Case("blt", "b.lt") - .Case("bgt", "b.gt") - .Case("ble", "b.le") - .Case("bal", "b.al") - .Case("bnv", "b.nv") - .Default(Name); - - // Create the leading tokens for the mnemonic, split by '.' characters. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - - // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. 
- if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { - bool IsError = parseSysAlias(Head, NameLoc, Operands); - if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) - Parser.eatToEndOfStatement(); - return IsError; - } - - Operands.push_back( - ARM64Operand::CreateToken(Head, false, NameLoc, getContext())); - Mnemonic = Head; - - // Handle condition codes for a branch mnemonic - if (Head == "b" && Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start + 1, Next); - - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data())); - ARM64CC::CondCode CC = parseCondCodeString(Head); - if (CC == ARM64CC::Invalid) - return Error(SuffixLoc, "invalid condition code"); - Operands.push_back( - ARM64Operand::CreateToken(".", true, SuffixLoc, getContext())); - Operands.push_back( - ARM64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); - } - - // Add the remaining tokens in the mnemonic. - while (Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start, Next); - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data()) + 1); - Operands.push_back( - ARM64Operand::CreateToken(Head, true, SuffixLoc, getContext())); - } - - // Conditional compare instructions have a Condition Code operand, which needs - // to be parsed and an immediate operand created. - bool condCodeFourthOperand = - (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || - Head == "fccmpe" || Head == "fcsel" || Head == "csel" || - Head == "csinc" || Head == "csinv" || Head == "csneg"); - - // These instructions are aliases to some of the conditional select - // instructions. However, the condition code is inverted in the aliased - // instruction. - // - // FIXME: Is this the correct way to handle these? Or should the parser - // generate the aliased instructions directly? - bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); - bool condCodeThirdOperand = - (Head == "cinc" || Head == "cinv" || Head == "cneg"); - - // Read the remaining operands. - if (getLexer().isNot(AsmToken::EndOfStatement)) { - // Read the first operand. - if (parseOperand(Operands, false, false)) { - Parser.eatToEndOfStatement(); - return true; - } - - unsigned N = 2; - while (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - - // Parse and remember the operand. - if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || - (N == 3 && condCodeThirdOperand) || - (N == 2 && condCodeSecondOperand), - condCodeSecondOperand || condCodeThirdOperand)) { - Parser.eatToEndOfStatement(); - return true; - } - - ++N; - } - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - SMLoc Loc = Parser.getTok().getLoc(); - Parser.eatToEndOfStatement(); - return Error(Loc, "unexpected token in argument list"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -// FIXME: This entire function is a giant hack to provide us with decent -// operand range validation/diagnostics until TableGen/MC can be extended -// to support autogeneration of this kind of validation. -bool ARM64AsmParser::validateInstruction(MCInst &Inst, - SmallVectorImpl &Loc) { - const MCRegisterInfo *RI = getContext().getRegisterInfo(); - // Check for indexed addressing modes w/ the base register being the - // same as a destination/source register or pair load where - // the Rt == Rt2. All of those are undefined behaviour. 
- switch (Inst.getOpcode()) { - case ARM64::LDPSWpre: - case ARM64::LDPWpost: - case ARM64::LDPWpre: - case ARM64::LDPXpost: - case ARM64::LDPXpre: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDP instruction, writeback base " - "is also a destination"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable LDP instruction, writeback base " - "is also a destination"); - // FALLTHROUGH - } - case ARM64::LDPDpost: - case ARM64::LDPDpre: - case ARM64::LDPQpost: - case ARM64::LDPQpre: - case ARM64::LDPSpost: - case ARM64::LDPSpre: - case ARM64::LDPSWpost: - case ARM64::LDPDi: - case ARM64::LDPQi: - case ARM64::LDPSi: - case ARM64::LDPSWi: - case ARM64::LDPWi: - case ARM64::LDPXi: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - if (Rt == Rt2) - return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); - break; - } - case ARM64::STPDpost: - case ARM64::STPDpre: - case ARM64::STPQpost: - case ARM64::STPQpre: - case ARM64::STPSpost: - case ARM64::STPSpre: - case ARM64::STPWpost: - case ARM64::STPWpre: - case ARM64::STPXpost: - case ARM64::STPXpre: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STP instruction, writeback base " - "is also a source"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable STP instruction, writeback base " - "is also a source"); - break; - } - case ARM64::LDRBBpre: - case ARM64::LDRBpre: - case ARM64::LDRHHpre: - case ARM64::LDRHpre: - case ARM64::LDRSBWpre: - case ARM64::LDRSBXpre: - case ARM64::LDRSHWpre: - case ARM64::LDRSHXpre: - case ARM64::LDRSWpre: - case ARM64::LDRWpre: - case ARM64::LDRXpre: - case ARM64::LDRBBpost: - case ARM64::LDRBpost: - case ARM64::LDRHHpost: - case ARM64::LDRHpost: - case ARM64::LDRSBWpost: - case ARM64::LDRSBXpost: - case ARM64::LDRSHWpost: - case ARM64::LDRSHXpost: - case ARM64::LDRSWpost: - case ARM64::LDRWpost: - case ARM64::LDRXpost: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rn = Inst.getOperand(1).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDR instruction, writeback base " - "is also a source"); - break; - } - case ARM64::STRBBpost: - case ARM64::STRBpost: - case ARM64::STRHHpost: - case ARM64::STRHpost: - case ARM64::STRWpost: - case ARM64::STRXpost: - case ARM64::STRBBpre: - case ARM64::STRBpre: - case ARM64::STRHHpre: - case ARM64::STRHpre: - case ARM64::STRWpre: - case ARM64::STRXpre: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rn = Inst.getOperand(1).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STR instruction, writeback base " - "is also a source"); - break; - } - } - - // Now check immediate ranges. Separate from the above as there is overlap - // in the instructions being checked and this keeps the nested conditionals - // to a minimum. - switch (Inst.getOpcode()) { - case ARM64::ADDSWri: - case ARM64::ADDSXri: - case ARM64::ADDWri: - case ARM64::ADDXri: - case ARM64::SUBSWri: - case ARM64::SUBSXri: - case ARM64::SUBWri: - case ARM64::SUBXri: { - // Annoyingly we can't do this in the isAddSubImm predicate, so there is - // some slight duplication here. 
- if (Inst.getOperand(2).isExpr()) { - const MCExpr *Expr = Inst.getOperand(2).getExpr(); - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - return Error(Loc[2], "invalid immediate expression"); - } - - // Only allow these with ADDXri. - if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && - Inst.getOpcode() == ARM64::ADDXri) - return false; - - // Only allow these with ADDXri/ADDWri - if ((ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_HI12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_HI12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) && - (Inst.getOpcode() == ARM64::ADDXri || - Inst.getOpcode() == ARM64::ADDWri)) - return false; - - // Don't allow expressions in the immediate field otherwise - return Error(Loc[2], "invalid immediate expression"); - } - return false; - } - default: - return false; - } -} - -bool ARM64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { - switch (ErrCode) { - case Match_MissingFeature: - return Error(Loc, - "instruction requires a CPU feature not currently enabled"); - case Match_InvalidOperand: - return Error(Loc, "invalid operand for instruction"); - case Match_InvalidSuffix: - return Error(Loc, "invalid type suffix for instruction"); - case Match_InvalidCondCode: - return Error(Loc, "expected AArch64 condition code"); - case Match_AddSubRegExtendSmall: - return Error(Loc, - "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegExtendLarge: - return Error(Loc, - "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubSecondSource: - return Error(Loc, - "expected compatible register, symbol or integer in range [0, 4095]"); - case Match_LogicalSecondSource: - return Error(Loc, "expected compatible register or logical immediate"); - case Match_InvalidMovImm32Shift: - return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); - case Match_InvalidMovImm64Shift: - return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); - case Match_AddSubRegShift32: - return Error(Loc, - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); - case Match_AddSubRegShift64: - return Error(Loc, - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); - case Match_InvalidFPImm: - return Error(Loc, - "expected compatible register or floating-point constant"); - case Match_InvalidMemoryIndexedSImm9: - return Error(Loc, "index must be an integer in range [-256, 255]."); - case Match_InvalidMemoryIndexed32SImm7: - return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); - case Match_InvalidMemoryIndexed64SImm7: - return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); - case Match_InvalidMemoryIndexed128SImm7: - return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); - case Match_InvalidMemoryIndexed: - return Error(Loc, "invalid offset in memory address."); - case Match_InvalidMemoryIndexed8: - return Error(Loc, "index must be an integer in range [0, 4095]."); - case Match_InvalidMemoryIndexed16: - return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); - case 
Match_InvalidMemoryIndexed32: - return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); - case Match_InvalidMemoryIndexed64: - return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); - case Match_InvalidMemoryIndexed128: - return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); - case Match_InvalidImm0_7: - return Error(Loc, "immediate must be an integer in range [0, 7]."); - case Match_InvalidImm0_15: - return Error(Loc, "immediate must be an integer in range [0, 15]."); - case Match_InvalidImm0_31: - return Error(Loc, "immediate must be an integer in range [0, 31]."); - case Match_InvalidImm0_63: - return Error(Loc, "immediate must be an integer in range [0, 63]."); - case Match_InvalidImm0_127: - return Error(Loc, "immediate must be an integer in range [0, 127]."); - case Match_InvalidImm0_65535: - return Error(Loc, "immediate must be an integer in range [0, 65535]."); - case Match_InvalidImm1_8: - return Error(Loc, "immediate must be an integer in range [1, 8]."); - case Match_InvalidImm1_16: - return Error(Loc, "immediate must be an integer in range [1, 16]."); - case Match_InvalidImm1_32: - return Error(Loc, "immediate must be an integer in range [1, 32]."); - case Match_InvalidImm1_64: - return Error(Loc, "immediate must be an integer in range [1, 64]."); - case Match_InvalidIndex1: - return Error(Loc, "expected lane specifier '[1]'"); - case Match_InvalidIndexB: - return Error(Loc, "vector lane must be an integer in range [0, 15]."); - case Match_InvalidIndexH: - return Error(Loc, "vector lane must be an integer in range [0, 7]."); - case Match_InvalidIndexS: - return Error(Loc, "vector lane must be an integer in range [0, 3]."); - case Match_InvalidIndexD: - return Error(Loc, "vector lane must be an integer in range [0, 1]."); - case Match_InvalidLabel: - return Error(Loc, "expected label or encodable integer pc offset"); - case Match_MRS: - return Error(Loc, "expected readable system register"); - case Match_MSR: - return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); - default: - assert(0 && "unexpected error code!"); - return Error(Loc, "invalid instruction format"); - } -} - -static const char *getSubtargetFeatureName(unsigned Val); - -bool ARM64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - ARM64Operand *Op = static_cast(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); - - StringRef Tok = Op->getToken(); - unsigned NumOperands = Operands.size(); - - if (NumOperands == 4 && Tok == "lsl") { - ARM64Operand *Op2 = static_cast(Operands[2]); - ARM64Operand *Op3 = static_cast(Operands[3]); - if (Op2->isReg() && Op3->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - if (Op3CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t NewOp3Val = 0; - uint64_t NewOp4Val = 0; - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains( - Op2->getReg())) { - NewOp3Val = (32 - Op3Val) & 0x1f; - NewOp4Val = 31 - Op3Val; - } else { - NewOp3Val = (64 - Op3Val) & 0x3f; - NewOp4Val = 63 - Op3Val; - } - - const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); - - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, 
Op->getStartLoc(), getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands.push_back(ARM64Operand::CreateImm( - NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext())); - delete Op3; - delete Op; - } - } - } else if (NumOperands == 5) { - // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and - // UBFIZ -> UBFM aliases. - if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t RegWidth = 0; - if (ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op1->getReg())) - RegWidth = 64; - else - RegWidth = 32; - - if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), - "expected integer in range [0, 31]"); - if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), - "expected integer in range [1, 32]"); - - uint64_t NewOp3Val = 0; - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains( - Op1->getReg())) - NewOp3Val = (32 - Op3Val) & 0x1f; - else - NewOp3Val = (64 - Op3Val) & 0x3f; - - uint64_t NewOp4Val = Op4Val - 1; - - if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) - return Error(Op4->getStartLoc(), - "requested insert overflows register"); - - const MCExpr *NewOp3 = - MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands[4] = ARM64Operand::CreateImm(NewOp4, Op4->getStartLoc(), - Op4->getEndLoc(), getContext()); - if (Tok == "bfi") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfiz") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfiz") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op3; - delete Op4; - } - } - - // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and - // UBFX -> UBFM aliases. 
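// [Editor's note] Illustrative sketch, not part of the patch. The alias
// rewrites above turn BFI/SBFIZ/UBFIZ (and, just below, BFXIL/SBFX/UBFX) into
// the underlying *BFM instructions by recomputing the two immediates. For a
// 64-bit register:
//   bfi  xd, xn, #lsb, #width  ->  bfm  xd, xn, #((64-lsb)&63), #(width-1)
//   ubfx xd, xn, #lsb, #width  ->  ubfm xd, xn, #lsb, #(lsb+width-1)
// The helpers below repeat that arithmetic; the names are hypothetical.
#include <cassert>
#include <cstdint>

struct BFMImms { uint64_t immr, imms; };

static BFMImms bfiToBFM(uint64_t Lsb, uint64_t Width, unsigned RegWidth) {
  return {(RegWidth - Lsb) & (RegWidth - 1), Width - 1};
}

static BFMImms ubfxToBFM(uint64_t Lsb, uint64_t Width) {
  return {Lsb, Lsb + Width - 1};
}

int main() {
  BFMImms A = bfiToBFM(8, 16, 64);      // bfi x0, x1, #8, #16
  assert(A.immr == 56 && A.imms == 15); // -> bfm x0, x1, #56, #15
  BFMImms B = ubfxToBFM(8, 16);         // ubfx x0, x1, #8, #16
  assert(B.immr == 8 && B.imms == 23);  // -> ubfm x0, x1, #8, #23
  return 0;
}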
- } else if (NumOperands == 5 && - (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t RegWidth = 0; - if (ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op1->getReg())) - RegWidth = 64; - else - RegWidth = 32; - - if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), - "expected integer in range [0, 31]"); - if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), - "expected integer in range [1, 32]"); - - uint64_t NewOp4Val = Op3Val + Op4Val - 1; - - if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) - return Error(Op4->getStartLoc(), - "requested extract overflows register"); - - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[4] = ARM64Operand::CreateImm( - NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); - if (Tok == "bfxil") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfx") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfx") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op4; - } - } - } - } - // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. - // InstAlias can't quite handle this since the reg classes aren't - // subclasses. - if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - // FIXME: Likewise for sxt[bh] with a Xd dst operand - else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { - ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg() && - ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - // FIXME: Likewise for uxt[bh] with a Xd dst operand - else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { - ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg() && - ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR32. Twiddle it here if necessary. 
- ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg()) { - unsigned Reg = getWRegFromXReg(Op->getReg()); - Operands[1] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - - // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. - if (NumOperands == 3 && Tok == "fmov") { - ARM64Operand *RegOp = static_cast(Operands[1]); - ARM64Operand *ImmOp = static_cast(Operands[2]); - if (RegOp->isReg() && ImmOp->isFPImm() && - ImmOp->getFPImm() == (unsigned)-1) { - unsigned zreg = ARM64MCRegisterClasses[ARM64::FPR32RegClassID].contains( - RegOp->getReg()) - ? ARM64::WZR - : ARM64::XZR; - Operands[2] = ARM64Operand::CreateReg(zreg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete ImmOp; - } - } - - MCInst Inst; - // First try to match against the secondary set of tables containing the - // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). - unsigned MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); - - // If that fails, try against the alternate table containing long-form NEON: - // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) - MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); - - switch (MatchResult) { - case Match_Success: { - // Perform range checking and other semantic validations - SmallVector OperandLocs; - NumOperands = Operands.size(); - for (unsigned i = 1; i < NumOperands; ++i) - OperandLocs.push_back(Operands[i]->getStartLoc()); - if (validateInstruction(Inst, OperandLocs)) - return true; - - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - } - case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); - // Special case the error message for the very common case where only - // a single subtarget feature is missing (neon, e.g.). - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg); - } - case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { - if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); - - ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - } - // If the match failed on a suffix token operand, tweak the diagnostic - // accordingly. - if (((ARM64Operand *)Operands[ErrorInfo])->isToken() && - ((ARM64Operand *)Operands[ErrorInfo])->isTokenSuffix()) - MatchResult = Match_InvalidSuffix; - - return showMatchError(ErrorLoc, MatchResult); - } - case Match_InvalidMemoryIndexedSImm9: { - // If there is not a '!' after the memory operand that failed, we really - // want the diagnostic for the non-pre-indexed instruction variant instead. - // Be careful to check for the post-indexed variant as well, which also - // uses this match diagnostic. Also exclude the explicitly unscaled - // mnemonics, as they want the unscaled diagnostic as well. - if (Operands.size() == ErrorInfo + 1 && - !((ARM64Operand *)Operands[ErrorInfo])->isImm() && - !Tok.startswith("stur") && !Tok.startswith("ldur")) { - // FIXME: Here we use a vague diagnostic for memory operand in many - // instructions of various formats. 
This diagnostic can be more accurate - // if splitting memory operand into many smaller operands to help - // diagnose. - MatchResult = Match_InvalidMemoryIndexed; - } - else if(Operands.size() == 3 && Operands.size() == ErrorInfo + 1 && - ((ARM64Operand *)Operands[ErrorInfo])->isImm()) { - MatchResult = Match_InvalidLabel; - } - SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); - } - case Match_InvalidMemoryIndexed32: - case Match_InvalidMemoryIndexed64: - case Match_InvalidMemoryIndexed128: - // If there is a '!' after the memory operand that failed, we really - // want the diagnostic for the pre-indexed instruction variant instead. - if (Operands.size() > ErrorInfo + 1 && - ((ARM64Operand *)Operands[ErrorInfo + 1])->isTokenEqual("!")) - MatchResult = Match_InvalidMemoryIndexedSImm9; - // FALL THROUGH - case Match_InvalidCondCode: - case Match_AddSubRegExtendSmall: - case Match_AddSubRegExtendLarge: - case Match_AddSubSecondSource: - case Match_LogicalSecondSource: - case Match_AddSubRegShift32: - case Match_AddSubRegShift64: - case Match_InvalidMovImm32Shift: - case Match_InvalidMovImm64Shift: - case Match_InvalidFPImm: - case Match_InvalidMemoryIndexed: - case Match_InvalidMemoryIndexed8: - case Match_InvalidMemoryIndexed16: - case Match_InvalidMemoryIndexed32SImm7: - case Match_InvalidMemoryIndexed64SImm7: - case Match_InvalidMemoryIndexed128SImm7: - case Match_InvalidImm0_7: - case Match_InvalidImm0_15: - case Match_InvalidImm0_31: - case Match_InvalidImm0_63: - case Match_InvalidImm0_127: - case Match_InvalidImm0_65535: - case Match_InvalidImm1_8: - case Match_InvalidImm1_16: - case Match_InvalidImm1_32: - case Match_InvalidImm1_64: - case Match_InvalidIndex1: - case Match_InvalidIndexB: - case Match_InvalidIndexH: - case Match_InvalidIndexS: - case Match_InvalidIndexD: - case Match_InvalidLabel: - case Match_MSR: - case Match_MRS: { - // Any time we get here, there's nothing fancy to do. Just get the - // operand SMLoc and display the diagnostic. - SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - // If it's a memory operand, the error is with the offset immediate, - // so get that location instead. - if (((ARM64Operand *)Operands[ErrorInfo])->isMem()) - ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getOffsetLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); - } - } - - llvm_unreachable("Implement any new match types added!"); - return true; -} - -/// ParseDirective parses the arm specific directives -bool ARM64AsmParser::ParseDirective(AsmToken DirectiveID) { - StringRef IDVal = DirectiveID.getIdentifier(); - SMLoc Loc = DirectiveID.getLoc(); - if (IDVal == ".hword") - return parseDirectiveWord(2, Loc); - if (IDVal == ".word") - return parseDirectiveWord(4, Loc); - if (IDVal == ".xword") - return parseDirectiveWord(8, Loc); - if (IDVal == ".tlsdesccall") - return parseDirectiveTLSDescCall(Loc); - - return parseDirectiveLOH(IDVal, Loc); -} - -/// parseDirectiveWord -/// ::= .word [ expression (, expression)* ] -bool ARM64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { - if (getLexer().isNot(AsmToken::EndOfStatement)) { - for (;;) { - const MCExpr *Value; - if (getParser().parseExpression(Value)) - return true; - - getParser().getStreamer().EmitValue(Value, Size); - - if (getLexer().is(AsmToken::EndOfStatement)) - break; - - // FIXME: Improve diagnostic. 
- if (getLexer().isNot(AsmToken::Comma)) - return Error(L, "unexpected token in directive"); - Parser.Lex(); - } - } - - Parser.Lex(); - return false; -} - -// parseDirectiveTLSDescCall: -// ::= .tlsdesccall symbol -bool ARM64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return Error(L, "expected symbol after directive"); - - MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_TLSDESC, getContext()); - - MCInst Inst; - Inst.setOpcode(ARM64::TLSDESCCALL); - Inst.addOperand(MCOperand::CreateExpr(Expr)); - - getParser().getStreamer().EmitInstruction(Inst, STI); - return false; -} - -/// ::= .loh label1, ..., labelN -/// The number of arguments depends on the loh identifier. -bool ARM64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { - if (IDVal != MCLOHDirectiveName()) - return true; - MCLOHType Kind; - if (getParser().getTok().isNot(AsmToken::Identifier)) { - if (getParser().getTok().isNot(AsmToken::Integer)) - return TokError("expected an identifier or a number in directive"); - // We successfully get a numeric value for the identifier. - // Check if it is valid. - int64_t Id = getParser().getTok().getIntVal(); - Kind = (MCLOHType)Id; - // Check that Id does not overflow MCLOHType. - if (!isValidMCLOHType(Kind) || Id != Kind) - return TokError("invalid numeric identifier in directive"); - } else { - StringRef Name = getTok().getIdentifier(); - // We successfully parse an identifier. - // Check if it is a recognized one. - int Id = MCLOHNameToId(Name); - - if (Id == -1) - return TokError("invalid identifier in directive"); - Kind = (MCLOHType)Id; - } - // Consume the identifier. - Lex(); - // Get the number of arguments of this LOH. - int NbArgs = MCLOHIdToNbArgs(Kind); - - assert(NbArgs != -1 && "Invalid number of arguments"); - - SmallVector Args; - for (int Idx = 0; Idx < NbArgs; ++Idx) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return TokError("expected identifier in directive"); - Args.push_back(getContext().GetOrCreateSymbol(Name)); - - if (Idx + 1 == NbArgs) - break; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - Lex(); - } - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - - getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); - return false; -} - -bool -ARM64AsmParser::classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - int64_t &Addend) { - ELFRefKind = ARM64MCExpr::VK_INVALID; - DarwinRefKind = MCSymbolRefExpr::VK_None; - Addend = 0; - - if (const ARM64MCExpr *AE = dyn_cast(Expr)) { - ELFRefKind = AE->getKind(); - Expr = AE->getSubExpr(); - } - - const MCSymbolRefExpr *SE = dyn_cast(Expr); - if (SE) { - // It's a simple symbol reference with no addend. - DarwinRefKind = SE->getKind(); - return true; - } - - const MCBinaryExpr *BE = dyn_cast(Expr); - if (!BE) - return false; - - SE = dyn_cast(BE->getLHS()); - if (!SE) - return false; - DarwinRefKind = SE->getKind(); - - if (BE->getOpcode() != MCBinaryExpr::Add && - BE->getOpcode() != MCBinaryExpr::Sub) - return false; - - // See if the addend is is a constant, otherwise there's more going - // on here than we can deal with. 
- auto AddendExpr = dyn_cast(BE->getRHS()); - if (!AddendExpr) - return false; - - Addend = AddendExpr->getValue(); - if (BE->getOpcode() == MCBinaryExpr::Sub) - Addend = -Addend; - - // It's some symbol reference + a constant addend, but really - // shouldn't use both Darwin and ELF syntax. - return ELFRefKind == ARM64MCExpr::VK_INVALID || - DarwinRefKind == MCSymbolRefExpr::VK_None; -} - -/// Force static initialization. -extern "C" void LLVMInitializeARM64AsmParser() { - RegisterMCAsmParser X(TheARM64leTarget); - RegisterMCAsmParser Y(TheARM64beTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_SUBTARGET_FEATURE_NAME -#define GET_MATCHER_IMPLEMENTATION -#include "ARM64GenAsmMatcher.inc" - -// Define this matcher function after the auto-generated include so we -// have the match class enum definitions. -unsigned ARM64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, - unsigned Kind) { - ARM64Operand *Op = static_cast(AsmOp); - // If the kind is a token for a literal immediate, check if our asm - // operand matches. This is for InstAliases which have a fixed-value - // immediate in the syntax. - int64_t ExpectedVal; - switch (Kind) { - default: - return Match_InvalidOperand; - case MCK__35_0: - ExpectedVal = 0; - break; - case MCK__35_1: - ExpectedVal = 1; - break; - case MCK__35_12: - ExpectedVal = 12; - break; - case MCK__35_16: - ExpectedVal = 16; - break; - case MCK__35_2: - ExpectedVal = 2; - break; - case MCK__35_24: - ExpectedVal = 24; - break; - case MCK__35_3: - ExpectedVal = 3; - break; - case MCK__35_32: - ExpectedVal = 32; - break; - case MCK__35_4: - ExpectedVal = 4; - break; - case MCK__35_48: - ExpectedVal = 48; - break; - case MCK__35_6: - ExpectedVal = 6; - break; - case MCK__35_64: - ExpectedVal = 64; - break; - case MCK__35_8: - ExpectedVal = 8; - break; - } - if (!Op->isImm()) - return Match_InvalidOperand; - const MCConstantExpr *CE = dyn_cast(Op->getImm()); - if (!CE) - return Match_InvalidOperand; - if (CE->getValue() == ExpectedVal) - return Match_Success; - return Match_InvalidOperand; -} diff --git a/lib/Target/ARM64/AsmParser/CMakeLists.txt b/lib/Target/ARM64/AsmParser/CMakeLists.txt deleted file mode 100644 index 826158b1ed17..000000000000 --- a/lib/Target/ARM64/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmParser - ARM64AsmParser.cpp - ) - diff --git a/lib/Target/ARM64/AsmParser/LLVMBuild.txt b/lib/Target/ARM64/AsmParser/LLVMBuild.txt deleted file mode 100644 index 9045283e9192..000000000000 --- a/lib/Target/ARM64/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmParser -parent = ARM64 -required_libraries = ARM64Desc ARM64Info ARM64Utils MC MCParser Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/AsmParser/Makefile b/lib/Target/ARM64/AsmParser/Makefile deleted file mode 100644 index d25c47f9af99..000000000000 --- a/lib/Target/ARM64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/CMakeLists.txt b/lib/Target/ARM64/CMakeLists.txt deleted file mode 100644 index 56ba3b732946..000000000000 --- a/lib/Target/ARM64/CMakeLists.txt +++ /dev/null @@ -1,51 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS ARM64.td) - -tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info) -tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info) -tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) -tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering) -tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel) -tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv) -tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler) -add_public_tablegen_target(ARM64CommonTableGen) - -add_llvm_target(ARM64CodeGen - ARM64AddressTypePromotion.cpp - ARM64AdvSIMDScalarPass.cpp - ARM64AsmPrinter.cpp - ARM64BranchRelaxation.cpp - ARM64CleanupLocalDynamicTLSPass.cpp - ARM64CollectLOH.cpp - ARM64ConditionalCompares.cpp - ARM64DeadRegisterDefinitionsPass.cpp - ARM64ExpandPseudoInsts.cpp - ARM64FastISel.cpp - ARM64FrameLowering.cpp - ARM64ISelDAGToDAG.cpp - ARM64ISelLowering.cpp - ARM64InstrInfo.cpp - ARM64LoadStoreOptimizer.cpp - ARM64MCInstLower.cpp - ARM64PromoteConstant.cpp - ARM64RegisterInfo.cpp - ARM64SelectionDAGInfo.cpp - ARM64StorePairSuppress.cpp - ARM64Subtarget.cpp - ARM64TargetMachine.cpp - ARM64TargetObjectFile.cpp - ARM64TargetTransformInfo.cpp -) - -add_dependencies(LLVMARM64CodeGen intrinsics_gen) - -add_subdirectory(TargetInfo) -add_subdirectory(AsmParser) -add_subdirectory(Disassembler) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) -add_subdirectory(Utils) diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp deleted file mode 100644 index 92eabcf2b4e0..000000000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp +++ /dev/null @@ -1,1524 +0,0 @@ -//===- ARM64Disassembler.cpp - Disassembler for ARM64 -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64Disassembler.h" -#include "ARM64ExternalSymbolizer.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-disassembler" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; - -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. -static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); 
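The decoder functions declared above (and defined below) all follow the same two-step pattern: extract a bit field from the 32-bit instruction word, then, for signed offsets and PC-relative labels, sign-extend it from its encoded width. A self-contained sketch of that pattern, with hypothetical helper names standing in for the generated fieldFromInstruction support, might look like this:

#include <cassert>
#include <cstdint>

// Sketch only: extract NumBits starting at StartBit (assumes NumBits < 32).
static uint32_t extractField(uint32_t Insn, unsigned StartBit, unsigned NumBits) {
  return (Insn >> StartBit) & ((1u << NumBits) - 1);
}

// Sign-extend an N-bit value, as e.g. DecodePCRelLabel19 does for its
// 19-bit immediate before scaling it to a byte offset.
static int64_t signExtend(uint64_t Value, unsigned NumBits) {
  if (Value & (1ull << (NumBits - 1)))
    Value |= ~((1ull << NumBits) - 1);
  return static_cast<int64_t>(Value);
}

int main() {
  assert(extractField(0xABCD1234, 8, 8) == 0x12);
  assert(signExtend(0x7FFFF, 19) == -1);      // all-ones 19-bit field is -1
  assert(signExtend(0x3FFFF, 19) == 0x3FFFF); // top bit clear: unchanged
  return 0;
}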
-static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); - -static bool Check(DecodeStatus &Out, DecodeStatus In) { - switch (In) { - case MCDisassembler::Success: - 
// Out stays the same. - return true; - case MCDisassembler::SoftFail: - Out = In; - return true; - case MCDisassembler::Fail: - Out = In; - return false; - } - llvm_unreachable("Invalid DecodeStatus!"); -} - -#include "ARM64GenDisassemblerTables.inc" -#include "ARM64GenInstrInfo.inc" - -#define Success llvm::MCDisassembler::Success -#define Fail llvm::MCDisassembler::Fail -#define SoftFail llvm::MCDisassembler::SoftFail - -static MCDisassembler *createARM64Disassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64Disassembler(STI, Ctx); -} - -DecodeStatus ARM64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - const MemoryObject &Region, - uint64_t Address, - raw_ostream &os, - raw_ostream &cs) const { - CommentStream = &cs; - - uint8_t bytes[4]; - - Size = 0; - // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) - return Fail; - Size = 4; - - // Encoded as a small-endian 32-bit word in the stream. - uint32_t insn = - (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); - - // Calling the auto-generated decoder function. - return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); -} - -static MCSymbolizer * -createARM64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, MCContext *Ctx, - MCRelocationInfo *RelInfo) { - return new llvm::ARM64ExternalSymbolizer( - *Ctx, - std::unique_ptr(RelInfo), - GetOpInfo, SymbolLookUp, DisInfo); -} - -extern "C" void LLVMInitializeARM64Disassembler() { - TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, - createARM64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, - createARM64ExternalSymbolizer); -} - -static const unsigned FPR128DecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR128DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 15) - return Fail; - return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); -} - -static const unsigned FPR64DecoderTable[] = { - ARM64::D0, ARM64::D1, ARM64::D2, ARM64::D3, ARM64::D4, ARM64::D5, - ARM64::D6, ARM64::D7, ARM64::D8, ARM64::D9, ARM64::D10, ARM64::D11, - ARM64::D12, ARM64::D13, ARM64::D14, ARM64::D15, ARM64::D16, ARM64::D17, - ARM64::D18, ARM64::D19, ARM64::D20, ARM64::D21, ARM64::D22, ARM64::D23, - ARM64::D24, ARM64::D25, ARM64::D26, ARM64::D27, ARM64::D28, ARM64::D29, - ARM64::D30, ARM64::D31 -}; - -static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR64DecoderTable[RegNo]; 
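As the getInstruction implementation above shows, every A64 instruction is a fixed four bytes stored little-endian in the stream, so decoding starts by assembling bytes[3..0] into a single 32-bit word before handing it to the generated decoder table. A trivial standalone sketch of that step (illustrative names, not part of the patch):

#include <cassert>
#include <cstdint>

// Sketch only: assemble a little-endian 32-bit instruction word, as the
// deleted getInstruction does before calling decodeInstruction.
static uint32_t assembleA64Word(const uint8_t Bytes[4]) {
  return (uint32_t(Bytes[3]) << 24) | (uint32_t(Bytes[2]) << 16) |
         (uint32_t(Bytes[1]) << 8) | uint32_t(Bytes[0]);
}

int main() {
  const uint8_t Nop[4] = {0x1F, 0x20, 0x03, 0xD5}; // A64 "nop" encodes as 0xD503201F
  assert(assembleA64Word(Nop) == 0xD503201F);
  return 0;
}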
- Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR32DecoderTable[] = { - ARM64::S0, ARM64::S1, ARM64::S2, ARM64::S3, ARM64::S4, ARM64::S5, - ARM64::S6, ARM64::S7, ARM64::S8, ARM64::S9, ARM64::S10, ARM64::S11, - ARM64::S12, ARM64::S13, ARM64::S14, ARM64::S15, ARM64::S16, ARM64::S17, - ARM64::S18, ARM64::S19, ARM64::S20, ARM64::S21, ARM64::S22, ARM64::S23, - ARM64::S24, ARM64::S25, ARM64::S26, ARM64::S27, ARM64::S28, ARM64::S29, - ARM64::S30, ARM64::S31 -}; - -static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR16DecoderTable[] = { - ARM64::H0, ARM64::H1, ARM64::H2, ARM64::H3, ARM64::H4, ARM64::H5, - ARM64::H6, ARM64::H7, ARM64::H8, ARM64::H9, ARM64::H10, ARM64::H11, - ARM64::H12, ARM64::H13, ARM64::H14, ARM64::H15, ARM64::H16, ARM64::H17, - ARM64::H18, ARM64::H19, ARM64::H20, ARM64::H21, ARM64::H22, ARM64::H23, - ARM64::H24, ARM64::H25, ARM64::H26, ARM64::H27, ARM64::H28, ARM64::H29, - ARM64::H30, ARM64::H31 -}; - -static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR16DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR8DecoderTable[] = { - ARM64::B0, ARM64::B1, ARM64::B2, ARM64::B3, ARM64::B4, ARM64::B5, - ARM64::B6, ARM64::B7, ARM64::B8, ARM64::B9, ARM64::B10, ARM64::B11, - ARM64::B12, ARM64::B13, ARM64::B14, ARM64::B15, ARM64::B16, ARM64::B17, - ARM64::B18, ARM64::B19, ARM64::B20, ARM64::B21, ARM64::B22, ARM64::B23, - ARM64::B24, ARM64::B25, ARM64::B26, ARM64::B27, ARM64::B28, ARM64::B29, - ARM64::B30, ARM64::B31 -}; - -static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR8DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR64DecoderTable[] = { - ARM64::X0, ARM64::X1, ARM64::X2, ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7, ARM64::X8, ARM64::X9, ARM64::X10, ARM64::X11, - ARM64::X12, ARM64::X13, ARM64::X14, ARM64::X15, ARM64::X16, ARM64::X17, - ARM64::X18, ARM64::X19, ARM64::X20, ARM64::X21, ARM64::X22, ARM64::X23, - ARM64::X24, ARM64::X25, ARM64::X26, ARM64::X27, ARM64::X28, ARM64::FP, - ARM64::LR, ARM64::XZR -}; - -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = GPR64DecoderTable[RegNo]; - if (Register == ARM64::XZR) - Register = ARM64::SP; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR32DecoderTable[] = { - ARM64::W0, ARM64::W1, ARM64::W2, ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7, ARM64::W8, ARM64::W9, ARM64::W10, ARM64::W11, - ARM64::W12, ARM64::W13, ARM64::W14, ARM64::W15, ARM64::W16, ARM64::W17, - ARM64::W18, ARM64::W19, ARM64::W20, 
ARM64::W21, ARM64::W22, ARM64::W23, - ARM64::W24, ARM64::W25, ARM64::W26, ARM64::W27, ARM64::W28, ARM64::W29, - ARM64::W30, ARM64::WZR -}; - -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - if (Register == ARM64::WZR) - Register = ARM64::WSP; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned VectorDecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = VectorDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQDecoderTable[] = { - ARM64::Q0_Q1, ARM64::Q1_Q2, ARM64::Q2_Q3, ARM64::Q3_Q4, - ARM64::Q4_Q5, ARM64::Q5_Q6, ARM64::Q6_Q7, ARM64::Q7_Q8, - ARM64::Q8_Q9, ARM64::Q9_Q10, ARM64::Q10_Q11, ARM64::Q11_Q12, - ARM64::Q12_Q13, ARM64::Q13_Q14, ARM64::Q14_Q15, ARM64::Q15_Q16, - ARM64::Q16_Q17, ARM64::Q17_Q18, ARM64::Q18_Q19, ARM64::Q19_Q20, - ARM64::Q20_Q21, ARM64::Q21_Q22, ARM64::Q22_Q23, ARM64::Q23_Q24, - ARM64::Q24_Q25, ARM64::Q25_Q26, ARM64::Q26_Q27, ARM64::Q27_Q28, - ARM64::Q28_Q29, ARM64::Q29_Q30, ARM64::Q30_Q31, ARM64::Q31_Q0 -}; - -static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQDecoderTable[] = { - ARM64::Q0_Q1_Q2, ARM64::Q1_Q2_Q3, ARM64::Q2_Q3_Q4, - ARM64::Q3_Q4_Q5, ARM64::Q4_Q5_Q6, ARM64::Q5_Q6_Q7, - ARM64::Q6_Q7_Q8, ARM64::Q7_Q8_Q9, ARM64::Q8_Q9_Q10, - ARM64::Q9_Q10_Q11, ARM64::Q10_Q11_Q12, ARM64::Q11_Q12_Q13, - ARM64::Q12_Q13_Q14, ARM64::Q13_Q14_Q15, ARM64::Q14_Q15_Q16, - ARM64::Q15_Q16_Q17, ARM64::Q16_Q17_Q18, ARM64::Q17_Q18_Q19, - ARM64::Q18_Q19_Q20, ARM64::Q19_Q20_Q21, ARM64::Q20_Q21_Q22, - ARM64::Q21_Q22_Q23, ARM64::Q22_Q23_Q24, ARM64::Q23_Q24_Q25, - ARM64::Q24_Q25_Q26, ARM64::Q25_Q26_Q27, ARM64::Q26_Q27_Q28, - ARM64::Q27_Q28_Q29, ARM64::Q28_Q29_Q30, ARM64::Q29_Q30_Q31, - ARM64::Q30_Q31_Q0, ARM64::Q31_Q0_Q1 -}; - -static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQQDecoderTable[] = { - ARM64::Q0_Q1_Q2_Q3, ARM64::Q1_Q2_Q3_Q4, ARM64::Q2_Q3_Q4_Q5, - ARM64::Q3_Q4_Q5_Q6, ARM64::Q4_Q5_Q6_Q7, ARM64::Q5_Q6_Q7_Q8, - ARM64::Q6_Q7_Q8_Q9, ARM64::Q7_Q8_Q9_Q10, ARM64::Q8_Q9_Q10_Q11, - ARM64::Q9_Q10_Q11_Q12, ARM64::Q10_Q11_Q12_Q13, ARM64::Q11_Q12_Q13_Q14, - 
ARM64::Q12_Q13_Q14_Q15, ARM64::Q13_Q14_Q15_Q16, ARM64::Q14_Q15_Q16_Q17, - ARM64::Q15_Q16_Q17_Q18, ARM64::Q16_Q17_Q18_Q19, ARM64::Q17_Q18_Q19_Q20, - ARM64::Q18_Q19_Q20_Q21, ARM64::Q19_Q20_Q21_Q22, ARM64::Q20_Q21_Q22_Q23, - ARM64::Q21_Q22_Q23_Q24, ARM64::Q22_Q23_Q24_Q25, ARM64::Q23_Q24_Q25_Q26, - ARM64::Q24_Q25_Q26_Q27, ARM64::Q25_Q26_Q27_Q28, ARM64::Q26_Q27_Q28_Q29, - ARM64::Q27_Q28_Q29_Q30, ARM64::Q28_Q29_Q30_Q31, ARM64::Q29_Q30_Q31_Q0, - ARM64::Q30_Q31_Q0_Q1, ARM64::Q31_Q0_Q1_Q2 -}; - -static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDecoderTable[] = { - ARM64::D0_D1, ARM64::D1_D2, ARM64::D2_D3, ARM64::D3_D4, - ARM64::D4_D5, ARM64::D5_D6, ARM64::D6_D7, ARM64::D7_D8, - ARM64::D8_D9, ARM64::D9_D10, ARM64::D10_D11, ARM64::D11_D12, - ARM64::D12_D13, ARM64::D13_D14, ARM64::D14_D15, ARM64::D15_D16, - ARM64::D16_D17, ARM64::D17_D18, ARM64::D18_D19, ARM64::D19_D20, - ARM64::D20_D21, ARM64::D21_D22, ARM64::D22_D23, ARM64::D23_D24, - ARM64::D24_D25, ARM64::D25_D26, ARM64::D26_D27, ARM64::D27_D28, - ARM64::D28_D29, ARM64::D29_D30, ARM64::D30_D31, ARM64::D31_D0 -}; - -static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDecoderTable[] = { - ARM64::D0_D1_D2, ARM64::D1_D2_D3, ARM64::D2_D3_D4, - ARM64::D3_D4_D5, ARM64::D4_D5_D6, ARM64::D5_D6_D7, - ARM64::D6_D7_D8, ARM64::D7_D8_D9, ARM64::D8_D9_D10, - ARM64::D9_D10_D11, ARM64::D10_D11_D12, ARM64::D11_D12_D13, - ARM64::D12_D13_D14, ARM64::D13_D14_D15, ARM64::D14_D15_D16, - ARM64::D15_D16_D17, ARM64::D16_D17_D18, ARM64::D17_D18_D19, - ARM64::D18_D19_D20, ARM64::D19_D20_D21, ARM64::D20_D21_D22, - ARM64::D21_D22_D23, ARM64::D22_D23_D24, ARM64::D23_D24_D25, - ARM64::D24_D25_D26, ARM64::D25_D26_D27, ARM64::D26_D27_D28, - ARM64::D27_D28_D29, ARM64::D28_D29_D30, ARM64::D29_D30_D31, - ARM64::D30_D31_D0, ARM64::D31_D0_D1 -}; - -static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDDecoderTable[] = { - ARM64::D0_D1_D2_D3, ARM64::D1_D2_D3_D4, ARM64::D2_D3_D4_D5, - ARM64::D3_D4_D5_D6, ARM64::D4_D5_D6_D7, ARM64::D5_D6_D7_D8, - ARM64::D6_D7_D8_D9, ARM64::D7_D8_D9_D10, ARM64::D8_D9_D10_D11, - ARM64::D9_D10_D11_D12, ARM64::D10_D11_D12_D13, ARM64::D11_D12_D13_D14, - ARM64::D12_D13_D14_D15, ARM64::D13_D14_D15_D16, ARM64::D14_D15_D16_D17, - ARM64::D15_D16_D17_D18, ARM64::D16_D17_D18_D19, ARM64::D17_D18_D19_D20, - ARM64::D18_D19_D20_D21, ARM64::D19_D20_D21_D22, ARM64::D20_D21_D22_D23, - ARM64::D21_D22_D23_D24, ARM64::D22_D23_D24_D25, ARM64::D23_D24_D25_D26, - ARM64::D24_D25_D26_D27, ARM64::D25_D26_D27_D28, ARM64::D26_D27_D28_D29, - ARM64::D27_D28_D29_D30, ARM64::D28_D29_D30_D31, ARM64::D29_D30_D31_D0, - ARM64::D30_D31_D0_D1, ARM64::D31_D0_D1_D2 -}; - -static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDDecoderTable[RegNo]; - 
Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - // scale{5} is asserted as 1 in tblgen. - Imm |= 0x20; - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - int64_t ImmVal = Imm; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 19-bit immediate. - if (ImmVal & (1 << (19 - 1))) - ImmVal |= ~((1LL << 19) - 1); - - if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, - Inst.getOpcode() != ARM64::LDRXl, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - return Success; -} - -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - const ARM64Disassembler *Dis = - static_cast(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; - Inst.addOperand(MCOperand::CreateImm(Imm)); - - bool ValidNamed; - (void)ARM64SysReg::MRSMapper(STI.getFeatureBits()).toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - const ARM64Disassembler *Dis = - static_cast(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; - Inst.addOperand(MCOperand::CreateImm(Imm)); - - bool ValidNamed; - (void)ARM64SysReg::MSRMapper(STI.getFeatureBits()).toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - // This decoder exists to add the dummy Lane operand to the MCInst, which must - // be 1 in assembly but has no other real manifestation. 
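The DecodeVecShift*Imm helpers defined just below reduce to two formulas: right-shift amounts decode as Add - Imm, and left-shift amounts decode as (Imm + Add) & (Add - 1), where Add is the element width in bits. A standalone sketch of those two formulas, with names invented for the example:

#include <cassert>

// Sketch only: the arithmetic behind DecodeVecShiftRImm/DecodeVecShiftLImm.
static unsigned decodeVecShiftRight(unsigned Imm, unsigned ElemBits) {
  return ElemBits - Imm;                    // e.g. ElemBits=64, Imm=1 -> shift by 63
}
static unsigned decodeVecShiftLeft(unsigned Imm, unsigned ElemBits) {
  return (Imm + ElemBits) & (ElemBits - 1); // wraps modulo the element width
}

int main() {
  assert(decodeVecShiftRight(1, 64) == 63);
  assert(decodeVecShiftLeft(3, 64) == 3);
  return 0;
}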
- unsigned Rd = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); - - if (IsToVec) { - DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); - } else { - DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); - } - - // Add the lane - Inst.addOperand(MCOperand::CreateImm(1)); - - return Success; -} - -static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm(Add - Imm)); - return Success; -} - -static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); - return Success; -} - -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); -} - -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); -} - -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); -} - -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned shiftHi = fieldFromInstruction(insn, 22, 2); - unsigned shiftLo = fieldFromInstruction(insn, 10, 6); - unsigned shift = (shiftHi << 6) | shiftLo; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrs: - case ARM64::ADDSWrs: - case ARM64::SUBWrs: - case ARM64::SUBSWrs: - // if shift == '11' then ReservedValue() - if (shiftHi == 0x3) - return Fail; - // Deliberate fallthrough - case ARM64::ANDWrs: - case ARM64::ANDSWrs: - case ARM64::BICWrs: - case ARM64::BICSWrs: - case ARM64::ORRWrs: - case ARM64::ORNWrs: - case ARM64::EORWrs: 
- case ARM64::EONWrs: { - // if sf == '0' and imm6<5> == '1' then ReservedValue() - if (shiftLo >> 5 == 1) - return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - case ARM64::ADDXrs: - case ARM64::ADDSXrs: - case ARM64::SUBXrs: - case ARM64::SUBSXrs: - // if shift == '11' then ReservedValue() - if (shiftHi == 0x3) - return Fail; - // Deliberate fallthrough - case ARM64::ANDXrs: - case ARM64::ANDSXrs: - case ARM64::BICXrs: - case ARM64::BICSXrs: - case ARM64::ORRXrs: - case ARM64::ORNXrs: - case ARM64::EORXrs: - case ARM64::EONXrs: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned imm = fieldFromInstruction(insn, 5, 16); - unsigned shift = fieldFromInstruction(insn, 21, 2); - shift <<= 4; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::MOVZWi: - case ARM64::MOVNWi: - case ARM64::MOVKWi: - if (shift & (1U << 5)) - return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::MOVZXi: - case ARM64::MOVNXi: - case ARM64::MOVKXi: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - break; - } - - if (Inst.getOpcode() == ARM64::MOVKWi || Inst.getOpcode() == ARM64::MOVKXi) - Inst.addOperand(Inst.getOperand(0)); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned offset = fieldFromInstruction(insn, 10, 12); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFMui: - // Rt is an immediate in prefetch. 
- Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STRBBui: - case ARM64::LDRBBui: - case ARM64::LDRSBWui: - case ARM64::STRHHui: - case ARM64::LDRHHui: - case ARM64::LDRSHWui: - case ARM64::STRWui: - case ARM64::LDRWui: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSBXui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::STRXui: - case ARM64::LDRXui: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRQui: - case ARM64::STRQui: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRDui: - case ARM64::STRDui: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSui: - case ARM64::STRSui: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHui: - case ARM64::STRHui: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBui: - case ARM64::STRBui: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(offset)); - return Success; -} - -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - int64_t offset = fieldFromInstruction(insn, 12, 9); - - // offset is a 9-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (9 - 1))) - offset |= ~((1LL << 9) - 1); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFUMi: - // Rt is an immediate in prefetch. - Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STURBBi: - case ARM64::LDURBBi: - case ARM64::LDURSBWi: - case ARM64::STURHHi: - case ARM64::LDURHHi: - case ARM64::LDURSHWi: - case ARM64::STURWi: - case ARM64::LDURWi: - case ARM64::LDTRSBWi: - case ARM64::LDTRSHWi: - case ARM64::STTRWi: - case ARM64::LDTRWi: - case ARM64::STTRHi: - case ARM64::LDTRHi: - case ARM64::LDTRBi: - case ARM64::STTRBi: - case ARM64::LDRSBWpre: - case ARM64::LDRSHWpre: - case ARM64::STRBBpre: - case ARM64::LDRBBpre: - case ARM64::STRHHpre: - case ARM64::LDRHHpre: - case ARM64::STRWpre: - case ARM64::LDRWpre: - case ARM64::LDRSBWpost: - case ARM64::LDRSHWpost: - case ARM64::STRBBpost: - case ARM64::LDRBBpost: - case ARM64::STRHHpost: - case ARM64::LDRHHpost: - case ARM64::STRWpost: - case ARM64::LDRWpost: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURSBXi: - case ARM64::LDURSHXi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::LDURXi: - case ARM64::LDTRSBXi: - case ARM64::LDTRSHXi: - case ARM64::LDTRSWi: - case ARM64::STTRXi: - case ARM64::LDTRXi: - case ARM64::LDRSBXpre: - case ARM64::LDRSHXpre: - case ARM64::STRXpre: - case ARM64::LDRSWpre: - case ARM64::LDRXpre: - case ARM64::LDRSBXpost: - case ARM64::LDRSHXpost: - case ARM64::STRXpost: - case ARM64::LDRSWpost: - case ARM64::LDRXpost: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURQi: - case ARM64::STURQi: - case ARM64::LDRQpre: - case ARM64::STRQpre: - case ARM64::LDRQpost: - case ARM64::STRQpost: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURDi: - case ARM64::STURDi: - case ARM64::LDRDpre: - case ARM64::STRDpre: - case ARM64::LDRDpost: - case ARM64::STRDpost: - DecodeFPR64RegisterClass(Inst, Rt, Addr, 
Decoder); - break; - case ARM64::LDURSi: - case ARM64::STURSi: - case ARM64::LDRSpre: - case ARM64::STRSpre: - case ARM64::LDRSpost: - case ARM64::STRSpost: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURHi: - case ARM64::STURHi: - case ARM64::LDRHpre: - case ARM64::STRHpre: - case ARM64::LDRHpost: - case ARM64::STRHpost: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURBi: - case ARM64::STURBi: - case ARM64::LDRBpre: - case ARM64::STRBpre: - case ARM64::LDRBpost: - case ARM64::STRBpost: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - - bool IsLoad = fieldFromInstruction(insn, 22, 1); - bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; - bool IsFP = fieldFromInstruction(insn, 26, 1); - - // Cannot write back to a transfer register (but xzr != sp). - if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - unsigned Rs = fieldFromInstruction(insn, 16, 5); - - unsigned Opcode = Inst.getOpcode(); - switch (Opcode) { - default: - return Fail; - case ARM64::STLXRW: - case ARM64::STLXRB: - case ARM64::STLXRH: - case ARM64::STXRW: - case ARM64::STXRB: - case ARM64::STXRH: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARW: - case ARM64::LDARB: - case ARM64::LDARH: - case ARM64::LDAXRW: - case ARM64::LDAXRB: - case ARM64::LDAXRH: - case ARM64::LDXRW: - case ARM64::LDXRB: - case ARM64::LDXRH: - case ARM64::STLRW: - case ARM64::STLRB: - case ARM64::STLRH: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXRX: - case ARM64::STXRX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARX: - case ARM64::LDAXRX: - case ARM64::LDXRX: - case ARM64::STLRX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXPW: - case ARM64::STXPW: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPW: - case ARM64::LDXPW: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::STLXPX: - case ARM64::STXPX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPX: - case ARM64::LDXPX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - - // You shouldn't load to the same register twice in an instruction... 
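A recurring pattern in these load/store decoders, visible both in the writeback check above and in the duplicate-register check that follows, is to keep decoding architecturally unpredictable encodings but report them as SoftFail rather than Fail: writeback into a transfer register (unless the base is register 31, i.e. SP) and loading the same register twice in a pair both fall into that bucket. A standalone sketch of those predicates, with invented names:

#include <cassert>

// Sketch only: the "unpredictable but still decodable" conditions that make
// the deleted decoders return SoftFail instead of Fail.
static bool writebackOverlapsTransfer(unsigned Rn, unsigned Rt, unsigned Rt2) {
  // Register 31 as a base means SP, which never aliases Rt/Rt2
  // (in transfer positions, 31 reads as XZR/WZR).
  return Rn != 31 && (Rt == Rn || Rt2 == Rn);
}
static bool pairLoadsSameRegister(unsigned Rt, unsigned Rt2) {
  return Rt == Rt2;
}

int main() {
  assert(writebackOverlapsTransfer(0, 0, 1));   // ldr x0, [x0], #8 -> unpredictable
  assert(!writebackOverlapsTransfer(31, 0, 1)); // SP as base is fine
  assert(pairLoadsSameRegister(2, 2));          // ldp x2, x2, [x0] -> unpredictable
  return 0;
}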
- if ((Opcode == ARM64::LDAXPW || Opcode == ARM64::LDXPW || - Opcode == ARM64::LDAXPX || Opcode == ARM64::LDXPX) && - Rt == Rt2) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - int64_t offset = fieldFromInstruction(insn, 15, 7); - bool IsLoad = fieldFromInstruction(insn, 22, 1); - - // offset is a 7-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (7 - 1))) - offset |= ~((1LL << 7) - 1); - - unsigned Opcode = Inst.getOpcode(); - bool NeedsDisjointWritebackTransfer = false; - switch (Opcode) { - default: - return Fail; - case ARM64::LDPXpost: - case ARM64::STPXpost: - case ARM64::LDPSWpost: - case ARM64::LDPXpre: - case ARM64::STPXpre: - case ARM64::LDPSWpre: - NeedsDisjointWritebackTransfer = true; - // Fallthrough - case ARM64::LDNPXi: - case ARM64::STNPXi: - case ARM64::LDPXi: - case ARM64::STPXi: - case ARM64::LDPSWi: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDPWpost: - case ARM64::STPWpost: - case ARM64::LDPWpre: - case ARM64::STPWpre: - NeedsDisjointWritebackTransfer = true; - // Fallthrough - case ARM64::LDNPWi: - case ARM64::STNPWi: - case ARM64::LDPWi: - case ARM64::STPWi: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPQi: - case ARM64::STNPQi: - case ARM64::LDPQpost: - case ARM64::STPQpost: - case ARM64::LDPQi: - case ARM64::STPQi: - case ARM64::LDPQpre: - case ARM64::STPQpre: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPDi: - case ARM64::STNPDi: - case ARM64::LDPDpost: - case ARM64::STPDpost: - case ARM64::LDPDi: - case ARM64::STPDi: - case ARM64::LDPDpre: - case ARM64::STPDpre: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPSi: - case ARM64::STNPSi: - case ARM64::LDPSpost: - case ARM64::STPSpost: - case ARM64::LDPSi: - case ARM64::STPSi: - case ARM64::LDPSpre: - case ARM64::STPSpre: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - - // You shouldn't load to the same register twice in an instruction... - if (IsLoad && Rt == Rt2) - return SoftFail; - - // ... or do any operation that writes-back to a transfer register. But note - // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. - if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned extendHi = fieldFromInstruction(insn, 13, 3); - unsigned extendLo = fieldFromInstruction(insn, 12, 1); - unsigned extend = (extendHi << 1) | extendLo; - - // All RO load-store instructions are undefined if option == 00x or 10x. 
- if (extend >> 2 == 0x0 || extend >> 2 == 0x2) - return Fail; - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::LDRSWro: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRXro: - case ARM64::STRXro: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRWro: - case ARM64::STRWro: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRQro: - case ARM64::STRQro: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRDro: - case ARM64::STRDro: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSro: - case ARM64::STRSro: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHro: - case ARM64::STRHro: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBro: - case ARM64::STRBro: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBBro: - case ARM64::STRBBro: - case ARM64::LDRSBWro: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHHro: - case ARM64::STRHHro: - case ARM64::LDRSHWro: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSHXro: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSBXro: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::PRFMro: - Inst.addOperand(MCOperand::CreateImm(Rt)); - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(extend)); - return Success; -} - -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned extend = fieldFromInstruction(insn, 10, 6); - - unsigned shift = extend & 0x7; - if (shift > 4) - return Fail; - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrx: - case ARM64::SUBWrx: - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSWrx: - case ARM64::SUBSWrx: - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx: - case ARM64::SUBXrx: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSXrx: - case ARM64::SUBSXrx: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx64: - case ARM64::SUBXrx64: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::SUBSXrx64: - case ARM64::ADDSXrx64: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(extend)); - return Success; -} - -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - 
uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - unsigned imm; - - if (Datasize) { - if (Inst.getOpcode() == ARM64::ANDSXri) - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 13); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 64)) - return Fail; - } else { - if (Inst.getOpcode() == ARM64::ANDSWri) - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 12); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 32)) - return Fail; - } - Inst.addOperand(MCOperand::CreateImm(imm)); - return Success; -} - -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - if (Inst.getOpcode() == ARM64::MOVID) - DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - - switch (Inst.getOpcode()) { - default: - break; - case ARM64::MOVIv4i16: - case ARM64::MOVIv8i16: - case ARM64::MVNIv4i16: - case ARM64::MVNIv8i16: - case ARM64::MOVIv2i32: - case ARM64::MOVIv4i32: - case ARM64::MVNIv2i32: - case ARM64::MVNIv4i32: - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - break; - case ARM64::MOVIv2s_msl: - case ARM64::MOVIv4s_msl: - case ARM64::MVNIv2s_msl: - case ARM64::MVNIv4s_msl: - Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); - break; - } - - return Success; -} - -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - // Tied operands added twice. - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - - return Success; -} - -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; - imm |= fieldFromInstruction(insn, 29, 2); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend the 21-bit immediate. 
- if (imm & (1 << (21 - 1))) - imm |= ~((1LL << 21) - 1); - - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Imm = fieldFromInstruction(insn, 10, 14); - unsigned S = fieldFromInstruction(insn, 29, 1); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - - unsigned ShifterVal = (Imm >> 12) & 3; - unsigned ImmVal = Imm & 0xFFF; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - if (ShifterVal != 0 && ShifterVal != 1) - return Fail; - - if (Datasize) { - if (Rd == 31 && !S) - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - } else { - if (Rd == 31 && !S) - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - } - - if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); - return Success; -} - -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - int64_t imm = fieldFromInstruction(insn, 0, 26); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend the 26-bit immediate. - if (imm & (1 << (26 - 1))) - imm |= ~((1LL << 26) - 1); - - if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - uint64_t op1 = fieldFromInstruction(insn, 16, 3); - uint64_t op2 = fieldFromInstruction(insn, 5, 3); - uint64_t crm = fieldFromInstruction(insn, 8, 4); - - uint64_t pstate_field = (op1 << 3) | op2; - - Inst.addOperand(MCOperand::CreateImm(pstate_field)); - Inst.addOperand(MCOperand::CreateImm(crm)); - - bool ValidNamed; - (void)ARM64PState::PStateMapper().toString(pstate_field, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - uint64_t Rt = fieldFromInstruction(insn, 0, 5); - uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; - bit |= fieldFromInstruction(insn, 19, 5); - int64_t dst = fieldFromInstruction(insn, 5, 14); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 14-bit immediate. 
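- // (Worked example: a raw field of 0x2000 has bit 13 set and becomes -8192
- // here; the displacement handed to the symbolizer below is dst << 2,
- // i.e. -32768 bytes.)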
- if (dst & (1 << (14 - 1))) - dst |= ~((1LL << 14) - 1); - - if (fieldFromInstruction(insn, 31, 1) == 0) - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(bit)); - if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(dst)); - - return Success; -} diff --git a/lib/Target/ARM64/Disassembler/CMakeLists.txt b/lib/Target/ARM64/Disassembler/CMakeLists.txt deleted file mode 100644 index 43ade66be144..000000000000 --- a/lib/Target/ARM64/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Disassembler - ARM64Disassembler.cpp - ARM64ExternalSymbolizer.cpp - ) -# workaround for hanging compilation on MSVC8, 9 and 10 -#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -#set_property( -# SOURCE ARMDisassembler.cpp -# PROPERTY COMPILE_FLAGS "/Od" -# ) -#endif() -add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen) diff --git a/lib/Target/ARM64/Disassembler/LLVMBuild.txt b/lib/Target/ARM64/Disassembler/LLVMBuild.txt deleted file mode 100644 index 5bbe88ddb49a..000000000000 --- a/lib/Target/ARM64/Disassembler/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Disassembler -parent = ARM64 -required_libraries = ARM64Info ARM64Utils MC Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/Disassembler/Makefile b/lib/Target/ARM64/Disassembler/Makefile deleted file mode 100644 index 479d00c2494b..000000000000 --- a/lib/Target/ARM64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp deleted file mode 100644 index adfcb46ac403..000000000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp +++ /dev/null @@ -1,1329 +0,0 @@ -//===-- ARM64InstPrinter.cpp - Convert ARM64 MCInst to assembly syntax ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstPrinter.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter.inc" -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter1.inc" - -ARM64InstPrinter::ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. - setAvailableFeatures(STI.getFeatureBits()); -} - -ARM64AppleInstPrinter::ARM64AppleInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : ARM64InstPrinter(MAI, MII, MRI, STI) {} - -void ARM64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // This is for .cfi directives. - OS << getRegisterName(RegNo); -} - -void ARM64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - // Check for special encodings and print the canonical alias instead. - - unsigned Opcode = MI->getOpcode(); - - if (Opcode == ARM64::SYSxt) - if (printSysAlias(MI, O)) { - printAnnotation(O, Annot); - return; - } - - // SBFM/UBFM should print to a nicer aliased form if possible. - if (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri || - Opcode == ARM64::UBFMXri || Opcode == ARM64::UBFMWri) { - const MCOperand &Op0 = MI->getOperand(0); - const MCOperand &Op1 = MI->getOperand(1); - const MCOperand &Op2 = MI->getOperand(2); - const MCOperand &Op3 = MI->getOperand(3); - - bool IsSigned = (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri); - bool Is64Bit = (Opcode == ARM64::SBFMXri || Opcode == ARM64::UBFMXri); - if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - - switch (Op3.getImm()) { - default: - break; - case 7: - if (IsSigned) - AsmMnemonic = "sxtb"; - else if (!Is64Bit) - AsmMnemonic = "uxtb"; - break; - case 15: - if (IsSigned) - AsmMnemonic = "sxth"; - else if (!Is64Bit) - AsmMnemonic = "uxth"; - break; - case 31: - // *xtw is only valid for signed 64-bit operations. - if (Is64Bit && IsSigned) - AsmMnemonic = "sxtw"; - break; - } - - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); - printAnnotation(O, Annot); - return; - } - } - - // All immediate shifts are aliases, implemented using the Bitfield - // instruction. In all cases the immediate shift amount shift must be in - // the range 0 to (reg.size -1). 
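- // (For illustration: "lsl x0, x1, #3" is encoded as UBFMXri with immr == 61
- // and imms == 60; the "imms + 1 == immr" check below matches it and the
- // shift amount is recovered as 63 - imms == 3.)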
- if (Op2.isImm() && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - int shift = 0; - int64_t immr = Op2.getImm(); - int64_t imms = Op3.getImm(); - if (Opcode == ARM64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { - AsmMnemonic = "lsl"; - shift = 31 - imms; - } else if (Opcode == ARM64::UBFMXri && imms != 0x3f && - ((imms + 1 == immr))) { - AsmMnemonic = "lsl"; - shift = 63 - imms; - } else if (Opcode == ARM64::UBFMWri && imms == 0x1f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::UBFMXri && imms == 0x3f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::SBFMWri && imms == 0x1f) { - AsmMnemonic = "asr"; - shift = immr; - } else if (Opcode == ARM64::SBFMXri && imms == 0x3f) { - AsmMnemonic = "asr"; - shift = immr; - } - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; - printAnnotation(O, Annot); - return; - } - } - - // SBFIZ/UBFIZ aliases - if (Op2.getImm() > Op3.getImm()) { - O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - // Otherwise SBFX/UBFX is the preferred form - O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - if (Opcode == ARM64::BFMXri || Opcode == ARM64::BFMWri) { - const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 - const MCOperand &Op2 = MI->getOperand(2); - int ImmR = MI->getOperand(3).getImm(); - int ImmS = MI->getOperand(4).getImm(); - - // BFI alias - if (ImmS < ImmR) { - int BitWidth = Opcode == ARM64::BFMXri ? 64 : 32; - int LSB = (BitWidth - ImmR) % BitWidth; - int Width = ImmS + 1; - O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " - << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - int LSB = ImmR; - int Width = ImmS - ImmR + 1; - // Otherwise BFXIL the preferred form - O << "\tbfxil\t" - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) - << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift - // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be - // printed. 
- if ((Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi || - Opcode == ARM64::MOVNXi || Opcode == ARM64::MOVNWi) && - MI->getOperand(1).isExpr()) { - if (Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi) - O << "\tmovz\t"; - else - O << "\tmovn\t"; - - O << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(1).getExpr(); - return; - } - - if ((Opcode == ARM64::MOVKXi || Opcode == ARM64::MOVKWi) && - MI->getOperand(2).isExpr()) { - O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(2).getExpr(); - return; - } - - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - - printAnnotation(O, Annot); -} - -static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, - bool &IsTbx) { - switch (Opcode) { - case ARM64::TBXv8i8One: - case ARM64::TBXv8i8Two: - case ARM64::TBXv8i8Three: - case ARM64::TBXv8i8Four: - IsTbx = true; - Layout = ".8b"; - return true; - case ARM64::TBLv8i8One: - case ARM64::TBLv8i8Two: - case ARM64::TBLv8i8Three: - case ARM64::TBLv8i8Four: - IsTbx = false; - Layout = ".8b"; - return true; - case ARM64::TBXv16i8One: - case ARM64::TBXv16i8Two: - case ARM64::TBXv16i8Three: - case ARM64::TBXv16i8Four: - IsTbx = true; - Layout = ".16b"; - return true; - case ARM64::TBLv16i8One: - case ARM64::TBLv16i8Two: - case ARM64::TBLv16i8Three: - case ARM64::TBLv16i8Four: - IsTbx = false; - Layout = ".16b"; - return true; - default: - return false; - } -} - -struct LdStNInstrDesc { - unsigned Opcode; - const char *Mnemonic; - const char *Layout; - int ListOperand; - bool HasLane; - int NaturalOffset; -}; - -static LdStNInstrDesc LdStNInstInfo[] = { - { ARM64::LD1i8, "ld1", ".b", 1, true, 0 }, - { ARM64::LD1i16, "ld1", ".h", 1, true, 0 }, - { ARM64::LD1i32, "ld1", ".s", 1, true, 0 }, - { ARM64::LD1i64, "ld1", ".d", 1, true, 0 }, - { ARM64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, - { ARM64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, - { ARM64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, - { ARM64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, - { ARM64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, - { ARM64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, - { ARM64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, - { ARM64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, - { ARM64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, - { ARM64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, - { ARM64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, - { ARM64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, - { ARM64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, - { ARM64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, - { ARM64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, - { ARM64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, - { ARM64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, - { ARM64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, - { ARM64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, - { ARM64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, - { ARM64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, - { ARM64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, - { ARM64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, - { ARM64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, - { ARM64::LD1Onev8b_POST, 
"ld1", ".8b", 1, false, 8 }, - { ARM64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, - { ARM64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, - { ARM64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, - { ARM64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, - { ARM64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, - { ARM64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, - { ARM64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, - { ARM64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, - { ARM64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, - { ARM64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, - { ARM64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, - { ARM64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, - { ARM64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, - { ARM64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, - { ARM64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, - { ARM64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, - { ARM64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, - { ARM64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, - { ARM64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, - { ARM64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, - { ARM64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, - { ARM64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, - { ARM64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, - { ARM64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, - { ARM64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, - { ARM64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, - { ARM64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, - { ARM64::LD2i8, "ld2", ".b", 1, true, 0 }, - { ARM64::LD2i16, "ld2", ".h", 1, true, 0 }, - { ARM64::LD2i32, "ld2", ".s", 1, true, 0 }, - { ARM64::LD2i64, "ld2", ".d", 1, true, 0 }, - { ARM64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, - { ARM64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, - { ARM64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, - { ARM64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, - { ARM64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, - { ARM64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, - { ARM64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, - { ARM64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, - { ARM64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, - { ARM64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, - { 
ARM64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, - { ARM64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, - { ARM64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, - { ARM64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, - { ARM64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, - { ARM64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, - { ARM64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, - { ARM64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, - { ARM64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, - { ARM64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, - { ARM64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, - { ARM64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, - { ARM64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, - { ARM64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, - { ARM64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, - { ARM64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, - { ARM64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, - { ARM64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, - { ARM64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, - { ARM64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, - { ARM64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, - { ARM64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, - { ARM64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, - { ARM64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, - { ARM64::LD3i8, "ld3", ".b", 1, true, 0 }, - { ARM64::LD3i16, "ld3", ".h", 1, true, 0 }, - { ARM64::LD3i32, "ld3", ".s", 1, true, 0 }, - { ARM64::LD3i64, "ld3", ".d", 1, true, 0 }, - { ARM64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, - { ARM64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, - { ARM64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, - { ARM64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, - { ARM64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, - { ARM64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, - { ARM64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, - { ARM64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, - { ARM64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, - { ARM64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, - { ARM64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, - { ARM64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, - { ARM64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, - { ARM64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, - { ARM64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, - { ARM64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, - { ARM64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, - { ARM64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, - { ARM64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, - { ARM64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, - { ARM64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, - { ARM64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, - { ARM64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, - { ARM64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, - { ARM64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, - { ARM64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, - { ARM64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, - { ARM64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, - { ARM64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, - { ARM64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, - { ARM64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, - { ARM64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, - { ARM64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, - { ARM64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, - { ARM64::LD4i8, "ld4", ".b", 1, true, 0 }, - { ARM64::LD4i16, "ld4", ".h", 1, true, 0 }, - { ARM64::LD4i32, "ld4", ".s", 1, true, 0 }, - { ARM64::LD4i64, "ld4", ".d", 1, true, 0 }, - { ARM64::LD4i8_POST, "ld4", ".b", 2, true, 4 
}, - { ARM64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, - { ARM64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, - { ARM64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, - { ARM64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, - { ARM64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, - { ARM64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, - { ARM64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, - { ARM64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, - { ARM64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, - { ARM64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, - { ARM64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, - { ARM64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, - { ARM64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, - { ARM64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, - { ARM64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, - { ARM64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, - { ARM64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, - { ARM64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, - { ARM64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, - { ARM64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, - { ARM64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, - { ARM64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, - { ARM64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, - { ARM64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, - { ARM64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, - { ARM64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, - { ARM64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, - { ARM64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, - { ARM64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, - { ARM64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, - { ARM64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, - { ARM64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, - { ARM64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, - { ARM64::ST1i8, "st1", ".b", 0, true, 0 }, - { ARM64::ST1i16, "st1", ".h", 0, true, 0 }, - { ARM64::ST1i32, "st1", ".s", 0, true, 0 }, - { ARM64::ST1i64, "st1", ".d", 0, true, 0 }, - { ARM64::ST1i8_POST, "st1", ".b", 1, true, 1 }, - { ARM64::ST1i16_POST, "st1", ".h", 1, true, 2 }, - { ARM64::ST1i32_POST, "st1", ".s", 1, true, 4 }, - { ARM64::ST1i64_POST, "st1", ".d", 1, true, 8 }, - { ARM64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, - { ARM64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, - { ARM64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, - { ARM64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, - { ARM64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, - { ARM64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, - { ARM64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, - { ARM64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, - { ARM64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, - { ARM64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 
}, - { ARM64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, - { ARM64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, - { ARM64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, - { ARM64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, - { ARM64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, - { ARM64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, - { ARM64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, - { ARM64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, - { ARM64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, - { ARM64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, - { ARM64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, - { ARM64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, - { ARM64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, - { ARM64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, - { ARM64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, - { ARM64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, - { ARM64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, - { ARM64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, - { ARM64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, - { ARM64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, - { ARM64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, - { ARM64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, - { ARM64::ST2i8, "st2", ".b", 0, true, 0 }, - { ARM64::ST2i16, "st2", ".h", 0, true, 0 }, - { ARM64::ST2i32, "st2", ".s", 0, true, 0 }, - { ARM64::ST2i64, "st2", ".d", 0, true, 0 }, - { ARM64::ST2i8_POST, "st2", ".b", 1, true, 2 }, - { ARM64::ST2i16_POST, "st2", ".h", 1, true, 4 }, - { ARM64::ST2i32_POST, "st2", ".s", 1, true, 8 }, - { ARM64::ST2i64_POST, "st2", ".d", 1, true, 16 }, - { ARM64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, - { ARM64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, - { ARM64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, - { ARM64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, - { ARM64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, - { ARM64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, - { ARM64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, - { ARM64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, - { ARM64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, - { ARM64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, - { ARM64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, - { ARM64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, - { ARM64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, - { ARM64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, - { ARM64::ST3i8, "st3", ".b", 0, true, 0 }, - { ARM64::ST3i16, "st3", ".h", 0, true, 0 }, - { ARM64::ST3i32, "st3", ".s", 0, true, 0 }, - { ARM64::ST3i64, "st3", ".d", 0, true, 0 }, - { ARM64::ST3i8_POST, "st3", ".b", 1, true, 3 }, - { ARM64::ST3i16_POST, "st3", ".h", 1, true, 
6 }, - { ARM64::ST3i32_POST, "st3", ".s", 1, true, 12 }, - { ARM64::ST3i64_POST, "st3", ".d", 1, true, 24 }, - { ARM64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, - { ARM64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, - { ARM64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, - { ARM64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, - { ARM64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, - { ARM64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, - { ARM64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, - { ARM64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, - { ARM64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, - { ARM64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, - { ARM64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, - { ARM64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, - { ARM64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, - { ARM64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, - { ARM64::ST4i8, "st4", ".b", 0, true, 0 }, - { ARM64::ST4i16, "st4", ".h", 0, true, 0 }, - { ARM64::ST4i32, "st4", ".s", 0, true, 0 }, - { ARM64::ST4i64, "st4", ".d", 0, true, 0 }, - { ARM64::ST4i8_POST, "st4", ".b", 1, true, 4 }, - { ARM64::ST4i16_POST, "st4", ".h", 1, true, 8 }, - { ARM64::ST4i32_POST, "st4", ".s", 1, true, 16 }, - { ARM64::ST4i64_POST, "st4", ".d", 1, true, 32 }, - { ARM64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, - { ARM64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, - { ARM64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, - { ARM64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, - { ARM64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, - { ARM64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, - { ARM64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, - { ARM64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, - { ARM64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, - { ARM64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, - { ARM64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, - { ARM64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, - { ARM64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, - { ARM64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, -}; - -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { - unsigned Idx; - for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) - if (LdStNInstInfo[Idx].Opcode == Opcode) - return &LdStNInstInfo[Idx]; - - return nullptr; -} - -void ARM64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - unsigned Opcode = MI->getOpcode(); - StringRef Layout, Mnemonic; - - bool IsTbx; - if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { - O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' - << getRegisterName(MI->getOperand(0).getReg(), ARM64::vreg) << ", "; - - unsigned ListOpNum = IsTbx ? 2 : 1; - printVectorList(MI, ListOpNum, O, ""); - - O << ", " - << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), ARM64::vreg); - printAnnotation(O, Annot); - return; - } - - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { - O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; - - // Now onto the operands: first a vector list with possible lane - // specifier. E.g. { v0 }[2] - int OpNum = LdStDesc->ListOperand; - printVectorList(MI, OpNum++, O, ""); - - if (LdStDesc->HasLane) - O << '[' << MI->getOperand(OpNum++).getImm() << ']'; - - // Next the address: [xN] - unsigned AddrReg = MI->getOperand(OpNum++).getReg(); - O << ", [" << getRegisterName(AddrReg) << ']'; - - // Finally, there might be a post-indexed offset. 
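- // (E.g. the Apple-syntax output "ld1.16b { v0 }, [x0], #16" prints the
- // natural offset because the offset register is XZR; a genuine register
- // operand prints as "..., [x0], x2" instead.)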
- if (LdStDesc->NaturalOffset != 0) { - unsigned Reg = MI->getOperand(OpNum++).getReg(); - if (Reg != ARM64::XZR) - O << ", " << getRegisterName(Reg); - else { - assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); - O << ", #" << LdStDesc->NaturalOffset; - } - } - - printAnnotation(O, Annot); - return; - } - - ARM64InstPrinter::printInst(MI, O, Annot); -} - -bool ARM64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { -#ifndef NDEBUG - unsigned Opcode = MI->getOpcode(); - assert(Opcode == ARM64::SYSxt && "Invalid opcode for SYS alias!"); -#endif - - const char *Asm = nullptr; - const MCOperand &Op1 = MI->getOperand(0); - const MCOperand &Cn = MI->getOperand(1); - const MCOperand &Cm = MI->getOperand(2); - const MCOperand &Op2 = MI->getOperand(3); - - unsigned Op1Val = Op1.getImm(); - unsigned CnVal = Cn.getImm(); - unsigned CmVal = Cm.getImm(); - unsigned Op2Val = Op2.getImm(); - - if (CnVal == 7) { - switch (CmVal) { - default: - break; - - // IC aliases - case 1: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tialluis"; - break; - case 5: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tiallu"; - else if (Op1Val == 3 && Op2Val == 1) - Asm = "ic\tivau"; - break; - - // DC aliases - case 4: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tzva"; - break; - case 6: - if (Op1Val == 0 && Op2Val == 1) - Asm = "dc\tivac"; - if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tisw"; - break; - case 10: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcsw"; - break; - case 11: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvau"; - break; - case 14: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcivac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcisw"; - break; - - // AT aliases - case 8: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e1r"; break; - case 1: Asm = "at\ts1e1w"; break; - case 2: Asm = "at\ts1e0r"; break; - case 3: Asm = "at\ts1e0w"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e2r"; break; - case 1: Asm = "at\ts1e2w"; break; - case 4: Asm = "at\ts12e1r"; break; - case 5: Asm = "at\ts12e1w"; break; - case 6: Asm = "at\ts12e0r"; break; - case 7: Asm = "at\ts12e0w"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e3r"; break; - case 1: Asm = "at\ts1e3w"; break; - } - break; - } - break; - } - } else if (CnVal == 8) { - // TLBI aliases - switch (CmVal) { - default: - break; - case 3: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1is"; break; - case 1: Asm = "tlbi\tvae1is"; break; - case 2: Asm = "tlbi\taside1is"; break; - case 3: Asm = "tlbi\tvaae1is"; break; - case 5: Asm = "tlbi\tvale1is"; break; - case 7: Asm = "tlbi\tvaale1is"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2is"; break; - case 1: Asm = "tlbi\tvae2is"; break; - case 4: Asm = "tlbi\talle1is"; break; - case 5: Asm = "tlbi\tvale2is"; break; - case 6: Asm = "tlbi\tvmalls12e1is"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3is"; break; - case 1: Asm = "tlbi\tvae3is"; break; - case 5: Asm = "tlbi\tvale3is"; break; - } - break; - } - break; - case 0: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1is"; break; - case 5: Asm = 
"tlbi\tipas2le1is"; break; - } - break; - } - break; - case 4: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1"; break; - case 5: Asm = "tlbi\tipas2le1"; break; - } - break; - } - break; - case 7: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1"; break; - case 1: Asm = "tlbi\tvae1"; break; - case 2: Asm = "tlbi\taside1"; break; - case 3: Asm = "tlbi\tvaae1"; break; - case 5: Asm = "tlbi\tvale1"; break; - case 7: Asm = "tlbi\tvaale1"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2"; break; - case 1: Asm = "tlbi\tvae2"; break; - case 4: Asm = "tlbi\talle1"; break; - case 5: Asm = "tlbi\tvale2"; break; - case 6: Asm = "tlbi\tvmalls12e1"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3"; break; - case 1: Asm = "tlbi\tvae3"; break; - case 5: Asm = "tlbi\tvale3"; break; - } - break; - } - break; - } - } - - if (Asm) { - unsigned Reg = MI->getOperand(4).getReg(); - - O << '\t' << Asm; - if (StringRef(Asm).lower().find("all") == StringRef::npos) - O << ", " << getRegisterName(Reg); - } - - return Asm != nullptr; -} - -void ARM64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << *Op.getExpr(); - } -} - -void ARM64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - O << format("#%#llx", Op.getImm()); -} - -void ARM64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, - unsigned Imm, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (Reg == ARM64::XZR) - O << "#" << Imm; - else - O << getRegisterName(Reg); - } else - assert(0 && "unknown operand kind in printPostIncOperand64"); -} - -void ARM64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg() && "Non-register vreg operand!"); - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg, ARM64::vreg); -} - -void ARM64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); - O << "c" << Op.getImm(); -} - -void ARM64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.isImm()) { - unsigned Val = (MO.getImm() & 0xfff); - assert(Val == MO.getImm() && "Add/sub immediate out of range!"); - unsigned Shift = - ARM64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << Val; - if (Shift != 0) - printShifter(MI, OpNum + 1, O); - - if (CommentStream) - *CommentStream << "=#" << (Val << Shift) << '\n'; - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - O << *MO.getExpr(); - printShifter(MI, OpNum + 1, O); - } -} - -void ARM64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 
32)); -} - -void ARM64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 64)); -} - -void ARM64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - // LSL #0 should not be printed. - if (ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - ARM64_AM::getShiftValue(Val) == 0) - return; - O << ", " << ARM64_AM::getShiftExtendName(ARM64_AM::getShiftType(Val)) << " #" - << ARM64_AM::getShiftValue(Val); -} - -void ARM64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printShifter(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printExtend(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getArithExtendType(Val); - unsigned ShiftVal = ARM64_AM::getArithShiftValue(Val); - - // If the destination or first source register operand is [W]SP, print - // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at - // all. - if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::UXTX) { - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src1 = MI->getOperand(1).getReg(); - if ( ((Dest == ARM64::SP || Src1 == ARM64::SP) && - ExtType == ARM64_AM::UXTX) || - ((Dest == ARM64::WSP || Src1 == ARM64::WSP) && - ExtType == ARM64_AM::UXTW) ) { - if (ShiftVal != 0) - O << ", lsl #" << ShiftVal; - return; - } - } - O << ", " << ARM64_AM::getShiftExtendName(ExtType); - if (ShiftVal != 0) - O << " #" << ShiftVal; -} - -void ARM64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(CC); -} - -void ARM64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(ARM64CC::getInvertedCondCode(CC)); -} - -void ARM64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; -} - -template -void ARM64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '#' << Scale * MI->getOperand(OpNum).getImm(); -} - -void ARM64InstPrinter::printAMIndexed(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO1 = MI->getOperand(OpNum + 1); - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MO1.isImm()) { - if (MO1.getImm() != 0) - O << ", #" << (MO1.getImm() * Scale); - } else { - assert(MO1.isExpr() && "Unexpected operand type!"); - O << ", " << *MO1.getExpr(); - } - O << ']'; -} - -void ARM64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO1 = MI->getOperand(OpNum + 1); - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MO1.isImm()) { - O << ", #" << (MO1.getImm() * Scale); - } else { - assert(MO1.isExpr() && "Unexpected operand type!"); - O << ", " << *MO1.getExpr(); - } - O << 
']'; -} - -void ARM64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned prfop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = ARM64PRFM::PRFMMapper().toString(prfop, Valid); - if (Valid) - O << Name; - else - O << '#' << prfop; -} - -void ARM64InstPrinter::printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, - raw_ostream &O, unsigned Scale) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #" - << Scale * MI->getOperand(OpNum + 1).getImm(); -} - -void ARM64InstPrinter::printMemoryRegOffset(const MCInst *MI, unsigned OpNum, - raw_ostream &O, int Scale) { - unsigned Val = MI->getOperand(OpNum + 2).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getMemExtendType(Val); - - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "; - if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::SXTW) - O << getRegisterName(getWRegFromXReg(MI->getOperand(OpNum + 1).getReg())); - else - O << getRegisterName(MI->getOperand(OpNum + 1).getReg()); - - bool DoShift = ARM64_AM::getMemDoShift(Val); - - if (ExtType == ARM64_AM::UXTX) { - if (DoShift) - O << ", lsl"; - } else - O << ", " << ARM64_AM::getShiftExtendName(ExtType); - - if (DoShift) - O << " #" << Log2_32(Scale); - - O << "]"; -} - -void ARM64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - float FPImm = MO.isFPImm() ? MO.getFPImm() : ARM64_AM::getFPImmFloat(MO.getImm()); - - // 8 decimal places are enough to perfectly represent permitted floats. - O << format("#%.8f", FPImm); -} - -static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { - while (Stride--) { - switch (Reg) { - default: - assert(0 && "Vector register expected!"); - case ARM64::Q0: Reg = ARM64::Q1; break; - case ARM64::Q1: Reg = ARM64::Q2; break; - case ARM64::Q2: Reg = ARM64::Q3; break; - case ARM64::Q3: Reg = ARM64::Q4; break; - case ARM64::Q4: Reg = ARM64::Q5; break; - case ARM64::Q5: Reg = ARM64::Q6; break; - case ARM64::Q6: Reg = ARM64::Q7; break; - case ARM64::Q7: Reg = ARM64::Q8; break; - case ARM64::Q8: Reg = ARM64::Q9; break; - case ARM64::Q9: Reg = ARM64::Q10; break; - case ARM64::Q10: Reg = ARM64::Q11; break; - case ARM64::Q11: Reg = ARM64::Q12; break; - case ARM64::Q12: Reg = ARM64::Q13; break; - case ARM64::Q13: Reg = ARM64::Q14; break; - case ARM64::Q14: Reg = ARM64::Q15; break; - case ARM64::Q15: Reg = ARM64::Q16; break; - case ARM64::Q16: Reg = ARM64::Q17; break; - case ARM64::Q17: Reg = ARM64::Q18; break; - case ARM64::Q18: Reg = ARM64::Q19; break; - case ARM64::Q19: Reg = ARM64::Q20; break; - case ARM64::Q20: Reg = ARM64::Q21; break; - case ARM64::Q21: Reg = ARM64::Q22; break; - case ARM64::Q22: Reg = ARM64::Q23; break; - case ARM64::Q23: Reg = ARM64::Q24; break; - case ARM64::Q24: Reg = ARM64::Q25; break; - case ARM64::Q25: Reg = ARM64::Q26; break; - case ARM64::Q26: Reg = ARM64::Q27; break; - case ARM64::Q27: Reg = ARM64::Q28; break; - case ARM64::Q28: Reg = ARM64::Q29; break; - case ARM64::Q29: Reg = ARM64::Q30; break; - case ARM64::Q30: Reg = ARM64::Q31; break; - // Vector lists can wrap around. - case ARM64::Q31: - Reg = ARM64::Q0; - break; - } - } - return Reg; -} - -void ARM64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O, StringRef LayoutSuffix) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - - O << "{ "; - - // Work out how many registers there are in the list (if there is an actual - // list). 
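- // (E.g. an operand from the QQ register class is a consecutive pair of Q
- // registers, so NumRegs becomes 2 and a two-element list such as
- // "{ v0.4s, v1.4s }" is printed.)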
- unsigned NumRegs = 1; - if (MRI.getRegClass(ARM64::DDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQRegClassID).contains(Reg)) - NumRegs = 2; - else if (MRI.getRegClass(ARM64::DDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQRegClassID).contains(Reg)) - NumRegs = 3; - else if (MRI.getRegClass(ARM64::DDDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQQRegClassID).contains(Reg)) - NumRegs = 4; - - // Now forget about the list and find out what the first register is. - if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::dsub0)) - Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::qsub0)) - Reg = FirstReg; - - // If it's a D-reg, we need to promote it to the equivalent Q-reg before - // printing (otherwise getRegisterName fails). - if (MRI.getRegClass(ARM64::FPR64RegClassID).contains(Reg)) { - const MCRegisterClass &FPR128RC = MRI.getRegClass(ARM64::FPR128RegClassID); - Reg = MRI.getMatchingSuperReg(Reg, ARM64::dsub, &FPR128RC); - } - - for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { - O << getRegisterName(Reg, ARM64::vreg) << LayoutSuffix; - if (i + 1 != NumRegs) - O << ", "; - } - - O << " }"; -} - -void ARM64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - printVectorList(MI, OpNum, O, ""); -} - -template -void ARM64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - std::string Suffix("."); - if (NumLanes) - Suffix += itostr(NumLanes) + LaneKind; - else - Suffix += LaneKind; - - printVectorList(MI, OpNum, O, Suffix); -} - -void ARM64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "[" << MI->getOperand(OpNum).getImm() << "]"; -} - -void ARM64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 2); - return; - } - - // If the branch target is simply an address then print it in hex. - const MCConstantExpr *BranchTarget = - dyn_cast(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } else { - // Otherwise, just print the expression. - O << *MI->getOperand(OpNum).getExpr(); - } -} - -void ARM64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 12); - return; - } - - // Otherwise, just print the expression. 
- O << *MI->getOperand(OpNum).getExpr(); -} - -void ARM64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - unsigned Opcode = MI->getOpcode(); - - bool Valid; - StringRef Name; - if (Opcode == ARM64::ISB) - Name = ARM64ISB::ISBMapper().toString(Val, Valid); - else - Name = ARM64DB::DBarrierMapper().toString(Val, Valid); - if (Valid) - O << Name; - else - O << "#" << Val; -} - -void ARM64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - auto Mapper = ARM64SysReg::MRSMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); - - if (Valid) - O << StringRef(Name).upper(); -} - -void ARM64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - auto Mapper = ARM64SysReg::MSRMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); - - if (Valid) - O << StringRef(Name).upper(); -} - -void ARM64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - StringRef Name = ARM64PState::PStateMapper().toString(Val, Valid); - if (Valid) - O << StringRef(Name.str()).upper(); - else - O << "#" << Val; -} - -void ARM64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned RawVal = MI->getOperand(OpNo).getImm(); - uint64_t Val = ARM64_AM::decodeAdvSIMDModImmType10(RawVal); - O << format("#%#016llx", Val); -} diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h deleted file mode 100644 index 31818dff980b..000000000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h +++ /dev/null @@ -1,147 +0,0 @@ -//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64INSTPRINTER_H -#define ARM64INSTPRINTER_H - -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" - -namespace llvm { - -class MCOperand; - -class ARM64InstPrinter : public MCInstPrinter { -public: - ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - - // Autogenerated by tblgen. 
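- // (Their definitions are emitted into ARM64GenAsmWriter.inc and
- // ARM64GenAsmWriter1.inc, which ARM64InstPrinter.cpp includes.)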
- virtual void printInstruction(const MCInst *MI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - virtual StringRef getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); - -protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); - // Operand printers - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, - raw_ostream &O); - template - void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printPostIncOperand(MI, OpNo, Amount, O); - } - - void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAMIndexed(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - - template - void printAMIndexed(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, BitWidth / 8, O); - } - - template - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexedWB(MI, OpNum, BitWidth / 8, O); - } - - void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, raw_ostream &O, - unsigned Scale); - template - void printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printMemoryPostIndexed(MI, OpNum, O, BitWidth / 8); - } - - void printMemoryRegOffset(const MCInst *MI, unsigned OpNum, raw_ostream &O, - int LegalShiftAmt); - template - void printMemoryRegOffset(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printMemoryRegOffset(MI, OpNum, O, BitWidth / 8); - } - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, - StringRef LayoutSuffix); - - /// Print a list of vector registers where the type suffix is implicit - /// (i.e. attached to the instruction rather than the registers). 
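- /// (E.g. Apple syntax "ld1.16b { v0, v1 }, [x0]" carries the ".16b" on the
- /// mnemonic, so the list itself is printed without a layout suffix.)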
- void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); -}; - -class ARM64AppleInstPrinter : public ARM64InstPrinter { -public: - ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - - void printInstruction(const MCInst *MI, raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - StringRef getRegName(unsigned RegNo) const override { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); -}; -} - -#endif diff --git a/lib/Target/ARM64/InstPrinter/CMakeLists.txt b/lib/Target/ARM64/InstPrinter/CMakeLists.txt deleted file mode 100644 index b8ee12c55412..000000000000 --- a/lib/Target/ARM64/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmPrinter - ARM64InstPrinter.cpp - ) - -add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen) diff --git a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt deleted file mode 100644 index 7ab439249210..000000000000 --- a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmPrinter -parent = ARM64 -required_libraries = ARM64Utils MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/InstPrinter/Makefile b/lib/Target/ARM64/InstPrinter/Makefile deleted file mode 100644 index a59efb08465f..000000000000 --- a/lib/Target/ARM64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/AsmPrinter/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. 
-LIBRARYNAME = LLVMARM64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/LLVMBuild.txt b/lib/Target/ARM64/LLVMBuild.txt deleted file mode 100644 index 3d1e56e7ca65..000000000000 --- a/lib/Target/ARM64/LLVMBuild.txt +++ /dev/null @@ -1,35 +0,0 @@ -;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils - -[component_0] -type = TargetGroup -name = ARM64 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 -has_disassembler = 1 -has_jit = 1 - -[component_1] -type = Library -name = ARM64CodeGen -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info ARM64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp deleted file mode 100644 index ba5025ab620c..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp +++ /dev/null @@ -1,564 +0,0 @@ -//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { - -class ARM64AsmBackend : public MCAsmBackend { - static const unsigned PCRelFlagVal = - MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; - -public: - ARM64AsmBackend(const Target &T) : MCAsmBackend() {} - - unsigned getNumFixupKinds() const override { - return ARM64::NumTargetFixupKinds; - } - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { - const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // ARM64FixupKinds.h. 
- // - // Name Offset (bits) Size (bits) Flags - { "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_add_imm12", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_arm64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_movw", 5, 16, 0 }, - { "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_tlsdesc_call", 0, 0, 0 } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; - } - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; - - bool mayNeedRelaxation(const MCInst &Inst) const override; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - void HandleAssemblerFlag(MCAssemblerFlag Flag) {} - - unsigned getPointerSize() const { return 8; } -}; - -} // end anonymous namespace - -/// \brief The number of bytes the fixup may change. -static unsigned getFixupKindNumBytes(unsigned Kind) { - switch (Kind) { - default: - assert(0 && "Unknown fixup kind!"); - - case ARM64::fixup_arm64_tlsdesc_call: - return 0; - - case FK_Data_1: - return 1; - - case FK_Data_2: - case ARM64::fixup_arm64_movw: - return 2; - - case ARM64::fixup_arm64_pcrel_branch14: - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - case ARM64::fixup_arm64_ldr_pcrel_imm19: - case ARM64::fixup_arm64_pcrel_branch19: - return 3; - - case ARM64::fixup_arm64_pcrel_adr_imm21: - case ARM64::fixup_arm64_pcrel_adrp_imm21: - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - case FK_Data_4: - return 4; - - case FK_Data_8: - return 8; - } -} - -static unsigned AdrImmBits(unsigned Value) { - unsigned lo2 = Value & 0x3; - unsigned hi19 = (Value & 0x1ffffc) >> 2; - return (hi19 << 5) | (lo2 << 29); -} - -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { - int64_t SignedValue = static_cast(Value); - switch (Kind) { - default: - assert(false && "Unknown fixup kind!"); - case ARM64::fixup_arm64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - return AdrImmBits(Value & 0x1fffffULL); - case ARM64::fixup_arm64_pcrel_adrp_imm21: - return AdrImmBits((Value & 0x1fffff000ULL) >> 12); - case ARM64::fixup_arm64_ldr_pcrel_imm19: - case ARM64::fixup_arm64_pcrel_branch19: - // Signed 21-bit immediate - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded. 
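The AdrImmBits helper above scatters a 21-bit ADR/ADRP offset into the instruction's immlo (bits 29-30) and immhi (bits 5-23) fields. A minimal standalone sketch of the same split, outside the MC layer, using a hypothetical offset value:

#include <cstdint>
#include <cstdio>

// Scatter a 21-bit PC-relative offset into ADR's immlo/immhi fields,
// mirroring the AdrImmBits helper in the deleted backend above.
static uint32_t adrImmBits(uint32_t Value) {
  uint32_t lo2  = Value & 0x3;              // immlo -> bits 29-30
  uint32_t hi19 = (Value & 0x1ffffc) >> 2;  // immhi -> bits 5-23
  return (hi19 << 5) | (lo2 << 29);
}

int main() {
  uint32_t Offset = 0x12345;  // hypothetical 21-bit offset
  std::printf("encoded fields: 0x%08x\n", adrImmBits(Offset));  // 0x20091a20
  return 0;
}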
- return (Value >> 2) & 0x7ffff; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); - return Value; - case ARM64::fixup_arm64_ldst_imm12_scale2: - // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 1; - case ARM64::fixup_arm64_ldst_imm12_scale4: - // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 2; - case ARM64::fixup_arm64_ldst_imm12_scale8: - // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 3; - case ARM64::fixup_arm64_ldst_imm12_scale16: - // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 4; - case ARM64::fixup_arm64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); - return Value; - case ARM64::fixup_arm64_pcrel_branch14: - // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3fff; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3ffffff; - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - return Value; - } -} - -void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - if (!Value) - return; // Doesn't change encoding. - MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); - // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); - - // Shift the value into position. - Value <<= Info.TargetOffset; - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the - // bits from the fixup value. - for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); -} - -bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { - return false; -} - -bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { - // FIXME: This isn't correct for ARM64. Just moving the "generic" logic - // into the targets for now. - // - // Relax if the value is too big for a (signed) i8. 
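Once adjustFixupValue has produced the field value, applyFixup above shifts it to the fixup's TargetOffset and ORs it into the fragment byte by byte in little-endian order. A standalone sketch of that patching step, with a hypothetical ADD-immediate word and offset:

#include <cstdint>
#include <cstdio>

// OR an already-adjusted fixup value into an instruction buffer byte by
// byte, little-endian, as ARM64AsmBackend::applyFixup does above.
static void patchBytes(uint8_t *Data, unsigned Offset, unsigned NumBytes,
                       uint64_t Value, unsigned TargetOffset) {
  Value <<= TargetOffset;                       // shift into bit position
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}

int main() {
  uint8_t Word[4] = {0x00, 0x00, 0x00, 0x91};   // hypothetical ADD x0, x0, #0
  patchBytes(Word, 0, 3, /*Imm12=*/0x123, /*TargetOffset=*/10);
  std::printf("%02x %02x %02x %02x\n", Word[0], Word[1], Word[2], Word[3]);
  return 0;                                     // prints 00 8c 04 91
}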
- return int64_t(Value) != int64_t(int8_t(Value)); -} - -void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented"); -} - -bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - // If the count is not 4-byte aligned, we must be writing data into the text - // section (otherwise we have unaligned instructions, and thus have far - // bigger problems), so just write zeros instead. - if ((Count & 3) != 0) { - for (uint64_t i = 0, e = (Count & 3); i != e; ++i) - OW->Write8(0); - } - - // We are properly aligned, so write NOPs as requested. - Count /= 4; - for (uint64_t i = 0; i != Count; ++i) - OW->Write32(0xd503201f); - return true; -} - -namespace { - -namespace CU { - -/// \brief Compact unwind encoding values. -enum CompactUnwindEncodings { - /// \brief A "frameless" leaf function, where no non-volatile registers are - /// saved. The return remains in LR throughout the function. - UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, - - /// \brief No compact unwind encoding available. Instead the low 23-bits of - /// the compact unwind encoding is the offset of the DWARF FDE in the - /// __eh_frame section. This mode is never used in object files. It is only - /// generated by the linker in final linked images, which have only DWARF info - /// for a function. - UNWIND_ARM64_MODE_DWARF = 0x03000000, - - /// \brief This is a standard arm64 prologue where FP/LR are immediately - /// pushed on the stack, then SP is copied to FP. If there are any - /// non-volatile register saved, they are copied into the stack fame in pairs - /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the - /// five X pairs and four D pairs can be saved, but the memory layout must be - /// in register number order. - UNWIND_ARM64_MODE_FRAME = 0x04000000, - - /// \brief Frame register pair encodings. - UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, - UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002, - UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004, - UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008, - UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010, - UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, - UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200, - UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400, - UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800 -}; - -} // end CU namespace - -// FIXME: This should be in a separate file. -class DarwinARM64AsmBackend : public ARM64AsmBackend { - const MCRegisterInfo &MRI; - - /// \brief Encode compact unwind stack adjustment for frameless functions. - /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. - /// The stack size always needs to be 16 byte aligned. - uint32_t encodeStackAdjustment(uint32_t StackSize) const { - return (StackSize / 16) << 12; - } - -public: - DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : ARM64AsmBackend(T), MRI(MRI) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, - MachO::CPU_SUBTYPE_ARM64_ALL); - } - - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Any section for which the linker breaks things into atoms needs to - // preserve symbols, including assembler local symbols, to identify - // those atoms. These sections are: - // Sections of type: - // - // S_CSTRING_LITERALS (e.g. __cstring) - // S_LITERAL_POINTERS (e.g. 
objc selector pointers) - // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS - // - // Sections named: - // - // __TEXT,__eh_frame - // __TEXT,__ustring - // __DATA,__cfstring - // __DATA,__objc_classrefs - // __DATA,__objc_catlist - // - // FIXME: It would be better if the compiler used actual linker local - // symbols for each of these sections rather than preserving what - // are ostensibly assembler local symbols. - const MCSectionMachO &SMO = static_cast(Section); - return (SMO.getType() == MachO::S_CSTRING_LITERALS || - SMO.getType() == MachO::S_4BYTE_LITERALS || - SMO.getType() == MachO::S_8BYTE_LITERALS || - SMO.getType() == MachO::S_16BYTE_LITERALS || - SMO.getType() == MachO::S_LITERAL_POINTERS || - (SMO.getSegmentName() == "__TEXT" && - (SMO.getSectionName() == "__eh_frame" || - SMO.getSectionName() == "__ustring")) || - (SMO.getSegmentName() == "__DATA" && - (SMO.getSectionName() == "__cfstring" || - SMO.getSectionName() == "__objc_classrefs" || - SMO.getSectionName() == "__objc_catlist"))); - } - - /// \brief Generate the compact unwind encoding from the CFI directives. - uint32_t generateCompactUnwindEncoding( - ArrayRef Instrs) const override { - if (Instrs.empty()) - return CU::UNWIND_ARM64_MODE_FRAMELESS; - - bool HasFP = false; - unsigned StackSize = 0; - - uint32_t CompactUnwindEncoding = 0; - for (size_t i = 0, e = Instrs.size(); i != e; ++i) { - const MCCFIInstruction &Inst = Instrs[i]; - - switch (Inst.getOperation()) { - default: - // Cannot handle this directive: bail out. - return CU::UNWIND_ARM64_MODE_DWARF; - case MCCFIInstruction::OpDefCfa: { - // Defines a frame pointer. - assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == - ARM64::FP && - "Invalid frame pointer!"); - assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); - - const MCCFIInstruction &LRPush = Instrs[++i]; - assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && - "Link register not pushed!"); - const MCCFIInstruction &FPPush = Instrs[++i]; - assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && - "Frame pointer not pushed!"); - - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); - - LRReg = getXRegFromWReg(LRReg); - FPReg = getXRegFromWReg(FPReg); - - assert(LRReg == ARM64::LR && FPReg == ARM64::FP && - "Pushing invalid registers for frame!"); - - // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; - HasFP = true; - break; - } - case MCCFIInstruction::OpDefCfaOffset: { - assert(StackSize == 0 && "We already have the CFA offset!"); - StackSize = std::abs(Inst.getOffset()); - break; - } - case MCCFIInstruction::OpOffset: { - // Registers are saved in pairs. We expect there to be two consecutive - // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); - if (i + 1 == e) - return CU::UNWIND_ARM64_MODE_DWARF; - - const MCCFIInstruction &Inst2 = Instrs[++i]; - if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); - - // N.B. The encodings must be in register number order, and the X - // registers before the D registers. 
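For the frameless case described above, the final compact-unwind word is the FRAMELESS mode bits plus the stack size scaled to 16-byte units in bits 12-23, and anything above 65520 bytes has to fall back to DWARF. A standalone sketch of that arithmetic, with the mode constants copied from the CU enum above:

#include <cstdint>

// Compact-unwind mode constants as defined in the CU enum above.
static const uint32_t UNWIND_ARM64_MODE_FRAMELESS = 0x02000000;
static const uint32_t UNWIND_ARM64_MODE_DWARF     = 0x03000000;

// Encode a frameless function: 16-byte stack-size units go in bits 12-23.
static uint32_t framelessEncoding(uint32_t StackSize) {
  if (StackSize > 65520)             // too large for the compact form
    return UNWIND_ARM64_MODE_DWARF;  // linker falls back to the DWARF FDE
  return UNWIND_ARM64_MODE_FRAMELESS | ((StackSize / 16) << 12);
}
// e.g. framelessEncoding(48) == 0x02003000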
- - // X19/X20 pair = 0x00000001, - // X21/X22 pair = 0x00000002, - // X23/X24 pair = 0x00000004, - // X25/X26 pair = 0x00000008, - // X27/X28 pair = 0x00000010 - Reg1 = getXRegFromWReg(Reg1); - Reg2 = getXRegFromWReg(Reg2); - - if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 && - (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; - else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 && - (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; - else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 && - (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; - else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 && - (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; - else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 && - (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; - else { - Reg1 = getDRegFromBReg(Reg1); - Reg2 = getDRegFromBReg(Reg2); - - // D8/D9 pair = 0x00000100, - // D10/D11 pair = 0x00000200, - // D12/D13 pair = 0x00000400, - // D14/D15 pair = 0x00000800 - if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 && - (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; - else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 && - (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; - else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 && - (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; - else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; - else - // A pair was pushed which we cannot handle. - return CU::UNWIND_ARM64_MODE_DWARF; - } - - break; - } - } - } - - if (!HasFP) { - // With compact unwind info we can only represent stack adjustments of up - // to 65520 bytes. - if (StackSize > 65520) - return CU::UNWIND_ARM64_MODE_DWARF; - - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; - CompactUnwindEncoding |= encodeStackAdjustment(StackSize); - } - - return CompactUnwindEncoding; - } -}; - -} // end anonymous namespace - -namespace { - -class ELFARM64AsmBackend : public ARM64AsmBackend { -public: - uint8_t OSABI; - bool IsLittleEndian; - - ELFARM64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) - : ARM64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createARM64ELFObjectWriter(OS, OSABI, IsLittleEndian); - } - - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; -}; - -void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, - uint64_t &Value, bool &IsResolved) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. 
For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! - if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21) - IsResolved = false; -} - -void ELFARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - // store fixups in .eh_frame section in big endian order - if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); - const MCSectionELF *SecELF = static_cast(Sec); - if (SecELF->getSectionName() == ".eh_frame") - Value = ByteSwap_32(unsigned(Value)); - } - ARM64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); -} -} - -MCAsmBackend *llvm::createARM64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return new DarwinARM64AsmBackend(T, MRI); - - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); - return new ELFARM64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); -} - -MCAsmBackend *llvm::createARM64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); - return new ELFARM64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/false); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp deleted file mode 100644 index 0990a701bc87..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp +++ /dev/null @@ -1,255 +0,0 @@ -//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file handles ELF-specific object emission, converting LLVM's internal -// fixups into the appropriate relocations. 
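The processFixupValue comment above boils down to a page-delta computation: an ADRP needs the distance between the 4 KiB page of the target and the page containing the instruction, which can differ by one page depending on where the ADRP ends up, so the decision is left to the linker. A minimal sketch of the delta, using the same hypothetical addresses as the comment:

#include <cstdint>
#include <cstdio>

// Byte distance between the 4 KiB page of an ADRP at `pc` and the page of
// the symbol at `target`; this is what the ADRP immediate must encode.
static int64_t adrpPageDelta(uint64_t pc, uint64_t target) {
  return (int64_t)(target & ~0xfffULL) - (int64_t)(pc & ~0xfffULL);
}

int main() {
  // ADRP at 0xffc, "there" at 0x1000: one page away.
  std::printf("%lld\n", (long long)adrpPageDelta(0x0ffc, 0x1000));  // 4096
  // ADRP anywhere else in the same page: no offset needed.
  std::printf("%lld\n", (long long)adrpPageDelta(0x1000, 0x1004));  // 0
  return 0;
}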
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -namespace { -class ARM64ELFObjectWriter : public MCELFObjectTargetWriter { -public: - ARM64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian); - - virtual ~ARM64ELFObjectWriter(); - -protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override; - -private: -}; -} - -ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian) - : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, - /*HasRelocationAddend*/ true) {} - -ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {} - -unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - ARM64MCExpr::VariantKind RefKind = - static_cast(Target.getRefKind()); - ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind); - bool IsNC = ARM64MCExpr::isNotChecked(RefKind); - - assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && - "Should only be expression-level modifiers here"); - - assert((!Target.getSymB() || - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && - "Should only be expression-level modifiers here"); - - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - case FK_Data_2: - return ELF::R_AARCH64_PREL16; - case FK_Data_4: - return ELF::R_AARCH64_PREL32; - case FK_Data_8: - return ELF::R_AARCH64_PREL64; - case ARM64::fixup_arm64_pcrel_adr_imm21: - assert(SymLoc == ARM64MCExpr::VK_NONE && "unexpected ADR relocation"); - return ELF::R_AARCH64_ADR_PREL_LO21; - case ARM64::fixup_arm64_pcrel_adrp_imm21: - if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC) - return ELF::R_AARCH64_ADR_PREL_PG_HI21; - if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC) - return ELF::R_AARCH64_ADR_GOT_PAGE; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC) - return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC) - return ELF::R_AARCH64_TLSDESC_ADR_PAGE; - llvm_unreachable("invalid symbol kind for ADRP relocation"); - case ARM64::fixup_arm64_pcrel_branch26: - return ELF::R_AARCH64_JUMP26; - case ARM64::fixup_arm64_pcrel_call26: - return ELF::R_AARCH64_CALL26; - case ARM64::fixup_arm64_ldr_pcrel_imm19: - if (SymLoc == ARM64MCExpr::VK_GOTTPREL) - return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; - return ELF::R_AARCH64_LD_PREL_LO19; - case ARM64::fixup_arm64_pcrel_branch14: - return ELF::R_AARCH64_TSTBR14; - case ARM64::fixup_arm64_pcrel_branch19: - return ELF::R_AARCH64_CONDBR19; - default: - llvm_unreachable("Unsupported pc-relative fixup kind"); - } - } else { - switch ((unsigned)Fixup.getKind()) { - case FK_Data_2: - return ELF::R_AARCH64_ABS16; - case FK_Data_4: - return ELF::R_AARCH64_ABS32; - case FK_Data_8: - return ELF::R_AARCH64_ABS64; - case ARM64::fixup_arm64_add_imm12: - if (RefKind == ARM64MCExpr::VK_DTPREL_HI12) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; - if (RefKind == ARM64MCExpr::VK_TPREL_HI12) - return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; - if (RefKind == ARM64MCExpr::VK_DTPREL_LO12_NC) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_LO12) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; - if (RefKind == ARM64MCExpr::VK_TPREL_LO12_NC) 
- return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_LO12) - return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; - if (RefKind == ARM64MCExpr::VK_TLSDESC_LO12) - return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_ADD_ABS_LO12_NC; - - report_fatal_error("invalid fixup for add (uimm12) instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale1: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST8_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale2: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST16_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale4: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST32_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale8: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST64_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOT && IsNC) - return ELF::R_AARCH64_LD64_GOT_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC) - return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC) - return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale16: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_movw: - if (RefKind == ARM64MCExpr::VK_ABS_G3) - return ELF::R_AARCH64_MOVW_UABS_G3; - if (RefKind == ARM64MCExpr::VK_ABS_G2) - return 
ELF::R_AARCH64_MOVW_UABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_S) - return ELF::R_AARCH64_MOVW_SABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_NC) - return ELF::R_AARCH64_MOVW_UABS_G2_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G1) - return ELF::R_AARCH64_MOVW_UABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_S) - return ELF::R_AARCH64_MOVW_SABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_NC) - return ELF::R_AARCH64_MOVW_UABS_G1_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G0) - return ELF::R_AARCH64_MOVW_UABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_S) - return ELF::R_AARCH64_MOVW_SABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_NC) - return ELF::R_AARCH64_MOVW_UABS_G0_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G2) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G2) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; - if (RefKind == ARM64MCExpr::VK_TPREL_G1) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; - if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G0) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; - if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; - case ARM64::fixup_arm64_tlsdesc_call: - return ELF::R_AARCH64_TLSDESC_CALL; - default: - llvm_unreachable("Unknown ELF relocation type"); - } - } - - llvm_unreachable("Unimplemented fixup -> relocation"); -} - -MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, - bool IsLittleEndian) { - MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI, IsLittleEndian); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp deleted file mode 100644 index adbf83079725..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp +++ /dev/null @@ -1,160 +0,0 @@ -//===- lib/MC/ARM64ELFStreamer.cpp - ELF Object Output for ARM64 ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file assembles .s files and emits AArch64 ELF .o object files. Different -// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit -// regions of data and code. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCELFSymbolFlags.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -/// Extend the generic ELFStreamer class so that it can emit mapping symbols at -/// the appropriate points in the object files. These symbols are defined in the -/// AArch64 ELF ABI: -/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf -/// -/// In brief: $x or $d should be emitted at the start of each contiguous region -/// of A64 code or data in a section. In practice, this emission does not rely -/// on explicit assembler directives but on inherent properties of the -/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an -/// instruction). -/// -/// As a result this system is orthogonal to the DataRegion infrastructure used -/// by MachO. Beware! -class ARM64ELFStreamer : public MCELFStreamer { -public: - ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), - LastEMS(EMS_None) {} - - ~ARM64ELFStreamer() {} - - void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) override { - // We have to keep track of the mapping symbol state of any sections we - // use. Each one should start off as EMS_None, which is provided as the - // default constructor by DenseMap::lookup. - LastMappingSymbols[getPreviousSection().first] = LastEMS; - LastEMS = LastMappingSymbols.lookup(Section); - - MCELFStreamer::ChangeSection(Section, Subsection); - } - - /// This function is the one used to emit instruction data into the ELF - /// streamer. We override it to add the appropriate mapping symbol if - /// necessary. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { - EmitA64MappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. - void EmitBytes(StringRef Data) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. 
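The streamer above emits a new $x or $d mapping symbol only when the kind of contents changes, tracking the last state per section. A tiny standalone sketch of that transition logic, independent of the MC classes:

#include <cstdio>

// Track the last mapping-symbol state and emit $x/$d only on transitions,
// mirroring the EmitA64MappingSymbol/EmitDataMappingSymbol logic above.
enum ElfMappingSymbol { EMS_None, EMS_A64, EMS_Data };

static void emitFor(ElfMappingSymbol &Last, bool IsInstruction) {
  ElfMappingSymbol Want = IsInstruction ? EMS_A64 : EMS_Data;
  if (Last == Want)
    return;                                   // still in the same region
  std::printf("%s\n", Want == EMS_A64 ? "$x" : "$d");
  Last = Want;
}

int main() {
  ElfMappingSymbol Last = EMS_None;
  emitFor(Last, true);   // first instruction -> $x
  emitFor(Last, true);   // more code         -> nothing
  emitFor(Last, false);  // a .word           -> $d
  emitFor(Last, true);   // code again        -> $x
  return 0;
}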
- void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); - } - -private: - enum ElfMappingSymbol { - EMS_None, - EMS_A64, - EMS_Data - }; - - void EmitDataMappingSymbol() { - if (LastEMS == EMS_Data) - return; - EmitMappingSymbol("$d"); - LastEMS = EMS_Data; - } - - void EmitA64MappingSymbol() { - if (LastEMS == EMS_A64) - return; - EmitMappingSymbol("$x"); - LastEMS = EMS_A64; - } - - void EmitMappingSymbol(StringRef Name) { - MCSymbol *Start = getContext().CreateTempSymbol(); - EmitLabel(Start); - - MCSymbol *Symbol = getContext().GetOrCreateSymbol( - Name + "." + Twine(MappingSymbolCounter++)); - - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - MCELF::SetType(SD, ELF::STT_NOTYPE); - MCELF::SetBinding(SD, ELF::STB_LOCAL); - SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); - - const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); - Symbol->setVariableValue(Value); - } - - int64_t MappingSymbolCounter; - - DenseMap LastMappingSymbols; - ElfMappingSymbol LastEMS; - - /// @} -}; -} - -namespace llvm { -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { - ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter); - if (RelaxAll) - S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); - return S; -} -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h deleted file mode 100644 index 72dadbc50aa9..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements ELF streamer information for the ARM64 backend. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AARCH64_ELF_STREAMER_H -#define LLVM_AARCH64_ELF_STREAMER_H - -#include "llvm/MC/MCELFStreamer.h" - -namespace llvm { - -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); -} - -#endif // ARM64_ELF_STREAMER_H diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h deleted file mode 100644 index 7106b314ea20..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h +++ /dev/null @@ -1,76 +0,0 @@ -//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64FIXUPKINDS_H -#define LLVM_ARM64FIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace ARM64 { - -enum Fixups { - // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. 
- fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind, - - // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. - fixup_arm64_pcrel_adrp_imm21, - - // fixup_arm64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. - fixup_arm64_add_imm12, - - // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. - fixup_arm64_ldst_imm12_scale1, - fixup_arm64_ldst_imm12_scale2, - fixup_arm64_ldst_imm12_scale4, - fixup_arm64_ldst_imm12_scale8, - fixup_arm64_ldst_imm12_scale16, - - // fixup_arm64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this is used by - // pc-relative loads and generates relocations directly when necessary. - fixup_arm64_ldr_pcrel_imm19, - - // FIXME: comment - fixup_arm64_movw, - - // fixup_arm64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch14, - - // fixup_arm64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this is use by - // b.cc and generates relocations directly when necessary. - fixup_arm64_pcrel_branch19, - - // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch26, - - // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. - fixup_arm64_pcrel_call26, - - // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. - fixup_arm64_tlsdesc_call, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; - -} // end namespace ARM64 -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp deleted file mode 100644 index e211d3428bf6..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of the ARM64MCAsmInfo properties. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCAsmInfo.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -enum AsmWriterVariantTy { - Default = -1, - Generic = 0, - Apple = 1 -}; - -static cl::opt AsmWriterVariant( - "arm64-neon-syntax", cl::init(Default), - cl::desc("Choose style of NEON code to emit from ARM64 backend:"), - cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), - clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), - clEnumValEnd)); - -ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 
1 : AsmWriterVariant; - - PrivateGlobalPrefix = "L"; - SeparatorString = "%%"; - CommentString = ";"; - PointerSize = CalleeSaveStackSlotSize = 8; - - AlignmentIsInBytes = false; - UsesELFSectionDirectiveForBSS = true; - SupportsDebugInformation = true; - UseDataRegionDirectives = true; - - ExceptionsType = ExceptionHandling::DwarfCFI; -} - -const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol( - const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - MCContext &Context = Streamer.getContext(); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); - MCSymbol *PCSym = Context.CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); - return MCBinaryExpr::CreateSub(Res, PC, Context); -} - -ARM64MCAsmInfoELF::ARM64MCAsmInfoELF(StringRef TT) { - Triple T(TT); - if (T.getArch() == Triple::arm64_be) - IsLittleEndian = false; - - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; - - PointerSize = 8; - - // ".comm align is in bytes but .align is pow-2." - AlignmentIsInBytes = false; - - CommentString = "//"; - PrivateGlobalPrefix = ".L"; - Code32Directive = ".code\t32"; - - Data16bitsDirective = "\t.hword\t"; - Data32bitsDirective = "\t.word\t"; - Data64bitsDirective = "\t.xword\t"; - - UseDataRegionDirectives = false; - - WeakRefDirective = "\t.weak\t"; - - HasLEB128 = true; - SupportsDebugInformation = true; - - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; - - UseIntegratedAssembler = true; -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h deleted file mode 100644 index 324bc39560f0..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h +++ /dev/null @@ -1,36 +0,0 @@ -//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the ARM64MCAsmInfo class. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETASMINFO_H -#define ARM64TARGETASMINFO_H - -#include "llvm/MC/MCAsmInfoDarwin.h" - -namespace llvm { -class Target; -class StringRef; -class MCStreamer; -struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit ARM64MCAsmInfoDarwin(); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; -}; - -struct ARM64MCAsmInfoELF : public MCAsmInfo { - explicit ARM64MCAsmInfoELF(StringRef TT); -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp deleted file mode 100644 index 3c6dbc85b138..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp +++ /dev/null @@ -1,647 +0,0 @@ -//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64MCCodeEmitter class. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "mccodeemitter" - -STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); -STATISTIC(MCNumFixups, "Number of MC fixups created."); - -namespace { - -class ARM64MCCodeEmitter : public MCCodeEmitter { - MCContext &Ctx; - - ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT -public: - ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, - MCContext &ctx) - : Ctx(ctx) {} - - ~ARM64MCCodeEmitter() {} - - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAMIndexed8OpValue - Return encoding info for base register - /// and 12-bit unsigned immediate attached to a load, store or prfm - /// instruction. If operand requires a relocation, record it and - /// return zero in that part of the encoding. - template - uint32_t getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label - /// target. 
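getAMIndexed8OpValue above packs the base register encoding into bits 0-4 and the scaled unsigned 12-bit offset into bits 5-16; when the offset is still a symbol, it records the templated fixup kind and encodes zero instead. A small sketch of just the packing, with a hypothetical register encoding:

#include <cstdint>

// Pack a load/store addressing operand: base register in bits 0-4,
// unsigned 12-bit immediate in bits 5-16 (as in getAMIndexed8OpValue above).
static uint32_t packIndexed(uint32_t BaseRegEncoding, uint32_t Imm12) {
  return BaseRegEncoding | (Imm12 << 5);
}
// e.g. packIndexed(/*x1 = */1, /*#16 scaled by 8 -> */2) == 0x41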
- uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and - /// the 2-bit shift field. - uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getCondBranchTargetOpValue - Return the encoded value for a conditional - /// branch target. - uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getLoadLiteralOpValue - Return the encoded value for a load-literal - /// pc-relative address. - uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- - /// branch target. - uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getBranchTargetOpValue - Return the encoded value for an unconditional - /// branch target. - uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveWideImmOpValue - Return the encoded value for the immediate operand - /// of a MOVZ or MOVK instruction. - uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getVecShifterOpValue - Return the encoded value for the vector shifter. - uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveVecShifterOpValue - Return the encoded value for the vector move - /// shifter (MSL). - uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getFixedPointScaleOpValue - Return the encoded value for the - // FP-to-fixed-point scale factor. - uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getSIMDShift64OpValue - Return the encoded value for the - // shift-by-immediate AdvSIMD instructions. 
- uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } - - void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { - // Output the constant in little endian byte order. - for (unsigned i = 0; i != Size; ++i) { - EmitByte(Val & 255, OS); - Val >>= 8; - } - } - - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - template unsigned - fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; -}; - -} // end anonymous namespace - -MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64MCCodeEmitter(MCII, STI, Ctx); -} - -/// getMachineOpValue - Return binary encoding of operand. If the machine -/// operand requires relocation, record the relocation and return zero. -unsigned -ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - else { - assert(MO.isImm() && "did not expect relocated expression"); - return static_cast(MO.getImm()); - } - - assert(0 && "Unable to encode MCOperand!"); - return 0; -} - -template -uint32_t -ARM64MCCodeEmitter::getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - unsigned BaseReg = MI.getOperand(OpIdx).getReg(); - BaseReg = Ctx.getRegisterInfo()->getEncodingValue(BaseReg); - - const MCOperand &MO = MI.getOperand(OpIdx + 1); - uint32_t ImmVal = 0; - - if (MO.isImm()) - ImmVal = static_cast(MO.getImm()); - else { - assert(MO.isExpr() && "unable to encode load/store imm operand"); - MCFixupKind Kind = MCFixupKind(FixupKind); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - ++MCNumFixups; - } - - return BaseReg | (ImmVal << 5); -} - -/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label -/// target. -uint32_t -ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - const MCExpr *Expr = MO.getExpr(); - - MCFixupKind Kind = MI.getOpcode() == ARM64::ADR - ? 
MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21) - : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - MCNumFixups += 1; - - // All of the information is in the fixup. - return 0; -} - -/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and -/// the 2-bit shift field. The shift field is stored in bits 13-14 of the -/// return value. -uint32_t -ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - // Suboperands are [imm, shifter]. - const MCOperand &MO = MI.getOperand(OpIdx); - const MCOperand &MO1 = MI.getOperand(OpIdx + 1); - assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL && - "unexpected shift type for add/sub immediate"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm()); - assert((ShiftVal == 0 || ShiftVal == 12) && - "unexpected shift value for add/sub immediate"); - if (MO.isImm()) - return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12)); - assert(MO.isExpr() && "Unable to encode MCOperand!"); - const MCExpr *Expr = MO.getExpr(); - - // Encode the 12 bits of the fixup. - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getCondBranchTargetOpValue - Return the encoded value for a conditional -/// branch target. -uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -/// getLoadLiteralOpValue - Return the encoded value for a load-literal -/// pc-relative address. -uint32_t -ARM64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_ldr_pcrel_imm19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected movz/movk immediate"); - - Fixups.push_back(MCFixup::Create( - 0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- -/// branch target. -uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. 
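getAddSubImmOpValue above returns the 12-bit immediate with bit 12 set when the operand carries an LSL #12 shifter; a symbolic immediate instead records a fixup_arm64_add_imm12 fixup and encodes zero. A standalone sketch of the immediate packing:

#include <cstdint>

// Encode an add/sub immediate operand: 12-bit value plus a flag in bit 12
// when the operand is shifted left by 12 (as in getAddSubImmOpValue above).
static uint32_t encodeAddSubImm(uint32_t Imm12, unsigned ShiftVal) {
  // ShiftVal is assumed to be 0 or 12, as asserted in the deleted code.
  return Imm12 | (ShiftVal == 0 ? 0u : (1u << 12));
}
// e.g. encodeAddSubImm(0x123, 12) == 0x1123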
- if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -/// getBranchTargetOpValue - Return the encoded value for an unconditional -/// branch target. -uint32_t -ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MI.getOpcode() == ARM64::BL - ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26) - : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -/// getVecShifterOpValue - Return the encoded value for the vector shifter: -/// -/// 00 -> 0 -/// 01 -> 8 -/// 10 -> 16 -/// 11 -> 24 -uint32_t -ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - - switch (MO.getImm()) { - default: - break; - case 0: - return 0; - case 8: - return 1; - case 16: - return 2; - case 24: - return 3; - } - - assert(false && "Invalid value for vector shift amount!"); - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm()); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm() | 32); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 32 - (MO.getImm() | 16); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 16 - (MO.getImm() | 8); -} - -/// getFixedPointScaleOpValue - Return the encoded value for the -// FP-to-fixed-point scale factor. 
-uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 32 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 16 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 8 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 64; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 32; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 16; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 8; -} - -/// getMoveVecShifterOpValue - Return the encoded value for the vector move -/// shifter (MSL). -uint32_t -ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && - "Expected an immediate value for the move shift amount!"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm()); - assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); - return ShiftVal == 8 ? 
0 : 1; -} - -unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // If one of the signed fixup kinds is applied to a MOVZ instruction, the - // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's - // job to ensure that any bits possibly affected by this are 0. This means we - // must zero out bit 30 (essentially emitting a MOVN). - MCOperand UImm16MO = MI.getOperand(1); - - // Nothing to do if there's no fixup. - if (UImm16MO.isImm()) - return EncodedValue; - - const ARM64MCExpr *A64E = cast<ARM64MCExpr>(UImm16MO.getExpr()); - switch (A64E->getKind()) { - case ARM64MCExpr::VK_DTPREL_G2: - case ARM64MCExpr::VK_DTPREL_G1: - case ARM64MCExpr::VK_DTPREL_G0: - case ARM64MCExpr::VK_GOTTPREL_G1: - case ARM64MCExpr::VK_TPREL_G2: - case ARM64MCExpr::VK_TPREL_G1: - case ARM64MCExpr::VK_TPREL_G0: - return EncodedValue & ~(1u << 30); - default: - // Nothing to do for an unsigned fixup. - return EncodedValue; - } - - - return EncodedValue & ~(1u << 30); -} - -void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - if (MI.getOpcode() == ARM64::TLSDESCCALL) { - // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the - // following (BLR) instruction. It doesn't emit any code itself so it - // doesn't go through the normal TableGenerated channels. - MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call); - Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); - return; - } - - uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); - EmitConstant(Binary, 4, OS); - ++MCNumEmitted; // Keep track of the # of mi's emitted. -} - -unsigned -ARM64MCCodeEmitter::fixMulHigh(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 - // (i.e. all bits 1) but is ignored by the processor. - EncodedValue |= 0x1f << 10; - return EncodedValue; -} - -template <int hasRs, int hasRt2> unsigned -ARM64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - if (!hasRs) EncodedValue |= 0x001F0000; - if (!hasRt2) EncodedValue |= 0x00007C00; - - return EncodedValue; -} - -unsigned -ARM64MCCodeEmitter::fixOneOperandFPComparison(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // The Rm field of FCMP and friends is unused - it should be assembled - // as 0, but is ignored by the processor. - EncodedValue &= ~(0x1f << 16); - return EncodedValue; -} - -#include "ARM64GenMCCodeEmitter.inc" diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp deleted file mode 100644 index efa820b097f1..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp +++ /dev/null @@ -1,174 +0,0 @@ -//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the assembly expression modifiers -// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
-// -//===----------------------------------------------------------------------===// - -#include "ARM64MCExpr.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Object/ELF.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "aarch64symbolrefexpr" - -const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx) { - return new (Ctx) ARM64MCExpr(Expr, Kind); -} - -StringRef ARM64MCExpr::getVariantKindName() const { - switch (static_cast(getKind())) { - case VK_CALL: return ""; - case VK_LO12: return ":lo12:"; - case VK_ABS_G3: return ":abs_g3:"; - case VK_ABS_G2: return ":abs_g2:"; - case VK_ABS_G2_S: return ":abs_g2_s:"; - case VK_ABS_G2_NC: return ":abs_g2_nc:"; - case VK_ABS_G1: return ":abs_g1:"; - case VK_ABS_G1_S: return ":abs_g1_s:"; - case VK_ABS_G1_NC: return ":abs_g1_nc:"; - case VK_ABS_G0: return ":abs_g0:"; - case VK_ABS_G0_S: return ":abs_g0_s:"; - case VK_ABS_G0_NC: return ":abs_g0_nc:"; - case VK_DTPREL_G2: return ":dtprel_g2:"; - case VK_DTPREL_G1: return ":dtprel_g1:"; - case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; - case VK_DTPREL_G0: return ":dtprel_g0:"; - case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; - case VK_DTPREL_HI12: return ":dtprel_hi12:"; - case VK_DTPREL_LO12: return ":dtprel_lo12:"; - case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; - case VK_TPREL_G2: return ":tprel_g2:"; - case VK_TPREL_G1: return ":tprel_g1:"; - case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; - case VK_TPREL_G0: return ":tprel_g0:"; - case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; - case VK_TPREL_HI12: return ":tprel_hi12:"; - case VK_TPREL_LO12: return ":tprel_lo12:"; - case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; - case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; - case VK_ABS_PAGE: return ""; - case VK_GOT_PAGE: return ":got:"; - case VK_GOT_LO12: return ":got_lo12:"; - case VK_GOTTPREL_PAGE: return ":gottprel:"; - case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; - case VK_GOTTPREL_G1: return ":gottprel_g1:"; - case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; - case VK_TLSDESC: return ""; - case VK_TLSDESC_PAGE: return ":tlsdesc:"; - default: - llvm_unreachable("Invalid ELF symbol kind"); - } -} - -void ARM64MCExpr::PrintImpl(raw_ostream &OS) const { - if (getKind() != VK_NONE) - OS << getVariantKindName(); - OS << *Expr; -} - -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); -} - -const MCSection *ARM64MCExpr::FindAssociatedSection() const { - llvm_unreachable("FIXME: what goes here?"); -} - -bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) - return false; - - Res = - MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); - - return true; -} - -static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { - switch (Expr->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expression"); - break; - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Expr); - fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); - fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: { - // We're known to be under a TLS fixup, so any symbol should be - // modified. There should be only one. - const MCSymbolRefExpr &SymRef = *cast(Expr); - MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); - MCELF::SetType(SD, ELF::STT_TLS); - break; - } - - case MCExpr::Unary: - fixELFSymbolsInTLSFixupsImpl(cast(Expr)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { - switch (getSymbolLoc(Kind)) { - default: - return; - case VK_DTPREL: - case VK_GOTTPREL: - case VK_TPREL: - case VK_TLSDESC: - break; - } - - fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h deleted file mode 100644 index d8325465178e..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h +++ /dev/null @@ -1,168 +0,0 @@ -//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes ARM64-specific MCExprs, used for modifiers like -// ":lo12:" or ":gottprel_g1:". -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64MCEXPR_H -#define LLVM_ARM64MCEXPR_H - -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -class ARM64MCExpr : public MCTargetExpr { -public: - enum VariantKind { - VK_NONE = 0x000, - - // Symbol locations specifying (roughly speaking) what calculation should be - // performed to construct the final address for the relocated - // symbol. E.g. direct, via the GOT, ... 
- VK_ABS = 0x001, - VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SymLocBits = 0x00f, - - // Variants specifying which part of the final address calculation is - // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a - // MOVZ/MOVK. - VK_PAGE = 0x010, - VK_PAGEOFF = 0x020, - VK_HI12 = 0x030, - VK_G0 = 0x040, - VK_G1 = 0x050, - VK_G2 = 0x060, - VK_G3 = 0x070, - VK_AddressFragBits = 0x0f0, - - // Whether the final relocation is a checked one (where a linker should - // perform a range-check on the final address) or not. Note that this field - // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: - // on its own is a non-checked relocation. We side with ELF on being - // explicit about this! - VK_NC = 0x100, - - // Convenience definitions for referring to specific textual representations - // of relocation specifiers. Note that this means the "_NC" is sometimes - // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC - // since a user would write ":lo12:"). - VK_CALL = VK_ABS, - VK_ABS_PAGE = VK_ABS | VK_PAGE, - VK_ABS_G3 = VK_ABS | VK_G3, - VK_ABS_G2 = VK_ABS | VK_G2, - VK_ABS_G2_S = VK_SABS | VK_G2, - VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, - VK_ABS_G1 = VK_ABS | VK_G1, - VK_ABS_G1_S = VK_SABS | VK_G1, - VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, - VK_ABS_G0 = VK_ABS | VK_G0, - VK_ABS_G0_S = VK_SABS | VK_G0, - VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, - VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, - VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, - VK_GOT_PAGE = VK_GOT | VK_PAGE, - VK_DTPREL_G2 = VK_DTPREL | VK_G2, - VK_DTPREL_G1 = VK_DTPREL | VK_G1, - VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, - VK_DTPREL_G0 = VK_DTPREL | VK_G0, - VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, - VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, - VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, - VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, - VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, - VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, - VK_TPREL_G2 = VK_TPREL | VK_G2, - VK_TPREL_G1 = VK_TPREL | VK_G1, - VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, - VK_TPREL_G0 = VK_TPREL | VK_G0, - VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, - VK_TPREL_HI12 = VK_TPREL | VK_HI12, - VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, - VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, - VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, - VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, - - VK_INVALID = 0xfff - }; - -private: - const MCExpr *Expr; - const VariantKind Kind; - - explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind) - : Expr(Expr), Kind(Kind) {} - -public: - /// @name Construction - /// @{ - - static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx); - - /// @} - /// @name Accessors - /// @{ - - /// Get the kind of this expression. - VariantKind getKind() const { return static_cast(Kind); } - - /// Get the expression this modifier applies to. - const MCExpr *getSubExpr() const { return Expr; } - - /// @} - /// @name VariantKind information extractors. 
- /// @{ - - static VariantKind getSymbolLoc(VariantKind Kind) { - return static_cast(Kind & VK_SymLocBits); - } - - static VariantKind getAddressFrag(VariantKind Kind) { - return static_cast(Kind & VK_AddressFragBits); - } - - static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } - - /// @} - - /// Convert the variant kind into an ELF-appropriate modifier - /// (e.g. ":got:", ":lo12:"). - StringRef getVariantKindName() const; - - void PrintImpl(raw_ostream &OS) const override; - - void AddValueSymbols(MCAssembler *) const override; - - const MCSection *FindAssociatedSection() const override; - - bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const override; - - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; - - static bool classof(const MCExpr *E) { - return E->getKind() == MCExpr::Target; - } - - static bool classof(const ARM64MCExpr *) { return true; } - -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp deleted file mode 100644 index 9775a471f521..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCTargetDesc.h" -#include "ARM64ELFStreamer.h" -#include "ARM64MCAsmInfo.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_MC_DESC -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "ARM64GenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "ARM64GenRegisterInfo.inc" - -static MCInstrInfo *createARM64MCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitARM64MCInstrInfo(X); - return X; -} - -static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - - if (CPU.empty()) - CPU = "generic"; - - InitARM64MCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitARM64MCRegisterInfo(X, ARM64::LR); - return X; -} - -static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI, - StringRef TT) { - Triple TheTriple(TT); - - MCAsmInfo *MAI; - if (TheTriple.isOSDarwin()) - MAI = new ARM64MCAsmInfoDarwin(); - else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); - MAI = new ARM64MCAsmInfoELF(TT); - } - - // Initial state of the frame pointer is SP. 
- unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); - MAI->addInitialFrameState(Inst); - - return MAI; -} - -static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - Triple TheTriple(TT); - assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); - - if (CM == CodeModel::Default) - CM = CodeModel::Small; - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. - else if (CM == CodeModel::JITDefault) - CM = CodeModel::Large; - else if (CM != CodeModel::Small && CM != CodeModel::Large) - report_fatal_error("Only small and large code models are allowed on ARM64"); - - // ARM64 Darwin is always PIC. - if (TheTriple.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createARM64MCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { - if (SyntaxVariant == 0) - return new ARM64InstPrinter(MAI, MII, MRI, STI); - if (SyntaxVariant == 1) - return new ARM64AppleInstPrinter(MAI, MII, MRI, STI); - - return nullptr; -} - -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll, - bool NoExecStack) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, - /*LabelSections*/ true); - - return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); -} - -// Force static initialization. -extern "C" void LLVMInitializeARM64TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheARM64leTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn Y(TheARM64beTarget, createARM64MCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, - createARM64MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, createARM64MCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, createARM64MCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, - createARM64MCSubtargetInfo); - - // Register the asm backend. 
- TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, createARM64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, createARM64beAsmBackend); - - // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, - createARM64MCCodeEmitter); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, - createARM64MCInstPrinter); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h deleted file mode 100644 index 954dcdbb8bc0..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h +++ /dev/null @@ -1,66 +0,0 @@ -//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MCTARGETDESC_H -#define ARM64MCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCRegisterInfo; -class MCObjectWriter; -class MCSubtargetInfo; -class StringRef; -class Target; -class raw_ostream; - -extern Target TheARM64leTarget; -extern Target TheARM64beTarget; - -MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); -MCAsmBackend *createARM64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); -MCAsmBackend *createARM64beAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - - MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, - bool IsLittleEndian); - -MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, - uint32_t CPUSubtype); - -} // End llvm namespace - -// Defines symbolic names for ARM64 registers. This defines a mapping from -// register name to register number. -// -#define GET_REGINFO_ENUM -#include "ARM64GenRegisterInfo.inc" - -// Defines symbolic names for the ARM64 instructions. 
-// -#define GET_INSTRINFO_ENUM -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "ARM64GenSubtargetInfo.inc" - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index f8665bcfe949..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -add_llvm_library(LLVMARM64Desc - ARM64AsmBackend.cpp - ARM64ELFObjectWriter.cpp - ARM64ELFStreamer.cpp - ARM64MCAsmInfo.cpp - ARM64MCCodeEmitter.cpp - ARM64MCExpr.cpp - ARM64MCTargetDesc.cpp - ARM64MachObjectWriter.cpp -) -add_dependencies(LLVMARM64Desc ARM64CommonTableGen) - -# Hack: we need to include 'main' target directory to grab private headers -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index e4c74d285d48..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Desc -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Info MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/MCTargetDesc/Makefile b/lib/Target/ARM64/MCTargetDesc/Makefile deleted file mode 100644 index 013cc633f664..000000000000 --- a/lib/Target/ARM64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Makefile b/lib/Target/ARM64/Makefile deleted file mode 100644 index cfb05d2a87ba..000000000000 --- a/lib/Target/ARM64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMARM64CodeGen -TARGET = ARM64 - -# Make sure that tblgen is run, first thing. 
-BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \ - ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \ - ARM64GenDAGISel.inc \ - ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \ - ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \ - ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \ - ARM64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp deleted file mode 100644 index c2b6f5c70456..000000000000 --- a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -namespace llvm { -Target TheARM64leTarget; -Target TheARM64beTarget; -} // end namespace llvm - -extern "C" void LLVMInitializeARM64TargetInfo() { - RegisterTarget X(TheARM64leTarget, "arm64", - "ARM64 (little endian)"); - RegisterTarget Y(TheARM64beTarget, "arm64_be", - "ARM64 (big endian)"); -} diff --git a/lib/Target/ARM64/TargetInfo/CMakeLists.txt b/lib/Target/ARM64/TargetInfo/CMakeLists.txt deleted file mode 100644 index a0142c40713e..000000000000 --- a/lib/Target/ARM64/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Info - ARM64TargetInfo.cpp - ) - -add_dependencies(LLVMARM64Info ARM64CommonTableGen) diff --git a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt deleted file mode 100644 index b9ecb7069523..000000000000 --- a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Info -parent = ARM64 -required_libraries = Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/TargetInfo/Makefile b/lib/Target/ARM64/TargetInfo/Makefile deleted file mode 100644 index 2d5a1a087a57..000000000000 --- a/lib/Target/ARM64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp b/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp deleted file mode 100644 index 5142d18c23cc..000000000000 --- a/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp +++ /dev/null @@ -1,901 +0,0 @@ -//===-- ARM64BaseInfo.cpp - ARM64 Base encoding information------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides basic encoding and assembly information for ARM64. -// -//===----------------------------------------------------------------------===// -#include "ARM64BaseInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Regex.h" - -using namespace llvm; - -StringRef ARM64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Value == Value) { - Valid = true; - return Pairs[i].Name; - } - } - - Valid = false; - return StringRef(); -} - -uint32_t ARM64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { - std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Name == LowerCaseName) { - Valid = true; - return Pairs[i].Value; - } - } - - Valid = false; - return -1; -} - -bool ARM64NamedImmMapper::validImm(uint32_t Value) const { - return Value < TooBigImm; -} - -const ARM64NamedImmMapper::Mapping ARM64AT::ATMapper::ATPairs[] = { - {"s1e1r", S1E1R}, - {"s1e2r", S1E2R}, - {"s1e3r", S1E3R}, - {"s1e1w", S1E1W}, - {"s1e2w", S1E2W}, - {"s1e3w", S1E3W}, - {"s1e0r", S1E0R}, - {"s1e0w", S1E0W}, - {"s12e1r", S12E1R}, - {"s12e1w", S12E1W}, - {"s12e0r", S12E0R}, - {"s12e0w", S12E0W}, -}; - -ARM64AT::ATMapper::ATMapper() - : ARM64NamedImmMapper(ATPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64DB::DBarrierMapper::DBarrierPairs[] = { - {"oshld", OSHLD}, - {"oshst", OSHST}, - {"osh", OSH}, - {"nshld", NSHLD}, - {"nshst", NSHST}, - {"nsh", NSH}, - {"ishld", ISHLD}, - {"ishst", ISHST}, - {"ish", ISH}, - {"ld", LD}, - {"st", ST}, - {"sy", SY} -}; - -ARM64DB::DBarrierMapper::DBarrierMapper() - : ARM64NamedImmMapper(DBarrierPairs, 16u) {} - -const ARM64NamedImmMapper::Mapping ARM64DC::DCMapper::DCPairs[] = { - {"zva", ZVA}, - {"ivac", IVAC}, - {"isw", ISW}, - {"cvac", CVAC}, - {"csw", CSW}, - {"cvau", CVAU}, - {"civac", CIVAC}, - {"cisw", CISW} -}; - -ARM64DC::DCMapper::DCMapper() - : ARM64NamedImmMapper(DCPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64IC::ICMapper::ICPairs[] = { - {"ialluis", IALLUIS}, - {"iallu", IALLU}, - {"ivau", IVAU} -}; - -ARM64IC::ICMapper::ICMapper() - : ARM64NamedImmMapper(ICPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64ISB::ISBMapper::ISBPairs[] = { - {"sy", SY}, -}; - -ARM64ISB::ISBMapper::ISBMapper() - : ARM64NamedImmMapper(ISBPairs, 16) {} - -const ARM64NamedImmMapper::Mapping ARM64PRFM::PRFMMapper::PRFMPairs[] = { - {"pldl1keep", PLDL1KEEP}, - {"pldl1strm", PLDL1STRM}, - {"pldl2keep", PLDL2KEEP}, - {"pldl2strm", PLDL2STRM}, - {"pldl3keep", PLDL3KEEP}, - {"pldl3strm", PLDL3STRM}, - {"plil1keep", PLIL1KEEP}, - {"plil1strm", PLIL1STRM}, - {"plil2keep", PLIL2KEEP}, - {"plil2strm", PLIL2STRM}, - {"plil3keep", PLIL3KEEP}, - {"plil3strm", PLIL3STRM}, - {"pstl1keep", PSTL1KEEP}, - {"pstl1strm", PSTL1STRM}, - {"pstl2keep", PSTL2KEEP}, - {"pstl2strm", PSTL2STRM}, - 
{"pstl3keep", PSTL3KEEP}, - {"pstl3strm", PSTL3STRM} -}; - -ARM64PRFM::PRFMMapper::PRFMMapper() - : ARM64NamedImmMapper(PRFMPairs, 32) {} - -const ARM64NamedImmMapper::Mapping ARM64PState::PStateMapper::PStatePairs[] = { - {"spsel", SPSel}, - {"daifset", DAIFSet}, - {"daifclr", DAIFClr} -}; - -ARM64PState::PStateMapper::PStateMapper() - : ARM64NamedImmMapper(PStatePairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64SysReg::MRSMapper::MRSPairs[] = { - {"mdccsr_el0", MDCCSR_EL0}, - {"dbgdtrrx_el0", DBGDTRRX_EL0}, - {"mdrar_el1", MDRAR_EL1}, - {"oslsr_el1", OSLSR_EL1}, - {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1}, - {"pmceid0_el0", PMCEID0_EL0}, - {"pmceid1_el0", PMCEID1_EL0}, - {"midr_el1", MIDR_EL1}, - {"ccsidr_el1", CCSIDR_EL1}, - {"clidr_el1", CLIDR_EL1}, - {"ctr_el0", CTR_EL0}, - {"mpidr_el1", MPIDR_EL1}, - {"revidr_el1", REVIDR_EL1}, - {"aidr_el1", AIDR_EL1}, - {"dczid_el0", DCZID_EL0}, - {"id_pfr0_el1", ID_PFR0_EL1}, - {"id_pfr1_el1", ID_PFR1_EL1}, - {"id_dfr0_el1", ID_DFR0_EL1}, - {"id_afr0_el1", ID_AFR0_EL1}, - {"id_mmfr0_el1", ID_MMFR0_EL1}, - {"id_mmfr1_el1", ID_MMFR1_EL1}, - {"id_mmfr2_el1", ID_MMFR2_EL1}, - {"id_mmfr3_el1", ID_MMFR3_EL1}, - {"id_isar0_el1", ID_ISAR0_EL1}, - {"id_isar1_el1", ID_ISAR1_EL1}, - {"id_isar2_el1", ID_ISAR2_EL1}, - {"id_isar3_el1", ID_ISAR3_EL1}, - {"id_isar4_el1", ID_ISAR4_EL1}, - {"id_isar5_el1", ID_ISAR5_EL1}, - {"id_aa64pfr0_el1", ID_AARM64PFR0_EL1}, - {"id_aa64pfr1_el1", ID_AARM64PFR1_EL1}, - {"id_aa64dfr0_el1", ID_AARM64DFR0_EL1}, - {"id_aa64dfr1_el1", ID_AARM64DFR1_EL1}, - {"id_aa64afr0_el1", ID_AARM64AFR0_EL1}, - {"id_aa64afr1_el1", ID_AARM64AFR1_EL1}, - {"id_aa64isar0_el1", ID_AARM64ISAR0_EL1}, - {"id_aa64isar1_el1", ID_AARM64ISAR1_EL1}, - {"id_aa64mmfr0_el1", ID_AARM64MMFR0_EL1}, - {"id_aa64mmfr1_el1", ID_AARM64MMFR1_EL1}, - {"mvfr0_el1", MVFR0_EL1}, - {"mvfr1_el1", MVFR1_EL1}, - {"mvfr2_el1", MVFR2_EL1}, - {"rvbar_el1", RVBAR_EL1}, - {"rvbar_el2", RVBAR_EL2}, - {"rvbar_el3", RVBAR_EL3}, - {"isr_el1", ISR_EL1}, - {"cntpct_el0", CNTPCT_EL0}, - {"cntvct_el0", CNTVCT_EL0}, - - // Trace registers - {"trcstatr", TRCSTATR}, - {"trcidr8", TRCIDR8}, - {"trcidr9", TRCIDR9}, - {"trcidr10", TRCIDR10}, - {"trcidr11", TRCIDR11}, - {"trcidr12", TRCIDR12}, - {"trcidr13", TRCIDR13}, - {"trcidr0", TRCIDR0}, - {"trcidr1", TRCIDR1}, - {"trcidr2", TRCIDR2}, - {"trcidr3", TRCIDR3}, - {"trcidr4", TRCIDR4}, - {"trcidr5", TRCIDR5}, - {"trcidr6", TRCIDR6}, - {"trcidr7", TRCIDR7}, - {"trcoslsr", TRCOSLSR}, - {"trcpdsr", TRCPDSR}, - {"trcdevaff0", TRCDEVAFF0}, - {"trcdevaff1", TRCDEVAFF1}, - {"trclsr", TRCLSR}, - {"trcauthstatus", TRCAUTHSTATUS}, - {"trcdevarch", TRCDEVARCH}, - {"trcdevid", TRCDEVID}, - {"trcdevtype", TRCDEVTYPE}, - {"trcpidr4", TRCPIDR4}, - {"trcpidr5", TRCPIDR5}, - {"trcpidr6", TRCPIDR6}, - {"trcpidr7", TRCPIDR7}, - {"trcpidr0", TRCPIDR0}, - {"trcpidr1", TRCPIDR1}, - {"trcpidr2", TRCPIDR2}, - {"trcpidr3", TRCPIDR3}, - {"trccidr0", TRCCIDR0}, - {"trccidr1", TRCCIDR1}, - {"trccidr2", TRCCIDR2}, - {"trccidr3", TRCCIDR3}, - - // GICv3 registers - {"icc_iar1_el1", ICC_IAR1_EL1}, - {"icc_iar0_el1", ICC_IAR0_EL1}, - {"icc_hppir1_el1", ICC_HPPIR1_EL1}, - {"icc_hppir0_el1", ICC_HPPIR0_EL1}, - {"icc_rpr_el1", ICC_RPR_EL1}, - {"ich_vtr_el2", ICH_VTR_EL2}, - {"ich_eisr_el2", ICH_EISR_EL2}, - {"ich_elsr_el2", ICH_ELSR_EL2} -}; - -ARM64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MRSPairs[0]; - NumInstPairs = llvm::array_lengthof(MRSPairs); -} - -const ARM64NamedImmMapper::Mapping 
ARM64SysReg::MSRMapper::MSRPairs[] = { - {"dbgdtrtx_el0", DBGDTRTX_EL0}, - {"oslar_el1", OSLAR_EL1}, - {"pmswinc_el0", PMSWINC_EL0}, - - // Trace registers - {"trcoslar", TRCOSLAR}, - {"trclar", TRCLAR}, - - // GICv3 registers - {"icc_eoir1_el1", ICC_EOIR1_EL1}, - {"icc_eoir0_el1", ICC_EOIR0_EL1}, - {"icc_dir_el1", ICC_DIR_EL1}, - {"icc_sgi1r_el1", ICC_SGI1R_EL1}, - {"icc_asgi1r_el1", ICC_ASGI1R_EL1}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1} -}; - -ARM64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MSRPairs[0]; - NumInstPairs = llvm::array_lengthof(MSRPairs); -} - - -const ARM64NamedImmMapper::Mapping ARM64SysReg::SysRegMapper::SysRegPairs[] = { - {"osdtrrx_el1", OSDTRRX_EL1}, - {"osdtrtx_el1", OSDTRTX_EL1}, - {"teecr32_el1", TEECR32_EL1}, - {"mdccint_el1", MDCCINT_EL1}, - {"mdscr_el1", MDSCR_EL1}, - {"dbgdtr_el0", DBGDTR_EL0}, - {"oseccr_el1", OSECCR_EL1}, - {"dbgvcr32_el2", DBGVCR32_EL2}, - {"dbgbvr0_el1", DBGBVR0_EL1}, - {"dbgbvr1_el1", DBGBVR1_EL1}, - {"dbgbvr2_el1", DBGBVR2_EL1}, - {"dbgbvr3_el1", DBGBVR3_EL1}, - {"dbgbvr4_el1", DBGBVR4_EL1}, - {"dbgbvr5_el1", DBGBVR5_EL1}, - {"dbgbvr6_el1", DBGBVR6_EL1}, - {"dbgbvr7_el1", DBGBVR7_EL1}, - {"dbgbvr8_el1", DBGBVR8_EL1}, - {"dbgbvr9_el1", DBGBVR9_EL1}, - {"dbgbvr10_el1", DBGBVR10_EL1}, - {"dbgbvr11_el1", DBGBVR11_EL1}, - {"dbgbvr12_el1", DBGBVR12_EL1}, - {"dbgbvr13_el1", DBGBVR13_EL1}, - {"dbgbvr14_el1", DBGBVR14_EL1}, - {"dbgbvr15_el1", DBGBVR15_EL1}, - {"dbgbcr0_el1", DBGBCR0_EL1}, - {"dbgbcr1_el1", DBGBCR1_EL1}, - {"dbgbcr2_el1", DBGBCR2_EL1}, - {"dbgbcr3_el1", DBGBCR3_EL1}, - {"dbgbcr4_el1", DBGBCR4_EL1}, - {"dbgbcr5_el1", DBGBCR5_EL1}, - {"dbgbcr6_el1", DBGBCR6_EL1}, - {"dbgbcr7_el1", DBGBCR7_EL1}, - {"dbgbcr8_el1", DBGBCR8_EL1}, - {"dbgbcr9_el1", DBGBCR9_EL1}, - {"dbgbcr10_el1", DBGBCR10_EL1}, - {"dbgbcr11_el1", DBGBCR11_EL1}, - {"dbgbcr12_el1", DBGBCR12_EL1}, - {"dbgbcr13_el1", DBGBCR13_EL1}, - {"dbgbcr14_el1", DBGBCR14_EL1}, - {"dbgbcr15_el1", DBGBCR15_EL1}, - {"dbgwvr0_el1", DBGWVR0_EL1}, - {"dbgwvr1_el1", DBGWVR1_EL1}, - {"dbgwvr2_el1", DBGWVR2_EL1}, - {"dbgwvr3_el1", DBGWVR3_EL1}, - {"dbgwvr4_el1", DBGWVR4_EL1}, - {"dbgwvr5_el1", DBGWVR5_EL1}, - {"dbgwvr6_el1", DBGWVR6_EL1}, - {"dbgwvr7_el1", DBGWVR7_EL1}, - {"dbgwvr8_el1", DBGWVR8_EL1}, - {"dbgwvr9_el1", DBGWVR9_EL1}, - {"dbgwvr10_el1", DBGWVR10_EL1}, - {"dbgwvr11_el1", DBGWVR11_EL1}, - {"dbgwvr12_el1", DBGWVR12_EL1}, - {"dbgwvr13_el1", DBGWVR13_EL1}, - {"dbgwvr14_el1", DBGWVR14_EL1}, - {"dbgwvr15_el1", DBGWVR15_EL1}, - {"dbgwcr0_el1", DBGWCR0_EL1}, - {"dbgwcr1_el1", DBGWCR1_EL1}, - {"dbgwcr2_el1", DBGWCR2_EL1}, - {"dbgwcr3_el1", DBGWCR3_EL1}, - {"dbgwcr4_el1", DBGWCR4_EL1}, - {"dbgwcr5_el1", DBGWCR5_EL1}, - {"dbgwcr6_el1", DBGWCR6_EL1}, - {"dbgwcr7_el1", DBGWCR7_EL1}, - {"dbgwcr8_el1", DBGWCR8_EL1}, - {"dbgwcr9_el1", DBGWCR9_EL1}, - {"dbgwcr10_el1", DBGWCR10_EL1}, - {"dbgwcr11_el1", DBGWCR11_EL1}, - {"dbgwcr12_el1", DBGWCR12_EL1}, - {"dbgwcr13_el1", DBGWCR13_EL1}, - {"dbgwcr14_el1", DBGWCR14_EL1}, - {"dbgwcr15_el1", DBGWCR15_EL1}, - {"teehbr32_el1", TEEHBR32_EL1}, - {"osdlr_el1", OSDLR_EL1}, - {"dbgprcr_el1", DBGPRCR_EL1}, - {"dbgclaimset_el1", DBGCLAIMSET_EL1}, - {"dbgclaimclr_el1", DBGCLAIMCLR_EL1}, - {"csselr_el1", CSSELR_EL1}, - {"vpidr_el2", VPIDR_EL2}, - {"vmpidr_el2", VMPIDR_EL2}, - {"sctlr_el1", SCTLR_EL1}, - {"sctlr_el2", SCTLR_EL2}, - {"sctlr_el3", SCTLR_EL3}, - {"actlr_el1", ACTLR_EL1}, - {"actlr_el2", ACTLR_EL2}, - {"actlr_el3", ACTLR_EL3}, - {"cpacr_el1", CPACR_EL1}, - {"hcr_el2", HCR_EL2}, - {"scr_el3", SCR_EL3}, - 
{"mdcr_el2", MDCR_EL2}, - {"sder32_el3", SDER32_EL3}, - {"cptr_el2", CPTR_EL2}, - {"cptr_el3", CPTR_EL3}, - {"hstr_el2", HSTR_EL2}, - {"hacr_el2", HACR_EL2}, - {"mdcr_el3", MDCR_EL3}, - {"ttbr0_el1", TTBR0_EL1}, - {"ttbr0_el2", TTBR0_EL2}, - {"ttbr0_el3", TTBR0_EL3}, - {"ttbr1_el1", TTBR1_EL1}, - {"tcr_el1", TCR_EL1}, - {"tcr_el2", TCR_EL2}, - {"tcr_el3", TCR_EL3}, - {"vttbr_el2", VTTBR_EL2}, - {"vtcr_el2", VTCR_EL2}, - {"dacr32_el2", DACR32_EL2}, - {"spsr_el1", SPSR_EL1}, - {"spsr_el2", SPSR_EL2}, - {"spsr_el3", SPSR_EL3}, - {"elr_el1", ELR_EL1}, - {"elr_el2", ELR_EL2}, - {"elr_el3", ELR_EL3}, - {"sp_el0", SP_EL0}, - {"sp_el1", SP_EL1}, - {"sp_el2", SP_EL2}, - {"spsel", SPSel}, - {"nzcv", NZCV}, - {"daif", DAIF}, - {"currentel", CurrentEL}, - {"spsr_irq", SPSR_irq}, - {"spsr_abt", SPSR_abt}, - {"spsr_und", SPSR_und}, - {"spsr_fiq", SPSR_fiq}, - {"fpcr", FPCR}, - {"fpsr", FPSR}, - {"dspsr_el0", DSPSR_EL0}, - {"dlr_el0", DLR_EL0}, - {"ifsr32_el2", IFSR32_EL2}, - {"afsr0_el1", AFSR0_EL1}, - {"afsr0_el2", AFSR0_EL2}, - {"afsr0_el3", AFSR0_EL3}, - {"afsr1_el1", AFSR1_EL1}, - {"afsr1_el2", AFSR1_EL2}, - {"afsr1_el3", AFSR1_EL3}, - {"esr_el1", ESR_EL1}, - {"esr_el2", ESR_EL2}, - {"esr_el3", ESR_EL3}, - {"fpexc32_el2", FPEXC32_EL2}, - {"far_el1", FAR_EL1}, - {"far_el2", FAR_EL2}, - {"far_el3", FAR_EL3}, - {"hpfar_el2", HPFAR_EL2}, - {"par_el1", PAR_EL1}, - {"pmcr_el0", PMCR_EL0}, - {"pmcntenset_el0", PMCNTENSET_EL0}, - {"pmcntenclr_el0", PMCNTENCLR_EL0}, - {"pmovsclr_el0", PMOVSCLR_EL0}, - {"pmselr_el0", PMSELR_EL0}, - {"pmccntr_el0", PMCCNTR_EL0}, - {"pmxevtyper_el0", PMXEVTYPER_EL0}, - {"pmxevcntr_el0", PMXEVCNTR_EL0}, - {"pmuserenr_el0", PMUSERENR_EL0}, - {"pmintenset_el1", PMINTENSET_EL1}, - {"pmintenclr_el1", PMINTENCLR_EL1}, - {"pmovsset_el0", PMOVSSET_EL0}, - {"mair_el1", MAIR_EL1}, - {"mair_el2", MAIR_EL2}, - {"mair_el3", MAIR_EL3}, - {"amair_el1", AMAIR_EL1}, - {"amair_el2", AMAIR_EL2}, - {"amair_el3", AMAIR_EL3}, - {"vbar_el1", VBAR_EL1}, - {"vbar_el2", VBAR_EL2}, - {"vbar_el3", VBAR_EL3}, - {"rmr_el1", RMR_EL1}, - {"rmr_el2", RMR_EL2}, - {"rmr_el3", RMR_EL3}, - {"contextidr_el1", CONTEXTIDR_EL1}, - {"tpidr_el0", TPIDR_EL0}, - {"tpidr_el2", TPIDR_EL2}, - {"tpidr_el3", TPIDR_EL3}, - {"tpidrro_el0", TPIDRRO_EL0}, - {"tpidr_el1", TPIDR_EL1}, - {"cntfrq_el0", CNTFRQ_EL0}, - {"cntvoff_el2", CNTVOFF_EL2}, - {"cntkctl_el1", CNTKCTL_EL1}, - {"cnthctl_el2", CNTHCTL_EL2}, - {"cntp_tval_el0", CNTP_TVAL_EL0}, - {"cnthp_tval_el2", CNTHP_TVAL_EL2}, - {"cntps_tval_el1", CNTPS_TVAL_EL1}, - {"cntp_ctl_el0", CNTP_CTL_EL0}, - {"cnthp_ctl_el2", CNTHP_CTL_EL2}, - {"cntps_ctl_el1", CNTPS_CTL_EL1}, - {"cntp_cval_el0", CNTP_CVAL_EL0}, - {"cnthp_cval_el2", CNTHP_CVAL_EL2}, - {"cntps_cval_el1", CNTPS_CVAL_EL1}, - {"cntv_tval_el0", CNTV_TVAL_EL0}, - {"cntv_ctl_el0", CNTV_CTL_EL0}, - {"cntv_cval_el0", CNTV_CVAL_EL0}, - {"pmevcntr0_el0", PMEVCNTR0_EL0}, - {"pmevcntr1_el0", PMEVCNTR1_EL0}, - {"pmevcntr2_el0", PMEVCNTR2_EL0}, - {"pmevcntr3_el0", PMEVCNTR3_EL0}, - {"pmevcntr4_el0", PMEVCNTR4_EL0}, - {"pmevcntr5_el0", PMEVCNTR5_EL0}, - {"pmevcntr6_el0", PMEVCNTR6_EL0}, - {"pmevcntr7_el0", PMEVCNTR7_EL0}, - {"pmevcntr8_el0", PMEVCNTR8_EL0}, - {"pmevcntr9_el0", PMEVCNTR9_EL0}, - {"pmevcntr10_el0", PMEVCNTR10_EL0}, - {"pmevcntr11_el0", PMEVCNTR11_EL0}, - {"pmevcntr12_el0", PMEVCNTR12_EL0}, - {"pmevcntr13_el0", PMEVCNTR13_EL0}, - {"pmevcntr14_el0", PMEVCNTR14_EL0}, - {"pmevcntr15_el0", PMEVCNTR15_EL0}, - {"pmevcntr16_el0", PMEVCNTR16_EL0}, - {"pmevcntr17_el0", PMEVCNTR17_EL0}, - {"pmevcntr18_el0", PMEVCNTR18_EL0}, - 
{"pmevcntr19_el0", PMEVCNTR19_EL0}, - {"pmevcntr20_el0", PMEVCNTR20_EL0}, - {"pmevcntr21_el0", PMEVCNTR21_EL0}, - {"pmevcntr22_el0", PMEVCNTR22_EL0}, - {"pmevcntr23_el0", PMEVCNTR23_EL0}, - {"pmevcntr24_el0", PMEVCNTR24_EL0}, - {"pmevcntr25_el0", PMEVCNTR25_EL0}, - {"pmevcntr26_el0", PMEVCNTR26_EL0}, - {"pmevcntr27_el0", PMEVCNTR27_EL0}, - {"pmevcntr28_el0", PMEVCNTR28_EL0}, - {"pmevcntr29_el0", PMEVCNTR29_EL0}, - {"pmevcntr30_el0", PMEVCNTR30_EL0}, - {"pmccfiltr_el0", PMCCFILTR_EL0}, - {"pmevtyper0_el0", PMEVTYPER0_EL0}, - {"pmevtyper1_el0", PMEVTYPER1_EL0}, - {"pmevtyper2_el0", PMEVTYPER2_EL0}, - {"pmevtyper3_el0", PMEVTYPER3_EL0}, - {"pmevtyper4_el0", PMEVTYPER4_EL0}, - {"pmevtyper5_el0", PMEVTYPER5_EL0}, - {"pmevtyper6_el0", PMEVTYPER6_EL0}, - {"pmevtyper7_el0", PMEVTYPER7_EL0}, - {"pmevtyper8_el0", PMEVTYPER8_EL0}, - {"pmevtyper9_el0", PMEVTYPER9_EL0}, - {"pmevtyper10_el0", PMEVTYPER10_EL0}, - {"pmevtyper11_el0", PMEVTYPER11_EL0}, - {"pmevtyper12_el0", PMEVTYPER12_EL0}, - {"pmevtyper13_el0", PMEVTYPER13_EL0}, - {"pmevtyper14_el0", PMEVTYPER14_EL0}, - {"pmevtyper15_el0", PMEVTYPER15_EL0}, - {"pmevtyper16_el0", PMEVTYPER16_EL0}, - {"pmevtyper17_el0", PMEVTYPER17_EL0}, - {"pmevtyper18_el0", PMEVTYPER18_EL0}, - {"pmevtyper19_el0", PMEVTYPER19_EL0}, - {"pmevtyper20_el0", PMEVTYPER20_EL0}, - {"pmevtyper21_el0", PMEVTYPER21_EL0}, - {"pmevtyper22_el0", PMEVTYPER22_EL0}, - {"pmevtyper23_el0", PMEVTYPER23_EL0}, - {"pmevtyper24_el0", PMEVTYPER24_EL0}, - {"pmevtyper25_el0", PMEVTYPER25_EL0}, - {"pmevtyper26_el0", PMEVTYPER26_EL0}, - {"pmevtyper27_el0", PMEVTYPER27_EL0}, - {"pmevtyper28_el0", PMEVTYPER28_EL0}, - {"pmevtyper29_el0", PMEVTYPER29_EL0}, - {"pmevtyper30_el0", PMEVTYPER30_EL0}, - - // Trace registers - {"trcprgctlr", TRCPRGCTLR}, - {"trcprocselr", TRCPROCSELR}, - {"trcconfigr", TRCCONFIGR}, - {"trcauxctlr", TRCAUXCTLR}, - {"trceventctl0r", TRCEVENTCTL0R}, - {"trceventctl1r", TRCEVENTCTL1R}, - {"trcstallctlr", TRCSTALLCTLR}, - {"trctsctlr", TRCTSCTLR}, - {"trcsyncpr", TRCSYNCPR}, - {"trcccctlr", TRCCCCTLR}, - {"trcbbctlr", TRCBBCTLR}, - {"trctraceidr", TRCTRACEIDR}, - {"trcqctlr", TRCQCTLR}, - {"trcvictlr", TRCVICTLR}, - {"trcviiectlr", TRCVIIECTLR}, - {"trcvissctlr", TRCVISSCTLR}, - {"trcvipcssctlr", TRCVIPCSSCTLR}, - {"trcvdctlr", TRCVDCTLR}, - {"trcvdsacctlr", TRCVDSACCTLR}, - {"trcvdarcctlr", TRCVDARCCTLR}, - {"trcseqevr0", TRCSEQEVR0}, - {"trcseqevr1", TRCSEQEVR1}, - {"trcseqevr2", TRCSEQEVR2}, - {"trcseqrstevr", TRCSEQRSTEVR}, - {"trcseqstr", TRCSEQSTR}, - {"trcextinselr", TRCEXTINSELR}, - {"trccntrldvr0", TRCCNTRLDVR0}, - {"trccntrldvr1", TRCCNTRLDVR1}, - {"trccntrldvr2", TRCCNTRLDVR2}, - {"trccntrldvr3", TRCCNTRLDVR3}, - {"trccntctlr0", TRCCNTCTLR0}, - {"trccntctlr1", TRCCNTCTLR1}, - {"trccntctlr2", TRCCNTCTLR2}, - {"trccntctlr3", TRCCNTCTLR3}, - {"trccntvr0", TRCCNTVR0}, - {"trccntvr1", TRCCNTVR1}, - {"trccntvr2", TRCCNTVR2}, - {"trccntvr3", TRCCNTVR3}, - {"trcimspec0", TRCIMSPEC0}, - {"trcimspec1", TRCIMSPEC1}, - {"trcimspec2", TRCIMSPEC2}, - {"trcimspec3", TRCIMSPEC3}, - {"trcimspec4", TRCIMSPEC4}, - {"trcimspec5", TRCIMSPEC5}, - {"trcimspec6", TRCIMSPEC6}, - {"trcimspec7", TRCIMSPEC7}, - {"trcrsctlr2", TRCRSCTLR2}, - {"trcrsctlr3", TRCRSCTLR3}, - {"trcrsctlr4", TRCRSCTLR4}, - {"trcrsctlr5", TRCRSCTLR5}, - {"trcrsctlr6", TRCRSCTLR6}, - {"trcrsctlr7", TRCRSCTLR7}, - {"trcrsctlr8", TRCRSCTLR8}, - {"trcrsctlr9", TRCRSCTLR9}, - {"trcrsctlr10", TRCRSCTLR10}, - {"trcrsctlr11", TRCRSCTLR11}, - {"trcrsctlr12", TRCRSCTLR12}, - {"trcrsctlr13", TRCRSCTLR13}, - {"trcrsctlr14", 
TRCRSCTLR14}, - {"trcrsctlr15", TRCRSCTLR15}, - {"trcrsctlr16", TRCRSCTLR16}, - {"trcrsctlr17", TRCRSCTLR17}, - {"trcrsctlr18", TRCRSCTLR18}, - {"trcrsctlr19", TRCRSCTLR19}, - {"trcrsctlr20", TRCRSCTLR20}, - {"trcrsctlr21", TRCRSCTLR21}, - {"trcrsctlr22", TRCRSCTLR22}, - {"trcrsctlr23", TRCRSCTLR23}, - {"trcrsctlr24", TRCRSCTLR24}, - {"trcrsctlr25", TRCRSCTLR25}, - {"trcrsctlr26", TRCRSCTLR26}, - {"trcrsctlr27", TRCRSCTLR27}, - {"trcrsctlr28", TRCRSCTLR28}, - {"trcrsctlr29", TRCRSCTLR29}, - {"trcrsctlr30", TRCRSCTLR30}, - {"trcrsctlr31", TRCRSCTLR31}, - {"trcssccr0", TRCSSCCR0}, - {"trcssccr1", TRCSSCCR1}, - {"trcssccr2", TRCSSCCR2}, - {"trcssccr3", TRCSSCCR3}, - {"trcssccr4", TRCSSCCR4}, - {"trcssccr5", TRCSSCCR5}, - {"trcssccr6", TRCSSCCR6}, - {"trcssccr7", TRCSSCCR7}, - {"trcsscsr0", TRCSSCSR0}, - {"trcsscsr1", TRCSSCSR1}, - {"trcsscsr2", TRCSSCSR2}, - {"trcsscsr3", TRCSSCSR3}, - {"trcsscsr4", TRCSSCSR4}, - {"trcsscsr5", TRCSSCSR5}, - {"trcsscsr6", TRCSSCSR6}, - {"trcsscsr7", TRCSSCSR7}, - {"trcsspcicr0", TRCSSPCICR0}, - {"trcsspcicr1", TRCSSPCICR1}, - {"trcsspcicr2", TRCSSPCICR2}, - {"trcsspcicr3", TRCSSPCICR3}, - {"trcsspcicr4", TRCSSPCICR4}, - {"trcsspcicr5", TRCSSPCICR5}, - {"trcsspcicr6", TRCSSPCICR6}, - {"trcsspcicr7", TRCSSPCICR7}, - {"trcpdcr", TRCPDCR}, - {"trcacvr0", TRCACVR0}, - {"trcacvr1", TRCACVR1}, - {"trcacvr2", TRCACVR2}, - {"trcacvr3", TRCACVR3}, - {"trcacvr4", TRCACVR4}, - {"trcacvr5", TRCACVR5}, - {"trcacvr6", TRCACVR6}, - {"trcacvr7", TRCACVR7}, - {"trcacvr8", TRCACVR8}, - {"trcacvr9", TRCACVR9}, - {"trcacvr10", TRCACVR10}, - {"trcacvr11", TRCACVR11}, - {"trcacvr12", TRCACVR12}, - {"trcacvr13", TRCACVR13}, - {"trcacvr14", TRCACVR14}, - {"trcacvr15", TRCACVR15}, - {"trcacatr0", TRCACATR0}, - {"trcacatr1", TRCACATR1}, - {"trcacatr2", TRCACATR2}, - {"trcacatr3", TRCACATR3}, - {"trcacatr4", TRCACATR4}, - {"trcacatr5", TRCACATR5}, - {"trcacatr6", TRCACATR6}, - {"trcacatr7", TRCACATR7}, - {"trcacatr8", TRCACATR8}, - {"trcacatr9", TRCACATR9}, - {"trcacatr10", TRCACATR10}, - {"trcacatr11", TRCACATR11}, - {"trcacatr12", TRCACATR12}, - {"trcacatr13", TRCACATR13}, - {"trcacatr14", TRCACATR14}, - {"trcacatr15", TRCACATR15}, - {"trcdvcvr0", TRCDVCVR0}, - {"trcdvcvr1", TRCDVCVR1}, - {"trcdvcvr2", TRCDVCVR2}, - {"trcdvcvr3", TRCDVCVR3}, - {"trcdvcvr4", TRCDVCVR4}, - {"trcdvcvr5", TRCDVCVR5}, - {"trcdvcvr6", TRCDVCVR6}, - {"trcdvcvr7", TRCDVCVR7}, - {"trcdvcmr0", TRCDVCMR0}, - {"trcdvcmr1", TRCDVCMR1}, - {"trcdvcmr2", TRCDVCMR2}, - {"trcdvcmr3", TRCDVCMR3}, - {"trcdvcmr4", TRCDVCMR4}, - {"trcdvcmr5", TRCDVCMR5}, - {"trcdvcmr6", TRCDVCMR6}, - {"trcdvcmr7", TRCDVCMR7}, - {"trccidcvr0", TRCCIDCVR0}, - {"trccidcvr1", TRCCIDCVR1}, - {"trccidcvr2", TRCCIDCVR2}, - {"trccidcvr3", TRCCIDCVR3}, - {"trccidcvr4", TRCCIDCVR4}, - {"trccidcvr5", TRCCIDCVR5}, - {"trccidcvr6", TRCCIDCVR6}, - {"trccidcvr7", TRCCIDCVR7}, - {"trcvmidcvr0", TRCVMIDCVR0}, - {"trcvmidcvr1", TRCVMIDCVR1}, - {"trcvmidcvr2", TRCVMIDCVR2}, - {"trcvmidcvr3", TRCVMIDCVR3}, - {"trcvmidcvr4", TRCVMIDCVR4}, - {"trcvmidcvr5", TRCVMIDCVR5}, - {"trcvmidcvr6", TRCVMIDCVR6}, - {"trcvmidcvr7", TRCVMIDCVR7}, - {"trccidcctlr0", TRCCIDCCTLR0}, - {"trccidcctlr1", TRCCIDCCTLR1}, - {"trcvmidcctlr0", TRCVMIDCCTLR0}, - {"trcvmidcctlr1", TRCVMIDCCTLR1}, - {"trcitctrl", TRCITCTRL}, - {"trcclaimset", TRCCLAIMSET}, - {"trcclaimclr", TRCCLAIMCLR}, - - // GICv3 registers - {"icc_bpr1_el1", ICC_BPR1_EL1}, - {"icc_bpr0_el1", ICC_BPR0_EL1}, - {"icc_pmr_el1", ICC_PMR_EL1}, - {"icc_ctlr_el1", ICC_CTLR_EL1}, - {"icc_ctlr_el3", ICC_CTLR_EL3}, - 
{"icc_sre_el1", ICC_SRE_EL1}, - {"icc_sre_el2", ICC_SRE_EL2}, - {"icc_sre_el3", ICC_SRE_EL3}, - {"icc_igrpen0_el1", ICC_IGRPEN0_EL1}, - {"icc_igrpen1_el1", ICC_IGRPEN1_EL1}, - {"icc_igrpen1_el3", ICC_IGRPEN1_EL3}, - {"icc_seien_el1", ICC_SEIEN_EL1}, - {"icc_ap0r0_el1", ICC_AP0R0_EL1}, - {"icc_ap0r1_el1", ICC_AP0R1_EL1}, - {"icc_ap0r2_el1", ICC_AP0R2_EL1}, - {"icc_ap0r3_el1", ICC_AP0R3_EL1}, - {"icc_ap1r0_el1", ICC_AP1R0_EL1}, - {"icc_ap1r1_el1", ICC_AP1R1_EL1}, - {"icc_ap1r2_el1", ICC_AP1R2_EL1}, - {"icc_ap1r3_el1", ICC_AP1R3_EL1}, - {"ich_ap0r0_el2", ICH_AP0R0_EL2}, - {"ich_ap0r1_el2", ICH_AP0R1_EL2}, - {"ich_ap0r2_el2", ICH_AP0R2_EL2}, - {"ich_ap0r3_el2", ICH_AP0R3_EL2}, - {"ich_ap1r0_el2", ICH_AP1R0_EL2}, - {"ich_ap1r1_el2", ICH_AP1R1_EL2}, - {"ich_ap1r2_el2", ICH_AP1R2_EL2}, - {"ich_ap1r3_el2", ICH_AP1R3_EL2}, - {"ich_hcr_el2", ICH_HCR_EL2}, - {"ich_misr_el2", ICH_MISR_EL2}, - {"ich_vmcr_el2", ICH_VMCR_EL2}, - {"ich_vseir_el2", ICH_VSEIR_EL2}, - {"ich_lr0_el2", ICH_LR0_EL2}, - {"ich_lr1_el2", ICH_LR1_EL2}, - {"ich_lr2_el2", ICH_LR2_EL2}, - {"ich_lr3_el2", ICH_LR3_EL2}, - {"ich_lr4_el2", ICH_LR4_EL2}, - {"ich_lr5_el2", ICH_LR5_EL2}, - {"ich_lr6_el2", ICH_LR6_EL2}, - {"ich_lr7_el2", ICH_LR7_EL2}, - {"ich_lr8_el2", ICH_LR8_EL2}, - {"ich_lr9_el2", ICH_LR9_EL2}, - {"ich_lr10_el2", ICH_LR10_EL2}, - {"ich_lr11_el2", ICH_LR11_EL2}, - {"ich_lr12_el2", ICH_LR12_EL2}, - {"ich_lr13_el2", ICH_LR13_EL2}, - {"ich_lr14_el2", ICH_LR14_EL2}, - {"ich_lr15_el2", ICH_LR15_EL2} -}; - -const ARM64NamedImmMapper::Mapping -ARM64SysReg::SysRegMapper::CycloneSysRegPairs[] = { - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} -}; - -uint32_t -ARM64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { - std::string NameLower = Name.lower(); - - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Name == NameLower) { - Valid = true; - return SysRegPairs[i].Value; - } - } - - // Next search for target specific registers - if (FeatureBits & ARM64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Name == NameLower) { - Valid = true; - return CycloneSysRegPairs[i].Value; - } - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). 
- for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Name == NameLower) { - Valid = true; - return InstPairs[i].Value; - } - } - - // Try to parse an S____ register name, where the bits - // are: 11 xxx 1x11 xxxx xxx - Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$"); - - SmallVector Ops; - if (!GenericRegPattern.match(NameLower, &Ops)) { - Valid = false; - return -1; - } - - uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; - uint32_t Bits; - Ops[1].getAsInteger(10, Op1); - Ops[2].getAsInteger(10, CRn); - Ops[3].getAsInteger(10, CRm); - Ops[4].getAsInteger(10, Op2); - Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; - - Valid = true; - return Bits; -} - -std::string -ARM64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Value == Bits) { - Valid = true; - return SysRegPairs[i].Name; - } - } - - // Next search for target specific registers - if (FeatureBits & ARM64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Value == Bits) { - Valid = true; - return CycloneSysRegPairs[i].Name; - } - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). - for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Value == Bits) { - Valid = true; - return InstPairs[i].Name; - } - } - - uint32_t Op0 = (Bits >> 14) & 0x3; - uint32_t Op1 = (Bits >> 11) & 0x7; - uint32_t CRn = (Bits >> 7) & 0xf; - uint32_t CRm = (Bits >> 3) & 0xf; - uint32_t Op2 = Bits & 0x7; - - // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic - // name. - if (Op0 != 3 || (CRn != 11 && CRn != 15)) { - Valid = false; - return ""; - } - - assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg"); - - Valid = true; - return "s3_" + utostr(Op1) + "_c" + utostr(CRn) - + "_c" + utostr(CRm) + "_" + utostr(Op2); -} - -const ARM64NamedImmMapper::Mapping ARM64TLBI::TLBIMapper::TLBIPairs[] = { - {"ipas2e1is", IPAS2E1IS}, - {"ipas2le1is", IPAS2LE1IS}, - {"vmalle1is", VMALLE1IS}, - {"alle2is", ALLE2IS}, - {"alle3is", ALLE3IS}, - {"vae1is", VAE1IS}, - {"vae2is", VAE2IS}, - {"vae3is", VAE3IS}, - {"aside1is", ASIDE1IS}, - {"vaae1is", VAAE1IS}, - {"alle1is", ALLE1IS}, - {"vale1is", VALE1IS}, - {"vale2is", VALE2IS}, - {"vale3is", VALE3IS}, - {"vmalls12e1is", VMALLS12E1IS}, - {"vaale1is", VAALE1IS}, - {"ipas2e1", IPAS2E1}, - {"ipas2le1", IPAS2LE1}, - {"vmalle1", VMALLE1}, - {"alle2", ALLE2}, - {"alle3", ALLE3}, - {"vae1", VAE1}, - {"vae2", VAE2}, - {"vae3", VAE3}, - {"aside1", ASIDE1}, - {"vaae1", VAAE1}, - {"alle1", ALLE1}, - {"vale1", VALE1}, - {"vale2", VALE2}, - {"vale3", VALE3}, - {"vmalls12e1", VMALLS12E1}, - {"vaale1", VAALE1} -}; - -ARM64TLBI::TLBIMapper::TLBIMapper() - : ARM64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/ARM64/Utils/ARM64BaseInfo.h b/lib/Target/ARM64/Utils/ARM64BaseInfo.h deleted file mode 100644 index 8075d6b37c9a..000000000000 --- a/lib/Target/ARM64/Utils/ARM64BaseInfo.h +++ /dev/null @@ -1,1294 +0,0 @@ -//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
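For reference, the generic register handling just removed packs the five MSR/MRS operand fields into one 16-bit value, (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2, and toString reverses the split. A minimal standalone sketch of that packing, with hypothetical helper names rather than the deleted API:

#include <cassert>
#include <cstdint>

// Pack the s<op0>_<op1>_c<n>_c<m>_<op2> fields the way the deleted
// ARM64SysReg mapper did: Op0 in bits 15:14, Op1 in 13:11, CRn in 10:7,
// CRm in 6:3, Op2 in 2:0.
static uint32_t packSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                           uint32_t CRm, uint32_t Op2) {
  return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
}

static void unpackSysReg(uint32_t Bits, uint32_t &Op0, uint32_t &Op1,
                         uint32_t &CRn, uint32_t &CRm, uint32_t &Op2) {
  Op0 = (Bits >> 14) & 0x3;
  Op1 = (Bits >> 11) & 0x7;
  CRn = (Bits >> 7) & 0xf;
  CRm = (Bits >> 3) & 0xf;
  Op2 = Bits & 0x7;
}

int main() {
  // s3_0_c11_c5_2 should round-trip through the encoding.
  uint32_t Bits = packSysReg(3, 0, 11, 5, 2);
  uint32_t Op0, Op1, CRn, CRm, Op2;
  unpackSysReg(Bits, Op0, Op1, CRn, CRm, Op2);
  assert(Op0 == 3 && Op1 == 0 && CRn == 11 && CRm == 5 && Op2 == 2);
  return 0;
}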
-// -//===----------------------------------------------------------------------===// -// -// This file contains small standalone helper functions and enum definitions for -// the ARM64 target useful for the compiler back-end and the MC libraries. -// As such, it deliberately does not include references to LLVM core -// code gen types, passes, etc.. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64BASEINFO_H -#define ARM64BASEINFO_H - -// FIXME: Is it easiest to fix this layering violation by moving the .inc -// #includes from ARM64MCTargetDesc.h to here? -#include "MCTargetDesc/ARM64MCTargetDesc.h" // For ARM64::X0 and friends. -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -inline static unsigned getWRegFromXReg(unsigned Reg) { - switch (Reg) { - case ARM64::X0: return ARM64::W0; - case ARM64::X1: return ARM64::W1; - case ARM64::X2: return ARM64::W2; - case ARM64::X3: return ARM64::W3; - case ARM64::X4: return ARM64::W4; - case ARM64::X5: return ARM64::W5; - case ARM64::X6: return ARM64::W6; - case ARM64::X7: return ARM64::W7; - case ARM64::X8: return ARM64::W8; - case ARM64::X9: return ARM64::W9; - case ARM64::X10: return ARM64::W10; - case ARM64::X11: return ARM64::W11; - case ARM64::X12: return ARM64::W12; - case ARM64::X13: return ARM64::W13; - case ARM64::X14: return ARM64::W14; - case ARM64::X15: return ARM64::W15; - case ARM64::X16: return ARM64::W16; - case ARM64::X17: return ARM64::W17; - case ARM64::X18: return ARM64::W18; - case ARM64::X19: return ARM64::W19; - case ARM64::X20: return ARM64::W20; - case ARM64::X21: return ARM64::W21; - case ARM64::X22: return ARM64::W22; - case ARM64::X23: return ARM64::W23; - case ARM64::X24: return ARM64::W24; - case ARM64::X25: return ARM64::W25; - case ARM64::X26: return ARM64::W26; - case ARM64::X27: return ARM64::W27; - case ARM64::X28: return ARM64::W28; - case ARM64::FP: return ARM64::W29; - case ARM64::LR: return ARM64::W30; - case ARM64::SP: return ARM64::WSP; - case ARM64::XZR: return ARM64::WZR; - } - // For anything else, return it unchanged. - return Reg; -} - -inline static unsigned getXRegFromWReg(unsigned Reg) { - switch (Reg) { - case ARM64::W0: return ARM64::X0; - case ARM64::W1: return ARM64::X1; - case ARM64::W2: return ARM64::X2; - case ARM64::W3: return ARM64::X3; - case ARM64::W4: return ARM64::X4; - case ARM64::W5: return ARM64::X5; - case ARM64::W6: return ARM64::X6; - case ARM64::W7: return ARM64::X7; - case ARM64::W8: return ARM64::X8; - case ARM64::W9: return ARM64::X9; - case ARM64::W10: return ARM64::X10; - case ARM64::W11: return ARM64::X11; - case ARM64::W12: return ARM64::X12; - case ARM64::W13: return ARM64::X13; - case ARM64::W14: return ARM64::X14; - case ARM64::W15: return ARM64::X15; - case ARM64::W16: return ARM64::X16; - case ARM64::W17: return ARM64::X17; - case ARM64::W18: return ARM64::X18; - case ARM64::W19: return ARM64::X19; - case ARM64::W20: return ARM64::X20; - case ARM64::W21: return ARM64::X21; - case ARM64::W22: return ARM64::X22; - case ARM64::W23: return ARM64::X23; - case ARM64::W24: return ARM64::X24; - case ARM64::W25: return ARM64::X25; - case ARM64::W26: return ARM64::X26; - case ARM64::W27: return ARM64::X27; - case ARM64::W28: return ARM64::X28; - case ARM64::W29: return ARM64::FP; - case ARM64::W30: return ARM64::LR; - case ARM64::WSP: return ARM64::SP; - case ARM64::WZR: return ARM64::XZR; - } - // For anything else, return it unchanged. 
- return Reg; -} - -static inline unsigned getBRegFromDReg(unsigned Reg) { - switch (Reg) { - case ARM64::D0: return ARM64::B0; - case ARM64::D1: return ARM64::B1; - case ARM64::D2: return ARM64::B2; - case ARM64::D3: return ARM64::B3; - case ARM64::D4: return ARM64::B4; - case ARM64::D5: return ARM64::B5; - case ARM64::D6: return ARM64::B6; - case ARM64::D7: return ARM64::B7; - case ARM64::D8: return ARM64::B8; - case ARM64::D9: return ARM64::B9; - case ARM64::D10: return ARM64::B10; - case ARM64::D11: return ARM64::B11; - case ARM64::D12: return ARM64::B12; - case ARM64::D13: return ARM64::B13; - case ARM64::D14: return ARM64::B14; - case ARM64::D15: return ARM64::B15; - case ARM64::D16: return ARM64::B16; - case ARM64::D17: return ARM64::B17; - case ARM64::D18: return ARM64::B18; - case ARM64::D19: return ARM64::B19; - case ARM64::D20: return ARM64::B20; - case ARM64::D21: return ARM64::B21; - case ARM64::D22: return ARM64::B22; - case ARM64::D23: return ARM64::B23; - case ARM64::D24: return ARM64::B24; - case ARM64::D25: return ARM64::B25; - case ARM64::D26: return ARM64::B26; - case ARM64::D27: return ARM64::B27; - case ARM64::D28: return ARM64::B28; - case ARM64::D29: return ARM64::B29; - case ARM64::D30: return ARM64::B30; - case ARM64::D31: return ARM64::B31; - } - // For anything else, return it unchanged. - return Reg; -} - - -static inline unsigned getDRegFromBReg(unsigned Reg) { - switch (Reg) { - case ARM64::B0: return ARM64::D0; - case ARM64::B1: return ARM64::D1; - case ARM64::B2: return ARM64::D2; - case ARM64::B3: return ARM64::D3; - case ARM64::B4: return ARM64::D4; - case ARM64::B5: return ARM64::D5; - case ARM64::B6: return ARM64::D6; - case ARM64::B7: return ARM64::D7; - case ARM64::B8: return ARM64::D8; - case ARM64::B9: return ARM64::D9; - case ARM64::B10: return ARM64::D10; - case ARM64::B11: return ARM64::D11; - case ARM64::B12: return ARM64::D12; - case ARM64::B13: return ARM64::D13; - case ARM64::B14: return ARM64::D14; - case ARM64::B15: return ARM64::D15; - case ARM64::B16: return ARM64::D16; - case ARM64::B17: return ARM64::D17; - case ARM64::B18: return ARM64::D18; - case ARM64::B19: return ARM64::D19; - case ARM64::B20: return ARM64::D20; - case ARM64::B21: return ARM64::D21; - case ARM64::B22: return ARM64::D22; - case ARM64::B23: return ARM64::D23; - case ARM64::B24: return ARM64::D24; - case ARM64::B25: return ARM64::D25; - case ARM64::B26: return ARM64::D26; - case ARM64::B27: return ARM64::D27; - case ARM64::B28: return ARM64::D28; - case ARM64::B29: return ARM64::D29; - case ARM64::B30: return ARM64::D30; - case ARM64::B31: return ARM64::D31; - } - // For anything else, return it unchanged. - return Reg; -} - -namespace ARM64CC { - -// The CondCodes constants map directly to the 4-bit encoding of the condition -// field for predicated instructions. 
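The four helpers above encode the architectural register aliasing: Wn is the 32-bit view of Xn (with FP, LR, SP and XZR pairing with W29, W30, WSP and WZR), and Bn and Dn name the 8-bit and 64-bit views of the same SIMD register. A name-level sketch of the X-to-W direction, independent of the deleted register enums and purely illustrative:

#include <cassert>
#include <string>

// Mirror the special cases in getWRegFromXReg at the textual level:
// fp = x29, lr = x30, sp pairs with wsp, xzr with wzr.
static std::string wRegFromXReg(const std::string &X) {
  if (X == "fp")  return "w29";
  if (X == "lr")  return "w30";
  if (X == "sp")  return "wsp";
  if (X == "xzr") return "wzr";
  if (!X.empty() && X[0] == 'x')
    return "w" + X.substr(1);   // x0..x28 -> w0..w28
  return X;                     // anything else is returned unchanged, as above
}

int main() {
  assert(wRegFromXReg("x7") == "w7");
  assert(wRegFromXReg("fp") == "w29");
  assert(wRegFromXReg("d5") == "d5"); // not an X register
  return 0;
}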
-enum CondCode { // Meaning (integer) Meaning (floating-point) - EQ = 0x0, // Equal Equal - NE = 0x1, // Not equal Not equal, or unordered - HS = 0x2, // Unsigned higher or same >, ==, or unordered - LO = 0x3, // Unsigned lower Less than - MI = 0x4, // Minus, negative Less than - PL = 0x5, // Plus, positive or zero >, ==, or unordered - VS = 0x6, // Overflow Unordered - VC = 0x7, // No overflow Not unordered - HI = 0x8, // Unsigned higher Greater than, or unordered - LS = 0x9, // Unsigned lower or same Less than or equal - GE = 0xa, // Greater than or equal Greater than or equal - LT = 0xb, // Less than Less than, or unordered - GT = 0xc, // Greater than Greater than - LE = 0xd, // Less than or equal <, ==, or unordered - AL = 0xe, // Always (unconditional) Always (unconditional) - NV = 0xf, // Always (unconditional) Always (unconditional) - // Note the NV exists purely to disassemble 0b1111. Execution is "always". - Invalid -}; - -inline static const char *getCondCodeName(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return "eq"; - case NE: return "ne"; - case HS: return "hs"; - case LO: return "lo"; - case MI: return "mi"; - case PL: return "pl"; - case VS: return "vs"; - case VC: return "vc"; - case HI: return "hi"; - case LS: return "ls"; - case GE: return "ge"; - case LT: return "lt"; - case GT: return "gt"; - case LE: return "le"; - case AL: return "al"; - case NV: return "nv"; - } -} - -inline static CondCode getInvertedCondCode(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return NE; - case NE: return EQ; - case HS: return LO; - case LO: return HS; - case MI: return PL; - case PL: return MI; - case VS: return VC; - case VC: return VS; - case HI: return LS; - case LS: return HI; - case GE: return LT; - case LT: return GE; - case GT: return LE; - case LE: return GT; - } -} - -/// Given a condition code, return NZCV flags that would satisfy that condition. -/// The flag bits are in the format expected by the ccmp instructions. -/// Note that many different flag settings can satisfy a given condition code, -/// this function just returns one of them. -inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { - // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. - enum { N = 8, Z = 4, C = 2, V = 1 }; - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return Z; // Z == 1 - case NE: return 0; // Z == 0 - case HS: return C; // C == 1 - case LO: return 0; // C == 0 - case MI: return N; // N == 1 - case PL: return 0; // N == 0 - case VS: return V; // V == 1 - case VC: return 0; // V == 0 - case HI: return C; // C == 1 && Z == 0 - case LS: return 0; // C == 0 || Z == 1 - case GE: return 0; // N == V - case LT: return N; // N != V - case GT: return 0; // Z == 0 && N == V - case LE: return Z; // Z == 1 || N != V - } -} -} // end namespace ARM64CC - -/// Instances of this class can perform bidirectional mapping from random -/// identifier strings to operand encodings. For example "MSR" takes a named -/// system-register which must be encoded somehow and decoded for printing. This -/// central location means that the information for those transformations is not -/// duplicated and remains in sync. -/// -/// FIXME: currently the algorithm is a completely unoptimised linear -/// search. Obviously this could be improved, but we would probably want to work -/// out just how often these instructions are emitted before working on it. 
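getNZCVToSatisfyCondCode above returns one flag setting (of possibly many) that makes a condition hold, using the ccmp nibble layout N=8, Z=4, C=2, V=1. As a cross-check, here is a small sketch that evaluates the standard AArch64 integer condition semantics against such a nibble; the names are illustrative, not taken from the deleted header:

#include <cassert>
#include <cstdint>
#include <cstring>

// ccmp-style NZCV nibble, as in the deleted helper: N=8, Z=4, C=2, V=1.
enum { N = 8, Z = 4, C = 2, V = 1 };

// Evaluate an integer condition code against a flag nibble, following the
// architectural definitions (for example GE is N == V, HI is C && !Z).
static bool holds(const char *CC, uint32_t F) {
  bool n = F & N, z = F & Z, c = F & C, v = F & V;
  if (!strcmp(CC, "eq")) return z;
  if (!strcmp(CC, "ne")) return !z;
  if (!strcmp(CC, "hs")) return c;
  if (!strcmp(CC, "lo")) return !c;
  if (!strcmp(CC, "mi")) return n;
  if (!strcmp(CC, "pl")) return !n;
  if (!strcmp(CC, "vs")) return v;
  if (!strcmp(CC, "vc")) return !v;
  if (!strcmp(CC, "hi")) return c && !z;
  if (!strcmp(CC, "ls")) return !c || z;
  if (!strcmp(CC, "ge")) return n == v;
  if (!strcmp(CC, "lt")) return n != v;
  if (!strcmp(CC, "gt")) return !z && n == v;
  if (!strcmp(CC, "le")) return z || n != v;
  return true; // al / nv are always true
}

int main() {
  // The deleted code returns 0 for GT, which satisfies "Z == 0 && N == V",
  // and Z for LE, which satisfies "Z == 1 || N != V".
  assert(holds("gt", 0));
  assert(holds("le", Z));
  return 0;
}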
It -/// might even be optimal to just reorder the tables for the common instructions -/// rather than changing the algorithm. -struct ARM64NamedImmMapper { - struct Mapping { - const char *Name; - uint32_t Value; - }; - - template - ARM64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) - : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} - - StringRef toString(uint32_t Value, bool &Valid) const; - uint32_t fromString(StringRef Name, bool &Valid) const; - - /// Many of the instructions allow an alternative assembly form consisting of - /// a simple immediate. Currently the only valid forms are ranges [0, N) where - /// N being 0 indicates no immediate syntax-form is allowed. - bool validImm(uint32_t Value) const; -protected: - const Mapping *Pairs; - size_t NumPairs; - uint32_t TooBigImm; -}; - -namespace ARM64AT { - enum ATValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - S1E1R = 0x43c0, // 01 000 0111 1000 000 - S1E2R = 0x63c0, // 01 100 0111 1000 000 - S1E3R = 0x73c0, // 01 110 0111 1000 000 - S1E1W = 0x43c1, // 01 000 0111 1000 001 - S1E2W = 0x63c1, // 01 100 0111 1000 001 - S1E3W = 0x73c1, // 01 110 0111 1000 001 - S1E0R = 0x43c2, // 01 000 0111 1000 010 - S1E0W = 0x43c3, // 01 000 0111 1000 011 - S12E1R = 0x63c4, // 01 100 0111 1000 100 - S12E1W = 0x63c5, // 01 100 0111 1000 101 - S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 - }; - - struct ATMapper : ARM64NamedImmMapper { - const static Mapping ATPairs[]; - - ATMapper(); - }; - -} -namespace ARM64DB { - enum DBValues { - Invalid = -1, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf - }; - - struct DBarrierMapper : ARM64NamedImmMapper { - const static Mapping DBarrierPairs[]; - - DBarrierMapper(); - }; -} - -namespace ARM64DC { - enum DCValues { - Invalid = -1, // Op1 CRn CRm Op2 - ZVA = 0x5ba1, // 01 011 0111 0100 001 - IVAC = 0x43b1, // 01 000 0111 0110 001 - ISW = 0x43b2, // 01 000 0111 0110 010 - CVAC = 0x5bd1, // 01 011 0111 1010 001 - CSW = 0x43d2, // 01 000 0111 1010 010 - CVAU = 0x5bd9, // 01 011 0111 1011 001 - CIVAC = 0x5bf1, // 01 011 0111 1110 001 - CISW = 0x43f2 // 01 000 0111 1110 010 - }; - - struct DCMapper : ARM64NamedImmMapper { - const static Mapping DCPairs[]; - - DCMapper(); - }; - -} - -namespace ARM64IC { - enum ICValues { - Invalid = -1, // Op1 CRn CRm Op2 - IALLUIS = 0x0388, // 000 0111 0001 000 - IALLU = 0x03a8, // 000 0111 0101 000 - IVAU = 0x1ba9 // 011 0111 0101 001 - }; - - - struct ICMapper : ARM64NamedImmMapper { - const static Mapping ICPairs[]; - - ICMapper(); - }; - - static inline bool NeedsRegister(ICValues Val) { - return Val == IVAU; - } -} - -namespace ARM64ISB { - enum ISBValues { - Invalid = -1, - SY = 0xf - }; - struct ISBMapper : ARM64NamedImmMapper { - const static Mapping ISBPairs[]; - - ISBMapper(); - }; -} - -namespace ARM64PRFM { - enum PRFMValues { - Invalid = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PLIL1KEEP = 0x08, - PLIL1STRM = 0x09, - PLIL2KEEP = 0x0a, - PLIL2STRM = 0x0b, - PLIL3KEEP = 0x0c, - PLIL3STRM = 0x0d, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 - }; - - struct PRFMMapper : ARM64NamedImmMapper { - const static Mapping PRFMPairs[]; - - PRFMMapper(); - }; -} - -namespace ARM64PState { - enum PStateValues { - Invalid = -1, - SPSel = 0x05, - DAIFSet 
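ARM64NamedImmMapper, declared above, is a pair array searched linearly in both directions, as the FIXME notes. A self-contained sketch of the same idea; the helper names and the tiny table are illustrative, with the two values taken from the DBValues list above:

#include <cstdint>
#include <cstdio>
#include <cstring>

struct Mapping { const char *Name; uint32_t Value; };

// Linear name -> value search, the shape of NamedImmMapper::fromString.
static bool fromString(const Mapping *Pairs, size_t N, const char *Name,
                       uint32_t &Value) {
  for (size_t i = 0; i != N; ++i)
    if (!strcmp(Pairs[i].Name, Name)) { Value = Pairs[i].Value; return true; }
  return false;
}

// Linear value -> name search, the shape of NamedImmMapper::toString.
static const char *toString(const Mapping *Pairs, size_t N, uint32_t Value) {
  for (size_t i = 0; i != N; ++i)
    if (Pairs[i].Value == Value) return Pairs[i].Name;
  return nullptr;
}

int main() {
  static const Mapping DBarrierPairs[] = {{"ish", 0xb}, {"sy", 0xf}};
  uint32_t V;
  if (fromString(DBarrierPairs, 2, "ish", V))
    printf("ish -> %#x -> %s\n", V, toString(DBarrierPairs, 2, V));
  return 0;
}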
= 0x1e, - DAIFClr = 0x1f - }; - - struct PStateMapper : ARM64NamedImmMapper { - const static Mapping PStatePairs[]; - - PStateMapper(); - }; - -} - -namespace ARM64SE { - enum ShiftExtSpecifiers { - Invalid = -1, - LSL, - MSL, - LSR, - ASR, - ROR, - - UXTB, - UXTH, - UXTW, - UXTX, - - SXTB, - SXTH, - SXTW, - SXTX - }; -} - -namespace ARM64Layout { - enum VectorLayout { - Invalid = -1, - VL_8B, - VL_4H, - VL_2S, - VL_1D, - - VL_16B, - VL_8H, - VL_4S, - VL_2D, - - // Bare layout for the 128-bit vector - // (only show ".b", ".h", ".s", ".d" without vector number) - VL_B, - VL_H, - VL_S, - VL_D - }; -} - -inline static const char * -ARM64VectorLayoutToString(ARM64Layout::VectorLayout Layout) { - switch (Layout) { - case ARM64Layout::VL_8B: return ".8b"; - case ARM64Layout::VL_4H: return ".4h"; - case ARM64Layout::VL_2S: return ".2s"; - case ARM64Layout::VL_1D: return ".1d"; - case ARM64Layout::VL_16B: return ".16b"; - case ARM64Layout::VL_8H: return ".8h"; - case ARM64Layout::VL_4S: return ".4s"; - case ARM64Layout::VL_2D: return ".2d"; - case ARM64Layout::VL_B: return ".b"; - case ARM64Layout::VL_H: return ".h"; - case ARM64Layout::VL_S: return ".s"; - case ARM64Layout::VL_D: return ".d"; - default: llvm_unreachable("Unknown Vector Layout"); - } -} - -inline static ARM64Layout::VectorLayout -ARM64StringToVectorLayout(StringRef LayoutStr) { - return StringSwitch(LayoutStr) - .Case(".8b", ARM64Layout::VL_8B) - .Case(".4h", ARM64Layout::VL_4H) - .Case(".2s", ARM64Layout::VL_2S) - .Case(".1d", ARM64Layout::VL_1D) - .Case(".16b", ARM64Layout::VL_16B) - .Case(".8h", ARM64Layout::VL_8H) - .Case(".4s", ARM64Layout::VL_4S) - .Case(".2d", ARM64Layout::VL_2D) - .Case(".b", ARM64Layout::VL_B) - .Case(".h", ARM64Layout::VL_H) - .Case(".s", ARM64Layout::VL_S) - .Case(".d", ARM64Layout::VL_D) - .Default(ARM64Layout::Invalid); -} - -namespace ARM64SysReg { - enum SysRegROValues { - MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 - DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 - MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 - OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 - DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 - PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 - PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 - MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 - CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 - CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 - CTR_EL0 = 0xd801, // 11 011 0000 0000 001 - MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 - REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 - AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 - DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 - ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 - ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 - ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 - ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 - ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 - ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 - ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 - ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 - ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 - ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 - ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 - ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 - ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 - ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_AARM64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_AARM64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_AARM64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_AARM64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_AARM64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - 
ID_AARM64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_AARM64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_AARM64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_AARM64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_AARM64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 - MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 - MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 - MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 - RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 - RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 - RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 - ISR_EL1 = 0xc608, // 11 000 1100 0001 000 - CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 - CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 - - // Trace registers - TRCSTATR = 0x8818, // 10 001 0000 0011 000 - TRCIDR8 = 0x8806, // 10 001 0000 0000 110 - TRCIDR9 = 0x880e, // 10 001 0000 0001 110 - TRCIDR10 = 0x8816, // 10 001 0000 0010 110 - TRCIDR11 = 0x881e, // 10 001 0000 0011 110 - TRCIDR12 = 0x8826, // 10 001 0000 0100 110 - TRCIDR13 = 0x882e, // 10 001 0000 0101 110 - TRCIDR0 = 0x8847, // 10 001 0000 1000 111 - TRCIDR1 = 0x884f, // 10 001 0000 1001 111 - TRCIDR2 = 0x8857, // 10 001 0000 1010 111 - TRCIDR3 = 0x885f, // 10 001 0000 1011 111 - TRCIDR4 = 0x8867, // 10 001 0000 1100 111 - TRCIDR5 = 0x886f, // 10 001 0000 1101 111 - TRCIDR6 = 0x8877, // 10 001 0000 1110 111 - TRCIDR7 = 0x887f, // 10 001 0000 1111 111 - TRCOSLSR = 0x888c, // 10 001 0001 0001 100 - TRCPDSR = 0x88ac, // 10 001 0001 0101 100 - TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 - TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 - TRCLSR = 0x8bee, // 10 001 0111 1101 110 - TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 - TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 - TRCDEVID = 0x8b97, // 10 001 0111 0010 111 - TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 - TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 - TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 - TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 - TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 - TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 - TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 - TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 - TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 - TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 - TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 - TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 - TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 - - // GICv3 registers - ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 - ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 - ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 - ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 - ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 - ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 - ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 - ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 - }; - - enum SysRegWOValues { - DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 - OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 - PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 - - // Trace Registers - TRCOSLAR = 0x8884, // 10 001 0001 0000 100 - TRCLAR = 0x8be6, // 10 001 0111 1100 110 - - // GICv3 registers - ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 - ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 - ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 - ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 - ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 - ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 - }; - - enum SysRegValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 - OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 - TEECR32_EL1 = 0x9000, // 10 010 0000 
0000 000 - MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 - MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 - DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 - OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 - DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 - DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 - DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 - DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 - DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 - DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 - DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 - DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 - DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 - DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 - DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 - DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 - DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 - DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 - DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 - DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 - DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 - DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 - DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 - DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 - DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 - DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 - DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 - DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 - DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 - DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 - DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 - DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 - DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 - DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 - DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 - DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 - DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 - DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 - DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 - DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 - DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 - DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 - DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 - DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 - DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 - DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 - DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 - DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 - DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 - DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 - DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 - DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 - DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 - DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 - DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 - DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 - DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 - DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 - DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 - DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 - DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 - DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 - DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 - DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 - DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 - DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 - DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 - DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 - DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 - TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 - OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 - DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 - DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 - DBGCLAIMCLR_EL1 
= 0x83ce, // 10 000 0111 1001 110 - CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 - VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 - VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 - CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 - SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 - SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 - SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 - ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 - ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 - ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 - HCR_EL2 = 0xe088, // 11 100 0001 0001 000 - SCR_EL3 = 0xf088, // 11 110 0001 0001 000 - MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 - SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 - CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 - CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 - HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 - HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 - MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 - TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 - TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 - TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 - TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 - TCR_EL1 = 0xc102, // 11 000 0010 0000 010 - TCR_EL2 = 0xe102, // 11 100 0010 0000 010 - TCR_EL3 = 0xf102, // 11 110 0010 0000 010 - VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 - VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 - DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 - SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 - SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 - SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 - ELR_EL1 = 0xc201, // 11 000 0100 0000 001 - ELR_EL2 = 0xe201, // 11 100 0100 0000 001 - ELR_EL3 = 0xf201, // 11 110 0100 0000 001 - SP_EL0 = 0xc208, // 11 000 0100 0001 000 - SP_EL1 = 0xe208, // 11 100 0100 0001 000 - SP_EL2 = 0xf208, // 11 110 0100 0001 000 - SPSel = 0xc210, // 11 000 0100 0010 000 - NZCV = 0xda10, // 11 011 0100 0010 000 - DAIF = 0xda11, // 11 011 0100 0010 001 - CurrentEL = 0xc212, // 11 000 0100 0010 010 - SPSR_irq = 0xe218, // 11 100 0100 0011 000 - SPSR_abt = 0xe219, // 11 100 0100 0011 001 - SPSR_und = 0xe21a, // 11 100 0100 0011 010 - SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 - FPCR = 0xda20, // 11 011 0100 0100 000 - FPSR = 0xda21, // 11 011 0100 0100 001 - DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 - DLR_EL0 = 0xda29, // 11 011 0100 0101 001 - IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 - AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 - AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 - AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 - AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 - AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 - AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 - ESR_EL1 = 0xc290, // 11 000 0101 0010 000 - ESR_EL2 = 0xe290, // 11 100 0101 0010 000 - ESR_EL3 = 0xf290, // 11 110 0101 0010 000 - FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 - FAR_EL1 = 0xc300, // 11 000 0110 0000 000 - FAR_EL2 = 0xe300, // 11 100 0110 0000 000 - FAR_EL3 = 0xf300, // 11 110 0110 0000 000 - HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 - PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 - PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 - PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 - PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 - PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 - PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 - PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 - PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 - PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 - PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 - PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 - PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 - 
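Each constant in these tables is just the (Op0, Op1, CRn, CRm, Op2) tuple from its trailing comment packed into 16 bits. A quick compile-time cross-check of two entries from the list above; the enc helper is a stand-in, not part of the deleted code:

#include <cstdint>

constexpr uint32_t enc(uint32_t Op0, uint32_t Op1, uint32_t CRn, uint32_t CRm,
                       uint32_t Op2) {
  return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
}

// SPSel's comment reads "11 000 0100 0010 000", NZCV's "11 011 0100 0010 000".
static_assert(enc(3, 0, 0x4, 0x2, 0) == 0xc210, "SPSel encoding");
static_assert(enc(3, 3, 0x4, 0x2, 0) == 0xda10, "NZCV encoding");

int main() { return 0; }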
PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 - MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 - MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 - MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 - AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 - AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 - AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 - VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 - VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 - VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 - RMR_EL1 = 0xc602, // 11 000 1100 0000 010 - RMR_EL2 = 0xe602, // 11 100 1100 0000 010 - RMR_EL3 = 0xf602, // 11 110 1100 0000 010 - CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 - TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 - TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 - TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 - TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 - TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 - CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 - CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 - CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 - CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 - CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 - CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 - CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 - CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 - CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 - CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 - CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 - CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 - CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 - CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 - CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 - CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 - PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 - PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 - PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 - PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 - PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 - PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 - PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 - PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 - PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 - PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 - PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 - PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 - PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 - PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 - PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 - PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 - PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 - PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 - PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 - PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 - PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 - PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 - PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 - PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 - PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 - PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 - PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 - PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 - PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 - PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 - PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 - PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 - PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 - PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 - PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 - PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 - PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 - 
PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 - PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 - PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 - PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 - PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 - PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 - PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 - PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 - PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 - PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 - PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 - PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 - PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 - PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 - PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 - PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 - PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 - PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 - PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 - PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 - PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 - PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 - PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 - PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 - PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 - PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 - - // Trace registers - TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 - TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 - TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 - TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 - TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 - TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 - TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 - TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 - TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 - TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 - TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 - TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 - TRCQCTLR = 0x8809, // 10 001 0000 0001 001 - TRCVICTLR = 0x8802, // 10 001 0000 0000 010 - TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 - TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 - TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 - TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 - TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 - TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 - TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 - TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 - TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 - TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 - TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 - TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 - TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 - TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 - TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 - TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 - TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 - TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 - TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 - TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 - TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 - TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 - TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 - TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 - TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 - TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 - TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 - TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 - TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 - TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 - TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 - TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 - 
TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 - TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 - TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 - TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 - TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 - TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 - TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 - TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 - TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 - TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 - TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 - TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 - TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 - TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 - TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 - TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 - TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 - TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 - TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 - TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 - TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 - TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 - TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 - TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 - TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 - TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 - TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 - TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 - TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 - TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 - TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 - TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 - TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 - TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 - TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 - TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 - TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 - TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 - TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 - TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 - TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 - TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 - TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 - TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 - TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 - TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 - TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 - TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 - TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 - TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 - TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 - TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 - TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 - TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 - TRCPDCR = 0x88a4, // 10 001 0001 0100 100 - TRCACVR0 = 0x8900, // 10 001 0010 0000 000 - TRCACVR1 = 0x8910, // 10 001 0010 0010 000 - TRCACVR2 = 0x8920, // 10 001 0010 0100 000 - TRCACVR3 = 0x8930, // 10 001 0010 0110 000 - TRCACVR4 = 0x8940, // 10 001 0010 1000 000 - TRCACVR5 = 0x8950, // 10 001 0010 1010 000 - TRCACVR6 = 0x8960, // 10 001 0010 1100 000 - TRCACVR7 = 0x8970, // 10 001 0010 1110 000 - TRCACVR8 = 0x8901, // 10 001 0010 0000 001 - TRCACVR9 = 0x8911, // 10 001 0010 0010 001 - TRCACVR10 = 0x8921, // 10 001 0010 0100 001 - TRCACVR11 = 0x8931, // 10 001 0010 0110 001 - TRCACVR12 = 0x8941, // 10 001 0010 1000 001 - TRCACVR13 = 0x8951, // 10 001 0010 1010 001 - TRCACVR14 = 0x8961, // 10 001 0010 1100 001 - TRCACVR15 = 0x8971, // 10 001 0010 1110 001 - TRCACATR0 = 0x8902, // 10 001 0010 0000 010 - TRCACATR1 = 0x8912, // 10 001 0010 0010 010 - TRCACATR2 = 0x8922, // 10 001 0010 0100 010 - TRCACATR3 = 0x8932, // 10 001 0010 0110 010 - TRCACATR4 = 0x8942, // 10 001 0010 1000 010 - 
TRCACATR5 = 0x8952, // 10 001 0010 1010 010 - TRCACATR6 = 0x8962, // 10 001 0010 1100 010 - TRCACATR7 = 0x8972, // 10 001 0010 1110 010 - TRCACATR8 = 0x8903, // 10 001 0010 0000 011 - TRCACATR9 = 0x8913, // 10 001 0010 0010 011 - TRCACATR10 = 0x8923, // 10 001 0010 0100 011 - TRCACATR11 = 0x8933, // 10 001 0010 0110 011 - TRCACATR12 = 0x8943, // 10 001 0010 1000 011 - TRCACATR13 = 0x8953, // 10 001 0010 1010 011 - TRCACATR14 = 0x8963, // 10 001 0010 1100 011 - TRCACATR15 = 0x8973, // 10 001 0010 1110 011 - TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 - TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 - TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 - TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 - TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 - TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 - TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 - TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 - TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 - TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 - TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 - TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 - TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 - TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 - TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 - TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 - TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 - TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 - TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 - TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 - TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 - TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 - TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 - TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 - TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 - TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 - TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 - TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 - TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 - TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 - TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 - TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 - TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 - TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 - TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 - TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 - TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 - TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 - TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 - - // GICv3 registers - ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 - ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 - ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 - ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 - ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 - ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 - ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 - ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 - ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 - ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 - ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 - ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 - ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 - ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 - ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 - ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 - ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 - ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 - ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 - ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 - ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 - ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 - ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 - ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 
1000 011 - ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 - ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 - ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 - ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 - ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 - ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 - ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 - ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 - ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 - ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 - ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 - ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 - ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 - ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 - ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 - ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 - ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 - ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 - ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 - ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 - ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 - ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 - ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - }; - - // Cyclone specific system registers - enum CycloneSysRegValues { - CPM_IOACC_CTL_EL3 = 0xff90 - }; - - // Note that these do not inherit from ARM64NamedImmMapper. This class is - // sufficiently different in its behaviour that I don't believe it's worth - // burdening the common ARM64NamedImmMapper with abstractions only needed in - // this one case. - struct SysRegMapper { - static const ARM64NamedImmMapper::Mapping SysRegPairs[]; - static const ARM64NamedImmMapper::Mapping CycloneSysRegPairs[]; - - const ARM64NamedImmMapper::Mapping *InstPairs; - size_t NumInstPairs; - uint64_t FeatureBits; - - SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } - uint32_t fromString(StringRef Name, bool &Valid) const; - std::string toString(uint32_t Bits, bool &Valid) const; - }; - - struct MSRMapper : SysRegMapper { - static const ARM64NamedImmMapper::Mapping MSRPairs[]; - MSRMapper(uint64_t FeatureBits); - }; - - struct MRSMapper : SysRegMapper { - static const ARM64NamedImmMapper::Mapping MRSPairs[]; - MRSMapper(uint64_t FeatureBits); - }; - - uint32_t ParseGenericRegister(StringRef Name, bool &Valid); -} - -namespace ARM64TLBI { - enum TLBIValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 - IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 - VMALLE1IS = 0x4418, // 01 000 1000 0011 000 - ALLE2IS = 0x6418, // 01 100 1000 0011 000 - ALLE3IS = 0x7418, // 01 110 1000 0011 000 - VAE1IS = 0x4419, // 01 000 1000 0011 001 - VAE2IS = 0x6419, // 01 100 1000 0011 001 - VAE3IS = 0x7419, // 01 110 1000 0011 001 - ASIDE1IS = 0x441a, // 01 000 1000 0011 010 - VAAE1IS = 0x441b, // 01 000 1000 0011 011 - ALLE1IS = 0x641c, // 01 100 1000 0011 100 - VALE1IS = 0x441d, // 01 000 1000 0011 101 - VALE2IS = 0x641d, // 01 100 1000 0011 101 - VALE3IS = 0x741d, // 01 110 1000 0011 101 - VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 - VAALE1IS = 0x441f, // 01 000 1000 0011 111 - IPAS2E1 = 0x6421, // 01 100 1000 0100 001 - IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 - VMALLE1 = 0x4438, // 01 000 1000 0111 000 - ALLE2 = 0x6438, // 01 100 1000 0111 000 - ALLE3 = 0x7438, // 01 110 1000 0111 000 - VAE1 = 0x4439, // 01 000 1000 0111 001 - VAE2 = 0x6439, // 01 100 1000 0111 001 - VAE3 = 0x7439, // 01 110 1000 0111 001 - ASIDE1 = 0x443a, // 01 000 1000 0111 010 - VAAE1 = 0x443b, // 01 000 1000 0111 011 - ALLE1 = 
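SysRegMapper, declared above, layers three tables: the shared SysRegPairs, CPU-specific pairs consulted only when the matching feature bit is set (here only Cyclone's CPM_IOACC_CTL_EL3), and finally the read-only or write-only pairs owned by the MRS/MSR mappers. A compact sketch of that layered lookup, with stand-in names and a few values taken from the tables above:

#include <cassert>
#include <cstdint>
#include <cstring>

struct Mapping { const char *Name; uint32_t Value; };

static bool lookup(const Mapping *Tbl, size_t N, const char *Name,
                   uint32_t &Out) {
  for (size_t i = 0; i != N; ++i)
    if (!strcmp(Tbl[i].Name, Name)) { Out = Tbl[i].Value; return true; }
  return false;
}

// Shared registers first, then CPU-specific ones gated on a feature flag,
// then the instruction-specific (MRS-only here) table.
static bool sysRegFromString(const char *Name, bool HasCyclone, uint32_t &Out) {
  static const Mapping Shared[]  = {{"spsel", 0xc210}, {"nzcv", 0xda10}};
  static const Mapping Cyclone[] = {{"cpm_ioacc_ctl_el3", 0xff90}};
  static const Mapping MRSOnly[] = {{"ctr_el0", 0xd801}};
  if (lookup(Shared, 2, Name, Out)) return true;
  if (HasCyclone && lookup(Cyclone, 1, Name, Out)) return true;
  return lookup(MRSOnly, 1, Name, Out);
}

int main() {
  uint32_t V;
  assert(!sysRegFromString("cpm_ioacc_ctl_el3", /*HasCyclone=*/false, V));
  assert(sysRegFromString("cpm_ioacc_ctl_el3", /*HasCyclone=*/true, V) &&
         V == 0xff90);
  return 0;
}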
0x643c, // 01 100 1000 0111 100 - VALE1 = 0x443d, // 01 000 1000 0111 101 - VALE2 = 0x643d, // 01 100 1000 0111 101 - VALE3 = 0x743d, // 01 110 1000 0111 101 - VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 - VAALE1 = 0x443f // 01 000 1000 0111 111 - }; - - struct TLBIMapper : ARM64NamedImmMapper { - const static Mapping TLBIPairs[]; - - TLBIMapper(); - }; - - static inline bool NeedsRegister(TLBIValues Val) { - switch (Val) { - case VMALLE1IS: - case ALLE2IS: - case ALLE3IS: - case ALLE1IS: - case VMALLS12E1IS: - case VMALLE1: - case ALLE2: - case ALLE3: - case ALLE1: - case VMALLS12E1: - return false; - default: - return true; - } - } -} - -namespace ARM64II { - /// Target Operand Flag enum. - enum TOF { - //===------------------------------------------------------------------===// - // ARM64 Specific MachineOperand flags. - - MO_NO_FLAG, - - MO_FRAGMENT = 0x7, - - /// MO_PAGE - A symbol operand with this flag represents the pc-relative - /// offset of the 4K page containing the symbol. This is used with the - /// ADRP instruction. - MO_PAGE = 1, - - /// MO_PAGEOFF - A symbol operand with this flag represents the offset of - /// that symbol within a 4K page. This offset is added to the page address - /// to produce the complete address. - MO_PAGEOFF = 2, - - /// MO_G3 - A symbol operand with this flag (granule 3) represents the high - /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G3 = 3, - - /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits - /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G2 = 4, - - /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits - /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G1 = 5, - - /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits - /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G0 = 6, - - /// MO_GOT - This flag indicates that a symbol operand represents the - /// address of the GOT entry for the symbol, rather than the address of - /// the symbol itself. - MO_GOT = 8, - - /// MO_NC - Indicates whether the linker is expected to check the symbol - /// reference for overflow. For example in an ADRP/ADD pair of relocations - /// the ADRP usually does check, but not the ADD. - MO_NC = 0x10, - - /// MO_TLS - Indicates that the operand being accessed is some kind of - /// thread-local symbol. On Darwin, only one type of thread-local access - /// exists (pre linker-relaxation), but on ELF the TLSModel used for the - /// referee will affect interpretation. - MO_TLS = 0x20 - }; -} // end namespace ARM64II - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/Utils/CMakeLists.txt b/lib/Target/ARM64/Utils/CMakeLists.txt deleted file mode 100644 index f69076f4ef64..000000000000 --- a/lib/Target/ARM64/Utils/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMARM64Utils - ARM64BaseInfo.cpp - ) diff --git a/lib/Target/ARM64/Utils/LLVMBuild.txt b/lib/Target/ARM64/Utils/LLVMBuild.txt deleted file mode 100644 index 232dca29f407..000000000000 --- a/lib/Target/ARM64/Utils/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. 
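The MO_PAGE/MO_PAGEOFF and MO_G0 through MO_G3 flags deleted above describe how a 64-bit address is split: ADRP materialises the 4KiB page and a following ADD/LDR supplies the low 12 bits, while MOVZ/MOVK chains rebuild the address from 16-bit granules. A small illustrative sketch of both splits (the address is arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Addr = 0x0000007fdeadbeefULL;

  // MO_PAGE / MO_PAGEOFF: page address plus a 12-bit page offset.
  uint64_t Page    = Addr & ~uint64_t(0xfff);
  uint64_t PageOff = Addr & 0xfff;
  assert(Page + PageOff == Addr);

  // MO_G0..MO_G3: the four 16-bit slices, bits 0-15 (G0) up to 48-63 (G3).
  uint64_t Rebuilt = 0;
  for (int G = 0; G != 4; ++G) {
    uint64_t Slice = (Addr >> (16 * G)) & 0xffff;
    Rebuilt |= Slice << (16 * G);
  }
  assert(Rebuilt == Addr);
  return 0;
}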
-; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Utils -parent = ARM64 -required_libraries = Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/Utils/Makefile b/lib/Target/ARM64/Utils/Makefile deleted file mode 100644 index 6491ad9a07bd..000000000000 --- a/lib/Target/ARM64/Utils/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/Utils/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Utils - -# Hack: we need to include 'main' ARM64 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 15b574dd3688..f610fbb969bc 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -1577,6 +1577,10 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out) << iName << "->setName(\""; printEscapedString(cxi->getName()); Out << "\");"; + nl(Out) << iName << "->setVolatile(" + << (cxi->isVolatile() ? "true" : "false") << ");"; + nl(Out) << iName << "->setWeak(" + << (cxi->isWeak() ? "true" : "false") << ");"; break; } case Instruction::AtomicRMW: { @@ -1607,6 +1611,8 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out) << iName << "->setName(\""; printEscapedString(rmwi->getName()); Out << "\");"; + nl(Out) << iName << "->setVolatile(" + << (rmwi->isVolatile() ? "true" : "false") << ");"; break; } case Instruction::LandingPad: { diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index b8e5d24f4f16..17a6674cf577 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -943,21 +943,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { false, 0); } -SDValue -HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue CC = Op.getOperand(4); - SDValue TrueVal = Op.getOperand(2); - SDValue FalseVal = Op.getOperand(3); - SDLoc dl(Op); - SDNode* OpNode = Op.getNode(); - EVT SVT = OpNode->getValueType(0); - - SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC); - return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal); -} - SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); @@ -1341,8 +1326,9 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setOperationAction(ISD::BSWAP, MVT::i64, Expand); // Lower SELECT_CC to SETCC and SELECT. 
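The CPPBackend hunk above makes the generated C++ also record volatility and weakness for cmpxchg (and volatility for atomicrmw). A toy illustration of the two extra lines it now prints for a volatile, non-weak cmpxchg; FakeCmpXchg is a stand-in for the real instruction object, not LLVM API:

#include <cstdio>

struct FakeCmpXchg {
  bool Volatile = true, Weak = false;
  bool isVolatile() const { return Volatile; }
  bool isWeak() const { return Weak; }
};

int main() {
  FakeCmpXchg cxi;
  const char *iName = "cxi";
  // Same ternary-to-text shape as the new printInstruction code.
  printf("%s->setVolatile(%s);\n", iName, cxi.isVolatile() ? "true" : "false");
  printf("%s->setWeak(%s);\n", iName, cxi.isWeak() ? "true" : "false");
  return 0;
}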
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); if (QRI->Subtarget.hasV5TOps()) { @@ -1354,18 +1340,12 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); } else { // Hexagon has no select or setcc: expand to SELECT_CC. setOperationAction(ISD::SELECT, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f64, Expand); - - // This is a workaround documented in DAGCombiner.cpp:2892 We don't - // support SELECT_CC on every type. - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - } if (EmitJumpTables) { @@ -1577,7 +1557,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 4f27c276564c..0ddaf846b31d 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -124,7 +124,6 @@ namespace llvm { const SmallVectorImpl &OutVals, SDValue Callee) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 9e1e0fdf3d48..f66ffd284a93 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -18,10 +18,8 @@ using namespace llvm; bool llvm::flag_aligned_memcpy; -HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine - &TM) - : TargetSelectionDAGInfo(TM) { -} +HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine &TM) + : TargetSelectionDAGInfo(TM.getDataLayout()) {} HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() { } diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index aa4121f4ed5d..247207f992dc 100644 --- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -17,9 +17,10 @@ // //===----------------------------------------------------------------------===// -#include "HexagonTargetMachine.h" #include "HexagonMachineFunctionInfo.h" #include "HexagonSubtarget.h" +#include "HexagonTargetMachine.h" +#include "HexagonTargetObjectFile.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/CodeGen/MachineDominators.h" @@ -47,13 +48,12 @@ using namespace llvm; namespace { class HexagonSplitConst32AndConst64 : public MachineFunctionPass { - const HexagonTargetMachine& QTM; - const HexagonSubtarget &QST; + const HexagonTargetMachine &QTM; public: static char ID; - HexagonSplitConst32AndConst64(const HexagonTargetMachine& TM) 
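Marking SELECT_CC as Expand rather than Custom lets the generic legalizer handle the node, typically with the same SETCC-then-SELECT rewrite the deleted LowerSELECT_CC performed by hand. In scalar terms the rewrite amounts to the following (a hypothetical stand-in, not SelectionDAG code):

#include <cassert>

// SELECT_CC(lhs, rhs, tval, fval, setlt) becomes SETCC followed by SELECT.
static int select_cc(int LHS, int RHS, int TrueVal, int FalseVal) {
  bool Cond = LHS < RHS;            // the SETCC, producing an i1
  return Cond ? TrueVal : FalseVal; // the SELECT on that i1
}

int main() {
  assert(select_cc(1, 2, 10, 20) == 10);
  assert(select_cc(3, 2, 10, 20) == 20);
  return 0;
}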
- : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {} + HexagonSplitConst32AndConst64(const HexagonTargetMachine &TM) + : MachineFunctionPass(ID), QTM(TM) {} const char *getPassName() const override { return "Hexagon Split Const32s and Const64s"; @@ -67,6 +67,12 @@ char HexagonSplitConst32AndConst64::ID = 0; bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { + const HexagonTargetObjectFile &TLOF = + (const HexagonTargetObjectFile &) + QTM.getTargetLowering()->getObjFileLowering(); + if (TLOF.IsSmallDataEnabled()) + return true; + const TargetInstrInfo *TII = QTM.getInstrInfo(); // Loop over all of the basic blocks diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 2572e11ae9e1..b9237647ff4a 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -150,16 +150,12 @@ bool HexagonPassConfig::addPostRegAlloc() { bool HexagonPassConfig::addPreSched2() { const HexagonTargetMachine &TM = getHexagonTargetMachine(); - const HexagonTargetObjectFile &TLOF = - (const HexagonTargetObjectFile &)getTargetLowering()->getObjFileLowering(); addPass(createHexagonCopyToCombine()); if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); - if (!TLOF.IsSmallDataEnabled()) { - addPass(createHexagonSplitConst32AndConst64(TM)); - printAndVerify("After hexagon split const32/64 pass"); - } + addPass(createHexagonSplitConst32AndConst64(TM)); + printAndVerify("After hexagon split const32/64 pass"); return true; } diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 13abaf8ce7a1..1b0837cb3b54 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AArch64 ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore +subdirectories = ARM AArch64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp index c700383b5ccc..6ebddaffd30a 100644 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp +++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp @@ -17,8 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "msp430-selectiondag-info" MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} + : TargetSelectionDAGInfo(TM.getDataLayout()) {} MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() { } diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index b9a0962603fb..dd2b857789ca 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -72,56 +72,47 @@ class MipsAsmParser : public MCTargetAsmParser { #define GET_ASSEMBLER_HEADER #include "MipsGenAsmMatcher.inc" + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; /// Parse a register as used in CFI directives bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - 
bool ParseParenSuffix(StringRef Name, - SmallVectorImpl &Operands); + bool ParseParenSuffix(StringRef Name, OperandVector &Operands); - bool ParseBracketSuffix(StringRef Name, - SmallVectorImpl &Operands); + bool ParseBracketSuffix(StringRef Name, OperandVector &Operands); - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - MipsAsmParser::OperandMatchResultTy - parseMemOperand(SmallVectorImpl &Operands); - - MipsAsmParser::OperandMatchResultTy MatchAnyRegisterNameWithoutDollar( - SmallVectorImpl &Operands, StringRef Identifier, - SMLoc S); + MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands); MipsAsmParser::OperandMatchResultTy - MatchAnyRegisterWithoutDollar(SmallVectorImpl &Operands, - SMLoc S); + MatchAnyRegisterNameWithoutDollar(OperandVector &Operands, + StringRef Identifier, SMLoc S); MipsAsmParser::OperandMatchResultTy - ParseAnyRegister(SmallVectorImpl &Operands); + MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S); - MipsAsmParser::OperandMatchResultTy - ParseImm(SmallVectorImpl &Operands); + MipsAsmParser::OperandMatchResultTy ParseAnyRegister(OperandVector &Operands); - MipsAsmParser::OperandMatchResultTy - ParseJumpTarget(SmallVectorImpl &Operands); + MipsAsmParser::OperandMatchResultTy ParseImm(OperandVector &Operands); - MipsAsmParser::OperandMatchResultTy - parseInvNum(SmallVectorImpl &Operands); + MipsAsmParser::OperandMatchResultTy ParseJumpTarget(OperandVector &Operands); - MipsAsmParser::OperandMatchResultTy - ParseLSAImm(SmallVectorImpl &Operands); + MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands); - bool searchSymbolAlias(SmallVectorImpl &Operands); + MipsAsmParser::OperandMatchResultTy ParseLSAImm(OperandVector &Operands); - bool ParseOperand(SmallVectorImpl &, - StringRef Mnemonic); + bool searchSymbolAlias(OperandVector &Operands); + + bool ParseOperand(OperandVector &, StringRef Mnemonic); bool needsExpansion(MCInst &Inst); @@ -183,7 +174,14 @@ class MipsAsmParser : public MCTargetAsmParser { return STI.getFeatureBits() & Mips::FeatureMicroMips; } - bool parseRegister(unsigned &RegNum); + bool hasMips4() const { return STI.getFeatureBits() & Mips::FeatureMips4; } + bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; } + bool hasMips32r6() const { + return STI.getFeatureBits() & Mips::FeatureMips32r6; + } + bool hasMips64r6() const { + return STI.getFeatureBits() & Mips::FeatureMips64r6; + } bool eatComma(StringRef ErrorStr); @@ -205,7 +203,7 @@ class MipsAsmParser : public MCTargetAsmParser { unsigned getGPR(int RegNo); - int getATReg(); + int getATReg(SMLoc Loc); bool processInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl &Instructions); @@ -230,6 +228,14 @@ class MipsAsmParser : public MCTargetAsmParser { } public: + enum MipsMatchResultTy { + Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "MipsGenAsmMatcher.inc" +#undef GET_OPERAND_DIAGNOSTIC_TYPES + + }; + MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) @@ -247,6 +253,9 @@ class MipsAsmParser : public MCTargetAsmParser { MCAsmParser &getParser() const { return Parser; } MCAsmLexer &getLexer() const { return Parser.getLexer(); } + /// True if all of 
$fcc0 - $fcc7 exist for the current ISA. + bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } + /// Warn if RegNo is the current assembler temporary. void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc); }; @@ -289,9 +298,11 @@ class MipsOperand : public MCParsedAsmOperand { k_Token /// A simple token } Kind; +public: MipsOperand(KindTy K, MipsAsmParser &Parser) : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {} +private: /// For diagnostics, and checking the assembler temporary MipsAsmParser &AsmParser; @@ -330,10 +341,11 @@ class MipsOperand : public MCParsedAsmOperand { SMLoc StartLoc, EndLoc; /// Internal constructor for register kinds - static MipsOperand *CreateReg(unsigned Index, RegKind RegKind, - const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, - MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_RegisterIndex, Parser); + static std::unique_ptr CreateReg(unsigned Index, RegKind RegKind, + const MCRegisterInfo *RegInfo, + SMLoc S, SMLoc E, + MipsAsmParser &Parser) { + auto Op = make_unique(k_RegisterIndex, Parser); Op->RegIdx.Index = Index; Op->RegIdx.RegInfo = RegInfo; Op->RegIdx.Kind = RegKind; @@ -612,6 +624,12 @@ class MipsOperand : public MCParsedAsmOperand { return Kind == k_Token; } bool isMem() const override { return Kind == k_Memory; } + bool isConstantMemOff() const { + return isMem() && dyn_cast(getMemOff()); + } + template bool isMemWithSimmOffset() const { + return isMem() && isConstantMemOff() && isInt(getConstantMemOff()); + } bool isInvNum() const { return Kind == k_Immediate; } bool isLSAImm() const { if (!isConstantImm()) @@ -656,9 +674,13 @@ class MipsOperand : public MCParsedAsmOperand { return Mem.Off; } - static MipsOperand *CreateToken(StringRef Str, SMLoc S, - MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_Token, Parser); + int64_t getConstantMemOff() const { + return static_cast(getMemOff())->getValue(); + } + + static std::unique_ptr CreateToken(StringRef Str, SMLoc S, + MipsAsmParser &Parser) { + auto Op = make_unique(k_Token, Parser); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -668,74 +690,75 @@ class MipsOperand : public MCParsedAsmOperand { /// Create a numeric register (e.g. $1). The exact register remains /// unresolved until an instruction successfully matches - static MipsOperand *CreateNumericReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateNumericReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { DEBUG(dbgs() << "CreateNumericReg(" << Index << ", ...)\n"); return CreateReg(Index, RegKind_Numeric, RegInfo, S, E, Parser); } /// Create a register that is definitely a GPR. /// This is typically only used for named registers such as $gp. - static MipsOperand *CreateGPRReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateGPRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_GPR, RegInfo, S, E, Parser); } /// Create a register that is definitely a FGR. /// This is typically only used for named registers such as $f0. 
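// The Create* factories above now hand each operand back as a
// std::unique_ptr<MipsOperand>, so the OperandVector owns its elements and
// the manual deletes elsewhere in the parser can be dropped. A minimal
// standalone sketch of that ownership pattern (Operand, createOperand and
// OperandList are illustrative stand-ins, not the LLVM types; in-tree the
// helper is llvm::make_unique rather than std::make_unique):
#include <memory>
#include <vector>

struct Operand {
  unsigned Index = 0;
};

static std::unique_ptr<Operand> createOperand(unsigned Index) {
  auto Op = std::make_unique<Operand>();
  Op->Index = Index;
  return Op; // ownership transfers to the caller
}

using OperandList = std::vector<std::unique_ptr<Operand>>;

int main() {
  OperandList Operands;
  Operands.push_back(createOperand(4)); // freed automatically with the vector
  return static_cast<int>(Operands.size());
}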
- static MipsOperand *CreateFGRReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateFGRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_FGR, RegInfo, S, E, Parser); } /// Create a register that is definitely an FCC. /// This is typically only used for named registers such as $fcc0. - static MipsOperand *CreateFCCReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateFCCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_FCC, RegInfo, S, E, Parser); } /// Create a register that is definitely an ACC. /// This is typically only used for named registers such as $ac0. - static MipsOperand *CreateACCReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateACCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_ACC, RegInfo, S, E, Parser); } /// Create a register that is definitely an MSA128. /// This is typically only used for named registers such as $w0. - static MipsOperand *CreateMSA128Reg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateMSA128Reg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, RegKind_MSA128, RegInfo, S, E, Parser); } /// Create a register that is definitely an MSACtrl. /// This is typically only used for named registers such as $msaaccess. - static MipsOperand *CreateMSACtrlReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr + CreateMSACtrlReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, RegKind_MSACtrl, RegInfo, S, E, Parser); } - static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_Immediate, Parser); + static std::unique_ptr + CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) { + auto Op = make_unique(k_Immediate, Parser); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static MipsOperand *CreateMem(MipsOperand *Base, const MCExpr *Off, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_Memory, Parser); - Op->Mem.Base = Base; + static std::unique_ptr + CreateMem(std::unique_ptr Base, const MCExpr *Off, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { + auto Op = make_unique(k_Memory, Parser); + Op->Mem.Base = Base.release(); Op->Mem.Off = Off; Op->StartLoc = S; Op->EndLoc = E; @@ -756,7 +779,11 @@ class MipsOperand : public MCParsedAsmOperand { return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31; } bool isFCCAsmReg() const { - return isRegIdx() && RegIdx.Kind & RegKind_FCC && RegIdx.Index <= 7; + if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC)) + return false; + if (!AsmParser.hasEightFccRegisters()) + return RegIdx.Index == 0; + return RegIdx.Index <= 7; } bool isACCAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3; @@ -882,6 +909,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } } + // SSNOP is deprecated on 
MIPS32r6/MIPS64r6 + // We still accept it but it is a normal nop. + if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) { + std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6"; + Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a " + "nop instruction"); + } + if (MCID.hasDelaySlot() && Options.isReorder()) { // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. @@ -1090,8 +1125,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, unsigned ImmOffset, HiOffset, LoOffset; const MCExpr *ExprOffset; unsigned TmpRegNum; - unsigned AtRegNum = getReg( - (isGP64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg()); // 1st operand is either the source or destination register. assert(Inst.getOperand(0).isReg() && "expected register operand kind"); unsigned RegOpNum = Inst.getOperand(0).getReg(); @@ -1111,10 +1144,46 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, ExprOffset = Inst.getOperand(2).getExpr(); // All instructions will have the same location. TempInst.setLoc(IDLoc); - // 1st instruction in expansion is LUi. For load instruction we can use - // the dst register as a temporary if base and dst are different, - // but for stores we must use $at. - TmpRegNum = (isLoad && (BaseRegNum != RegOpNum)) ? RegOpNum : AtRegNum; + // These are some of the types of expansions we perform here: + // 1) lw $8, sym => lui $8, %hi(sym) + // lw $8, %lo(sym)($8) + // 2) lw $8, offset($9) => lui $8, %hi(offset) + // add $8, $8, $9 + // lw $8, %lo(offset)($9) + // 3) lw $8, offset($8) => lui $at, %hi(offset) + // add $at, $at, $8 + // lw $8, %lo(offset)($at) + // 4) sw $8, sym => lui $at, %hi(sym) + // sw $8, %lo(sym)($at) + // 5) sw $8, offset($8) => lui $at, %hi(offset) + // add $at, $at, $8 + // sw $8, %lo(offset)($at) + // 6) ldc1 $f0, sym => lui $at, %hi(sym) + // ldc1 $f0, %lo(sym)($at) + // + // For load instructions we can use the destination register as a temporary + // if base and dst are different (examples 1 and 2) and if the base register + // is general purpose otherwise we must use $at (example 6) and error if it's + // not available. For stores we must use $at (examples 4 and 5) because we + // must not clobber the source register setting up the offset. + const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode()); + int16_t RegClassOp0 = Desc.OpInfo[0].RegClass; + unsigned RegClassIDOp0 = + getContext().getRegisterInfo()->getRegClass(RegClassOp0).getID(); + bool IsGPR = (RegClassIDOp0 == Mips::GPR32RegClassID) || + (RegClassIDOp0 == Mips::GPR64RegClassID); + if (isLoad && IsGPR && (BaseRegNum != RegOpNum)) + TmpRegNum = RegOpNum; + else { + int AT = getATReg(IDLoc); + // At this point we need AT to perform the expansions and we exit if it is + // not available. + if (!AT) + return; + TmpRegNum = + getReg((isGP64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, AT); + } + TempInst.setOpcode(Mips::LUi); TempInst.addOperand(MCOperand::CreateReg(TmpRegNum)); if (isImmOpnd) @@ -1164,10 +1233,24 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, TempInst.clear(); } -bool MipsAsmParser::MatchAndEmitInstruction( - SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, MCStreamer &Out, - unsigned &ErrorInfo, bool MatchingInlineAsm) { +unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + // As described by the Mips32r2 spec, the registers Rd and Rs for + // jalr.hb must be different. 
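// e.g. an input such as "jalr.hb $25, $25" is rejected by the check below
// ("source and destination must be different"), while "jalr.hb $31, $25"
// passes.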
+ unsigned Opcode = Inst.getOpcode(); + + if (Opcode == Mips::JALR_HB && + (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())) + return Match_RequiresDifferentSrcAndDst; + + return Match_Success; +} + +bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; SmallVector Instructions; unsigned MatchResult = @@ -1192,7 +1275,7 @@ bool MipsAsmParser::MatchAndEmitInstruction( if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((MipsOperand *)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -1201,6 +1284,8 @@ bool MipsAsmParser::MatchAndEmitInstruction( } case Match_MnemonicFail: return Error(IDLoc, "invalid instruction"); + case Match_RequiresDifferentSrcAndDst: + return Error(IDLoc, "source and destination must be different"); } return true; } @@ -1354,10 +1439,11 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) { return true; } -int MipsAsmParser::getATReg() { +int MipsAsmParser::getATReg(SMLoc Loc) { int AT = Options.getATRegNum(); if (AT == 0) - TokError("Pseudo instruction requires $at, which is not available"); + reportParseError(Loc, + "Pseudo instruction requires $at, which is not available"); return AT; } @@ -1378,9 +1464,7 @@ int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) { return getReg(RegClass, RegNum); } -bool -MipsAsmParser::ParseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic) { +bool MipsAsmParser::ParseOperand(OperandVector &Operands, StringRef Mnemonic) { DEBUG(dbgs() << "ParseOperand\n"); // Check if the current operand has a custom associated parser, if so, try to @@ -1431,6 +1515,7 @@ MipsAsmParser::ParseOperand(SmallVectorImpl &Operands, case AsmToken::Minus: case AsmToken::Plus: case AsmToken::Integer: + case AsmToken::Tilde: case AsmToken::String: { DEBUG(dbgs() << ".. generic integer\n"); OperandMatchResultTy ResTy = ParseImm(Operands); @@ -1578,11 +1663,11 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) { bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - SmallVector Operands; + SmallVector, 1> Operands; OperandMatchResultTy ResTy = ParseAnyRegister(Operands); if (ResTy == MatchOperand_Success) { assert(Operands.size() == 1); - MipsOperand &Operand = *static_cast(Operands.front()); + MipsOperand &Operand = static_cast(*Operands.front()); StartLoc = Operand.getStartLoc(); EndLoc = Operand.getEndLoc(); @@ -1595,8 +1680,6 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); } - delete &Operand; - return (RegNo == (unsigned)-1); } @@ -1632,8 +1715,8 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) { return Result; } -MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( - SmallVectorImpl &Operands) { +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseMemOperand(OperandVector &Operands) { DEBUG(dbgs() << "parseMemOperand\n"); const MCExpr *IdVal = nullptr; SMLoc S; @@ -1653,8 +1736,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( const AsmToken &Tok = Parser.getTok(); // Get the next token. 
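// At this point the offset expression (if any) has been parsed. For
// "lw $4, 8($5)" the next token is the '(' that introduces the base register;
// for a bare symbol such as "lw $4, sym" there is no '(' and the code below
// either keeps it as a plain immediate (the "la" case) or builds a memory
// operand with $zero as its base.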
if (Tok.isNot(AsmToken::LParen)) { - MipsOperand *Mnemonic = static_cast(Operands[0]); - if (Mnemonic->getToken() == "la") { + MipsOperand &Mnemonic = static_cast(*Operands[0]); + if (Mnemonic.getToken() == "la") { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this)); @@ -1666,9 +1749,10 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( // Zero register assumed, add a memory operand with ZERO as its base. // "Base" will be managed by k_Memory. - MipsOperand *Base = MipsOperand::CreateGPRReg( - 0, getContext().getRegisterInfo(), S, E, *this); - Operands.push_back(MipsOperand::CreateMem(Base, IdVal, S, E, *this)); + auto Base = MipsOperand::CreateGPRReg(0, getContext().getRegisterInfo(), + S, E, *this); + Operands.push_back( + MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this)); return MatchOperand_Success; } Error(Parser.getTok().getLoc(), "'(' expected"); @@ -1695,7 +1779,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( IdVal = MCConstantExpr::Create(0, getContext()); // Replace the register operand with the memory operand. - MipsOperand *op = static_cast(Operands.back()); + std::unique_ptr op( + static_cast(Operands.back().release())); // Remove the register from the operands. // "op" will be managed by k_Memory. Operands.pop_back(); @@ -1709,12 +1794,11 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( getContext()); } - Operands.push_back(MipsOperand::CreateMem(op, IdVal, S, E, *this)); + Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this)); return MatchOperand_Success; } -bool MipsAsmParser::searchSymbolAlias( - SmallVectorImpl &Operands) { +bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) { MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier()); if (Sym) { @@ -1740,9 +1824,8 @@ bool MipsAsmParser::searchSymbolAlias( } else if (Expr->getKind() == MCExpr::Constant) { Parser.Lex(); const MCConstantExpr *Const = static_cast(Expr); - MipsOperand *op = - MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this); - Operands.push_back(op); + Operands.push_back( + MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this)); return true; } } @@ -1750,9 +1833,9 @@ bool MipsAsmParser::searchSymbolAlias( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::MatchAnyRegisterNameWithoutDollar( - SmallVectorImpl &Operands, StringRef Identifier, - SMLoc S) { +MipsAsmParser::MatchAnyRegisterNameWithoutDollar(OperandVector &Operands, + StringRef Identifier, + SMLoc S) { int Index = matchCPURegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::CreateGPRReg( @@ -1799,8 +1882,7 @@ MipsAsmParser::MatchAnyRegisterNameWithoutDollar( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::MatchAnyRegisterWithoutDollar( - SmallVectorImpl &Operands, SMLoc S) { +MipsAsmParser::MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) { auto Token = Parser.getLexer().peekTok(false); if (Token.is(AsmToken::Identifier)) { @@ -1822,8 +1904,8 @@ MipsAsmParser::MatchAnyRegisterWithoutDollar( return MatchOperand_NoMatch; } -MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseAnyRegister( - SmallVectorImpl &Operands) { +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::ParseAnyRegister(OperandVector &Operands) { DEBUG(dbgs() << "ParseAnyRegister\n"); auto Token = Parser.getTok(); @@ -1850,7 +1932,7 @@ MipsAsmParser::OperandMatchResultTy 
MipsAsmParser::ParseAnyRegister( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::ParseImm(SmallVectorImpl &Operands) { +MipsAsmParser::ParseImm(OperandVector &Operands) { switch (getLexer().getKind()) { default: return MatchOperand_NoMatch; @@ -1858,6 +1940,7 @@ MipsAsmParser::ParseImm(SmallVectorImpl &Operands) { case AsmToken::Minus: case AsmToken::Plus: case AsmToken::Integer: + case AsmToken::Tilde: case AsmToken::String: break; } @@ -1872,8 +1955,8 @@ MipsAsmParser::ParseImm(SmallVectorImpl &Operands) { return MatchOperand_Success; } -MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget( - SmallVectorImpl &Operands) { +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::ParseJumpTarget(OperandVector &Operands) { DEBUG(dbgs() << "ParseJumpTarget\n"); SMLoc S = getLexer().getLoc(); @@ -1899,7 +1982,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::parseInvNum(SmallVectorImpl &Operands) { +MipsAsmParser::parseInvNum(OperandVector &Operands) { const MCExpr *IdVal; // If the first token is '$' we may have register operand. if (Parser.getTok().is(AsmToken::Dollar)) @@ -1917,7 +2000,7 @@ MipsAsmParser::parseInvNum(SmallVectorImpl &Operands) { } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::ParseLSAImm(SmallVectorImpl &Operands) { +MipsAsmParser::ParseLSAImm(OperandVector &Operands) { switch (getLexer().getKind()) { default: return MatchOperand_NoMatch; @@ -1982,6 +2065,8 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) { .Case("call_lo", MCSymbolRefExpr::VK_Mips_CALL_LO16) .Case("higher", MCSymbolRefExpr::VK_Mips_HIGHER) .Case("highest", MCSymbolRefExpr::VK_Mips_HIGHEST) + .Case("pcrel_hi", MCSymbolRefExpr::VK_Mips_PCREL_HI16) + .Case("pcrel_lo", MCSymbolRefExpr::VK_Mips_PCREL_LO16) .Default(MCSymbolRefExpr::VK_None); assert(VK != MCSymbolRefExpr::VK_None); @@ -1994,8 +2079,7 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) { /// ::= '(', register, ')' /// handle it before we iterate so we don't get tripped up by the lack of /// a comma. -bool MipsAsmParser::ParseParenSuffix( - StringRef Name, SmallVectorImpl &Operands) { +bool MipsAsmParser::ParseParenSuffix(StringRef Name, OperandVector &Operands) { if (getLexer().is(AsmToken::LParen)) { Operands.push_back( MipsOperand::CreateToken("(", getLexer().getLoc(), *this)); @@ -2023,8 +2107,8 @@ bool MipsAsmParser::ParseParenSuffix( /// ::= '[', integer, ']' /// handle it before we iterate so we don't get tripped up by the lack of /// a comma. 
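/// For example, an MSA element operand such as "$w1[2]" is parsed as the
/// register $w1 followed by the bracketed index 2, with no comma in between.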
-bool MipsAsmParser::ParseBracketSuffix( - StringRef Name, SmallVectorImpl &Operands) { +bool MipsAsmParser::ParseBracketSuffix(StringRef Name, + OperandVector &Operands) { if (getLexer().is(AsmToken::LBrac)) { Operands.push_back( MipsOperand::CreateToken("[", getLexer().getLoc(), *this)); @@ -2046,9 +2130,8 @@ bool MipsAsmParser::ParseBracketSuffix( return false; } -bool MipsAsmParser::ParseInstruction( - ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) { +bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { DEBUG(dbgs() << "ParseInstruction\n"); // Check if we have valid mnemonic if (!mnemonicIsValid(Name, 0)) { @@ -2294,25 +2377,6 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) { return false; } -bool MipsAsmParser::parseRegister(unsigned &RegNum) { - if (!getLexer().is(AsmToken::Dollar)) - return false; - - Parser.Lex(); - - const AsmToken &Reg = Parser.getTok(); - if (Reg.is(AsmToken::Identifier)) { - RegNum = matchCPURegisterName(Reg.getIdentifier()); - } else if (Reg.is(AsmToken::Integer)) { - RegNum = Reg.getIntVal(); - } else { - return false; - } - - Parser.Lex(); - return true; -} - bool MipsAsmParser::eatComma(StringRef ErrorStr) { if (getLexer().isNot(AsmToken::Comma)) { SMLoc Loc = getLexer().getLoc(); @@ -2330,21 +2394,20 @@ bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) { // FIXME: Warn if cpload is used in Mips16 mode. - SmallVector Reg; + SmallVector, 1> Reg; OperandMatchResultTy ResTy = ParseAnyRegister(Reg); if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { reportParseError("expected register containing function address"); return false; } - MipsOperand *RegOpnd = static_cast(Reg[0]); - if (!RegOpnd->isGPRAsmReg()) { - reportParseError(RegOpnd->getStartLoc(), "invalid register"); + MipsOperand &RegOpnd = static_cast(*Reg[0]); + if (!RegOpnd.isGPRAsmReg()) { + reportParseError(RegOpnd.getStartLoc(), "invalid register"); return false; } - getTargetStreamer().emitDirectiveCpload(RegOpnd->getGPR32Reg()); - delete RegOpnd; + getTargetStreamer().emitDirectiveCpload(RegOpnd.getGPR32Reg()); return false; } @@ -2353,23 +2416,48 @@ bool MipsAsmParser::parseDirectiveCPSetup() { unsigned Save; bool SaveIsReg = true; - if (!parseRegister(FuncReg)) - return reportParseError("expected register containing function address"); - FuncReg = getGPR(FuncReg); + SmallVector, 1> TmpReg; + OperandMatchResultTy ResTy = ParseAnyRegister(TmpReg); + if (ResTy == MatchOperand_NoMatch) { + reportParseError("expected register containing function address"); + Parser.eatToEndOfStatement(); + return false; + } + + MipsOperand &FuncRegOpnd = static_cast(*TmpReg[0]); + if (!FuncRegOpnd.isGPRAsmReg()) { + reportParseError(FuncRegOpnd.getStartLoc(), "invalid register"); + Parser.eatToEndOfStatement(); + return false; + } + + FuncReg = FuncRegOpnd.getGPR32Reg(); + TmpReg.clear(); if (!eatComma("expected comma parsing directive")) return true; - if (!parseRegister(Save)) { + ResTy = ParseAnyRegister(TmpReg); + if (ResTy == MatchOperand_NoMatch) { const AsmToken &Tok = Parser.getTok(); if (Tok.is(AsmToken::Integer)) { Save = Tok.getIntVal(); SaveIsReg = false; Parser.Lex(); - } else - return reportParseError("expected save register or stack offset"); - } else - Save = getGPR(Save); + } else { + reportParseError("expected save register or stack offset"); + Parser.eatToEndOfStatement(); + return false; + } + } else { + MipsOperand &SaveOpnd = static_cast(*TmpReg[0]); + 
if (!SaveOpnd.isGPRAsmReg()) { + reportParseError(SaveOpnd.getStartLoc(), "invalid register"); + Parser.eatToEndOfStatement(); + return false; + } + Save = SaveOpnd.getGPR32Reg(); + } if (!eatComma("expected comma parsing directive")) return true; diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 21abe6c5857d..902b87759dc0 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -57,11 +57,23 @@ class MipsDisassembler : public MipsDisassemblerBase { public: /// Constructor - Initializes the disassembler. /// - MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, - bool bigEndian) : - MipsDisassemblerBase(STI, Ctx, bigEndian) { - IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips; - } + MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian) + : MipsDisassemblerBase(STI, Ctx, bigEndian) { + IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips; + } + + bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; } + bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; } + bool hasMips32r6() const { + return STI.getFeatureBits() & Mips::FeatureMips32r6; + } + + bool isGP64() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; } + + bool hasCOP3() const { + // Only present in MIPS-I and MIPS-II + return !hasMips32() && !hasMips3(); + } /// getInstruction - See MCDisassembler. DecodeStatus getInstruction(MCInst &instr, @@ -145,6 +157,10 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -195,6 +211,11 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, @@ -251,6 +272,11 @@ static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeSimm16(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -276,11 +302,45 @@ static DecodeStatus DecodeExtSize(MCInst &Inst, static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + /// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't /// handle. 
template static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, const void *Decoder); + +template +static DecodeStatus +DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + namespace llvm { extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, TheMips64elTarget; @@ -328,6 +388,12 @@ extern "C" void LLVMInitializeMipsDisassembler() { #include "MipsGenDisassemblerTables.inc" +static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { + const MipsDisassemblerBase *Dis = static_cast(D); + const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); + return *(RegInfo->getRegClass(RC).begin() + RegNo); +} + template static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, const void *Decoder) { @@ -374,6 +440,262 @@ static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, return MCDisassembler::Success; } +template +static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the ADDI instruction from the earlier + // ISA's instead). + // + // We have: + // 0b001000 sssss ttttt iiiiiiiiiiiiiiii + // BOVC if rs >= rt + // BEQZALC if rs == 0 && rt != 0 + // BEQC if rs < rt && rs != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rs >= Rt) { + MI.setOpcode(Mips::BOVC); + HasRs = true; + } else if (Rs != 0 && Rs < Rt) { + MI.setOpcode(Mips::BEQC); + HasRs = true; + } else + MI.setOpcode(Mips::BEQZALC); + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the ADDI instruction from the earlier + // ISA's instead). 
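// Concretely, for the ADDI group above: rs=0, rt=4 selects BEQZALC (no rs
// operand is added); rs=4, rt=4 (rs >= rt) selects BOVC; and rs=2, rt=5
// selects BEQC. In every case the 16-bit immediate is sign-extended and
// shifted left by two to give the byte offset.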
+ // + // We have: + // 0b011000 sssss ttttt iiiiiiiiiiiiiiii + // BNVC if rs >= rt + // BNEZALC if rs == 0 && rt != 0 + // BNEC if rs < rt && rs != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rs >= Rt) { + MI.setOpcode(Mips::BNVC); + HasRs = true; + } else if (Rs != 0 && Rs < Rt) { + MI.setOpcode(Mips::BNEC); + HasRs = true; + } else + MI.setOpcode(Mips::BNEZALC); + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BLEZL instruction from the earlier + // ISA's instead). + // + // We have: + // 0b010110 sssss ttttt iiiiiiiiiiiiiiii + // Invalid if rs == 0 + // BLEZC if rs == 0 && rt != 0 + // BGEZC if rs == rt && rt != 0 + // BGEC if rs != rt && rs != 0 && rt != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rt == 0) + return MCDisassembler::Fail; + else if (Rs == 0) + MI.setOpcode(Mips::BLEZC); + else if (Rs == Rt) + MI.setOpcode(Mips::BGEZC); + else { + HasRs = true; + MI.setOpcode(Mips::BGEC); + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BGTZL instruction from the earlier + // ISA's instead). + // + // We have: + // 0b010111 sssss ttttt iiiiiiiiiiiiiiii + // Invalid if rs == 0 + // BGTZC if rs == 0 && rt != 0 + // BLTZC if rs == rt && rt != 0 + // BLTC if rs != rt && rs != 0 && rt != 0 + + bool HasRs = false; + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + + if (Rt == 0) + return MCDisassembler::Fail; + else if (Rs == 0) + MI.setOpcode(Mips::BGTZC); + else if (Rs == Rt) + MI.setOpcode(Mips::BLTZC); + else { + MI.setOpcode(Mips::BLTC); + HasRs = true; + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BGTZ instruction from the earlier + // ISA's instead). 
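// Note that the BLEZL- and BGTZL-group decoders above treat rt == 0 (not
// rs == 0) as the invalid encoding, so e.g. rs=0, rt=3 decodes to
// BLEZC / BGTZC and rs=3, rt=3 to BGEZC / BLTZC respectively.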
+ // + // We have: + // 0b000111 sssss ttttt iiiiiiiiiiiiiiii + // BGTZ if rt == 0 + // BGTZALC if rs == 0 && rt != 0 + // BLTZALC if rs != 0 && rs == rt + // BLTUC if rs != 0 && rs != rt + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + bool HasRt = false; + + if (Rt == 0) { + MI.setOpcode(Mips::BGTZ); + HasRs = true; + } else if (Rs == 0) { + MI.setOpcode(Mips::BGTZALC); + HasRt = true; + } else if (Rs == Rt) { + MI.setOpcode(Mips::BLTZALC); + HasRs = true; + } else { + MI.setOpcode(Mips::BLTUC); + HasRs = true; + HasRt = true; + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + if (HasRt) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BLEZL instruction from the earlier + // ISA's instead). + // + // We have: + // 0b000110 sssss ttttt iiiiiiiiiiiiiiii + // Invalid if rs == 0 + // BLEZALC if rs == 0 && rt != 0 + // BGEZALC if rs == rt && rt != 0 + // BGEUC if rs != rt && rs != 0 && rt != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rt == 0) + return MCDisassembler::Fail; + else if (Rs == 0) + MI.setOpcode(Mips::BLEZALC); + else if (Rs == Rt) + MI.setOpcode(Mips::BGEZALC); + else { + HasRs = true; + MI.setOpcode(Mips::BGEUC); + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + /// readInstruction - read four bytes from the MemoryObject /// and return 32 bit word sorted according to the given endianess static DecodeStatus readInstruction32(const MemoryObject ®ion, @@ -433,6 +755,7 @@ MipsDisassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; if (IsMicroMips) { + DEBUG(dbgs() << "Trying MicroMips32 table (32-bit opcodes):\n"); // Calling the auto-generated decoder function. 
Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address, this, STI); @@ -443,6 +766,37 @@ MipsDisassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } + if (hasCOP3()) { + DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); + Result = + decodeInstruction(DecoderTableCOP3_32, instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + if (hasMips32r6() && isGP64()) { + DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + if (hasMips32r6()) { + DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n"); // Calling the auto-generated decoder function. Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address, this, STI); @@ -486,12 +840,6 @@ Mips64Disassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } -static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { - const MipsDisassemblerBase *Dis = static_cast(D); - const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); - return *(RegInfo->getRegClass(RC).begin() + RegNo); -} - static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -600,6 +948,17 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -725,6 +1084,27 @@ static DecodeStatus DecodeFMem(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff); + unsigned Rt = fieldFromInstruction(Insn, 16, 5); + unsigned Base = fieldFromInstruction(Insn, 21, 5); + + Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if(Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6){ + Inst.addOperand(MCOperand::CreateReg(Rt)); + } + + Inst.addOperand(MCOperand::CreateReg(Rt)); + Inst.addOperand(MCOperand::CreateReg(Base)); + Inst.addOperand(MCOperand::CreateImm(Offset)); + + return MCDisassembler::Success; +} static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -846,12 +1226,23 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, const void *Decoder) 
{ - unsigned BranchOffset = Offset & 0xffff; - BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4; + int32_t BranchOffset = (SignExtend32<16>(Offset) << 2) + 4; Inst.addOperand(MCOperand::CreateImm(BranchOffset)); return MCDisassembler::Success; } @@ -890,8 +1281,7 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset, uint64_t Address, const void *Decoder) { - unsigned BranchOffset = Offset & 0xffff; - BranchOffset = SignExtend32<18>(BranchOffset << 1); + int32_t BranchOffset = SignExtend32<16>(Offset) << 1; Inst.addOperand(MCOperand::CreateImm(BranchOffset)); return MCDisassembler::Success; } @@ -947,3 +1337,9 @@ static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2)); return MCDisassembler::Success; } + +static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<18>(Insn) << 3)); + return MCDisassembler::Success; +} diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index cff6855d4a3f..8c797517e316 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -166,6 +166,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) { case MCSymbolRefExpr::VK_Mips_GOT_LO16: OS << "%got_lo("; break; case MCSymbolRefExpr::VK_Mips_CALL_HI16: OS << "%call_hi("; break; case MCSymbolRefExpr::VK_Mips_CALL_LO16: OS << "%call_lo("; break; + case MCSymbolRefExpr::VK_Mips_PCREL_HI16: OS << "%pcrel_hi("; break; + case MCSymbolRefExpr::VK_Mips_PCREL_LO16: OS << "%pcrel_lo("; break; } OS << SRE->getSymbol(); diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 153974e470f9..d8e6128cd54d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -56,6 +56,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case Mips::fixup_MICROMIPS_GOT_PAGE: case Mips::fixup_MICROMIPS_GOT_OFST: case Mips::fixup_MICROMIPS_GOT_DISP: + case Mips::fixup_MIPS_PCLO16: break; case Mips::fixup_Mips_PC16: // So far we are only using this type for branches. @@ -69,6 +70,13 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, if (!isIntN(16, Value) && Ctx) Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup"); break; + case Mips::fixup_MIPS_PC19_S2: + // Forcing a signed division because Value can be negative. + Value = (int64_t)Value / 4; + // We now check if Value can be encoded as a 19-bit signed immediate. + if (!isIntN(19, Value) && Ctx) + Ctx->FatalError(Fixup.getLoc(), "out of range PC19 fixup"); + break; case Mips::fixup_Mips_26: // So far we are only using this type for jumps. // The displacement is then divided by 4 to give us an 28 bit @@ -80,6 +88,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case Mips::fixup_Mips_GOT_HI16: case Mips::fixup_Mips_CALL_HI16: case Mips::fixup_MICROMIPS_HI16: + case Mips::fixup_MIPS_PCHI16: // Get the 2nd 16-bits. Also add 1 if bit 15 is 1. Value = ((Value + 0x8000) >> 16) & 0xffff; break; @@ -102,6 +111,29 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, if (!isIntN(16, Value) && Ctx) Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup"); break; + case Mips::fixup_MIPS_PC18_S3: + // Forcing a signed division because Value can be negative. 
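// For example, a byte offset of -1024 becomes the field value -128 after the
// division by 8 below; any offset whose scaled value does not fit in 18
// signed bits (roughly +/-1MiB of reach) is reported as an
// "out of range PC18 fixup".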
+ Value = (int64_t)Value / 8; + // We now check if Value can be encoded as a 18-bit signed immediate. + if (!isIntN(18, Value) && Ctx) + Ctx->FatalError(Fixup.getLoc(), "out of range PC18 fixup"); + break; + case Mips::fixup_MIPS_PC21_S2: + Value -= 4; + // Forcing a signed division because Value can be negative. + Value = (int64_t) Value / 4; + // We now check if Value can be encoded as a 21-bit signed immediate. + if (!isIntN(21, Value) && Ctx) + Ctx->FatalError(Fixup.getLoc(), "out of range PC21 fixup"); + break; + case Mips::fixup_MIPS_PC26_S2: + Value -= 4; + // Forcing a signed division because Value can be negative. + Value = (int64_t) Value / 4; + // We now check if Value can be encoded as a 26-bit signed immediate. + if (!isIntN(26, Value) && Ctx) + Ctx->FatalError(Fixup.getLoc(), "out of range PC26 fixup"); + break; } return Value; @@ -189,7 +221,7 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, const MCFixupKindInfo &MipsAsmBackend:: getFixupKindInfo(MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = { + const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = { // This table *must* be in same the order of fixup_* kinds in // MipsFixupKinds.h. // @@ -229,6 +261,12 @@ getFixupKindInfo(MCFixupKind Kind) const { { "fixup_Mips_GOT_LO16", 0, 16, 0 }, { "fixup_Mips_CALL_HI16", 0, 16, 0 }, { "fixup_Mips_CALL_LO16", 0, 16, 0 }, + { "fixup_Mips_PC18_S3", 0, 18, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC19_S2", 0, 19, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC21_S2", 0, 21, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC26_S2", 0, 26, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCHI16", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCLO16", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_MICROMIPS_26_S1", 0, 26, 0 }, { "fixup_MICROMIPS_HI16", 0, 16, 0 }, { "fixup_MICROMIPS_LO16", 0, 16, 0 }, @@ -246,12 +284,78 @@ getFixupKindInfo(MCFixupKind Kind) const { { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 } }; + const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = { + // This table *must* be in same the order of fixup_* kinds in + // MipsFixupKinds.h. 
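// The field widths and flags are the same as in the little-endian table
// above; only the bit offsets change, in effect being counted from the
// opposite end of the 4-byte instruction word: the 26-bit jump field moves to
// offset 6 (32 - 26), the 21-bit branch field to 11 (32 - 21) and the 16-bit
// immediates to 16, while whole-word fixups such as fixup_Mips_32 and
// fixup_Mips_64 keep offset 0.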
+ // + // name offset bits flags + { "fixup_Mips_16", 16, 16, 0 }, + { "fixup_Mips_32", 0, 32, 0 }, + { "fixup_Mips_REL32", 0, 32, 0 }, + { "fixup_Mips_26", 6, 26, 0 }, + { "fixup_Mips_HI16", 16, 16, 0 }, + { "fixup_Mips_LO16", 16, 16, 0 }, + { "fixup_Mips_GPREL16", 16, 16, 0 }, + { "fixup_Mips_LITERAL", 16, 16, 0 }, + { "fixup_Mips_GOT_Global", 16, 16, 0 }, + { "fixup_Mips_GOT_Local", 16, 16, 0 }, + { "fixup_Mips_PC16", 16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_Mips_CALL16", 16, 16, 0 }, + { "fixup_Mips_GPREL32", 0, 32, 0 }, + { "fixup_Mips_SHIFT5", 21, 5, 0 }, + { "fixup_Mips_SHIFT6", 21, 5, 0 }, + { "fixup_Mips_64", 0, 64, 0 }, + { "fixup_Mips_TLSGD", 16, 16, 0 }, + { "fixup_Mips_GOTTPREL", 16, 16, 0 }, + { "fixup_Mips_TPREL_HI", 16, 16, 0 }, + { "fixup_Mips_TPREL_LO", 16, 16, 0 }, + { "fixup_Mips_TLSLDM", 16, 16, 0 }, + { "fixup_Mips_DTPREL_HI", 16, 16, 0 }, + { "fixup_Mips_DTPREL_LO", 16, 16, 0 }, + { "fixup_Mips_Branch_PCRel",16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_Mips_GPOFF_HI", 16, 16, 0 }, + { "fixup_Mips_GPOFF_LO", 16, 16, 0 }, + { "fixup_Mips_GOT_PAGE", 16, 16, 0 }, + { "fixup_Mips_GOT_OFST", 16, 16, 0 }, + { "fixup_Mips_GOT_DISP", 16, 16, 0 }, + { "fixup_Mips_HIGHER", 16, 16, 0 }, + { "fixup_Mips_HIGHEST", 16, 16, 0 }, + { "fixup_Mips_GOT_HI16", 16, 16, 0 }, + { "fixup_Mips_GOT_LO16", 16, 16, 0 }, + { "fixup_Mips_CALL_HI16", 16, 16, 0 }, + { "fixup_Mips_CALL_LO16", 16, 16, 0 }, + { "fixup_Mips_PC18_S3", 14, 18, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC19_S2", 13, 19, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC21_S2", 11, 21, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC26_S2", 6, 26, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCHI16", 16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCLO16", 16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MICROMIPS_26_S1", 6, 26, 0 }, + { "fixup_MICROMIPS_HI16", 16, 16, 0 }, + { "fixup_MICROMIPS_LO16", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT16", 16, 16, 0 }, + { "fixup_MICROMIPS_PC16_S1",16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MICROMIPS_CALL16", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT_DISP", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT_PAGE", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT_OFST", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_GD", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_LDM", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_DTPREL_HI16", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_DTPREL_LO16", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_TPREL_HI16", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_TPREL_LO16", 16, 16, 0 } + }; + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; + + if (IsLittle) + return LittleEndianInfos[Kind - FirstTargetFixupKind]; + return BigEndianInfos[Kind - FirstTargetFixupKind]; } /// WriteNopData - Write an (optimal) nop sequence of Count bytes diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 794978b30bf8..49ac25690b98 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -193,6 +193,24 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, case Mips::fixup_MICROMIPS_TLS_TPREL_LO16: Type = ELF::R_MICROMIPS_TLS_TPREL_LO16; break; + case Mips::fixup_MIPS_PC19_S2: + Type = ELF::R_MIPS_PC19_S2; + break; + case Mips::fixup_MIPS_PC18_S3: + Type = ELF::R_MIPS_PC18_S3; + break; + case 
Mips::fixup_MIPS_PC21_S2: + Type = ELF::R_MIPS_PC21_S2; + break; + case Mips::fixup_MIPS_PC26_S2: + Type = ELF::R_MIPS_PC26_S2; + break; + case Mips::fixup_MIPS_PCHI16: + Type = ELF::R_MIPS_PCHI16; + break; + case Mips::fixup_MIPS_PCLO16: + Type = ELF::R_MIPS_PCLO16; + break; } return Type; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index dc6192c20506..05080f046f89 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -128,6 +128,24 @@ namespace Mips { // resulting in - R_MIPS_CALL_LO16 fixup_Mips_CALL_LO16, + // resulting in - R_MIPS_PC18_S3 + fixup_MIPS_PC18_S3, + + // resulting in - R_MIPS_PC19_S2 + fixup_MIPS_PC19_S2, + + // resulting in - R_MIPS_PC21_S2 + fixup_MIPS_PC21_S2, + + // resulting in - R_MIPS_PC26_S2 + fixup_MIPS_PC26_S2, + + // resulting in - R_MIPS_PCHI16 + fixup_MIPS_PCHI16, + + // resulting in - R_MIPS_PCLO16 + fixup_MIPS_PCLO16, + // resulting in - R_MICROMIPS_26_S1 fixup_MICROMIPS_26_S1, diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 6be8c03c5e63..43fc52136ddc 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -258,7 +258,9 @@ getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getBranchTarget21OpValue expects only expressions or immediates"); - // TODO: Push 21 PC fixup. + const MCExpr *Expr = MO.getExpr(); + Fixups.push_back(MCFixup::Create(0, Expr, + MCFixupKind(Mips::fixup_MIPS_PC21_S2))); return 0; } @@ -278,7 +280,9 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getBranchTarget26OpValue expects only expressions or immediates"); - // TODO: Push 26 PC fixup. + const MCExpr *Expr = MO.getExpr(); + Fixups.push_back(MCFixup::Create(0, Expr, + MCFixupKind(Mips::fixup_MIPS_PC26_S2))); return 0; } @@ -476,6 +480,12 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl &Fixups, case MCSymbolRefExpr::VK_Mips_CALL_LO16: FixupKind = Mips::fixup_Mips_CALL_LO16; break; + case MCSymbolRefExpr::VK_Mips_PCREL_HI16: + FixupKind = Mips::fixup_MIPS_PCHI16; + break; + case MCSymbolRefExpr::VK_Mips_PCREL_LO16: + FixupKind = Mips::fixup_MIPS_PCLO16; + break; } // switch Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind))); @@ -611,11 +621,42 @@ unsigned MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { - assert(MI.getOperand(OpNo).isImm()); - // The immediate is encoded as 'immediate << 2'. - unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); - assert((Res & 3) == 0); - return Res >> 2; + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) { + // The immediate is encoded as 'immediate << 2'. + unsigned Res = getMachineOpValue(MI, MO, Fixups, STI); + assert((Res & 3) == 0); + return Res >> 2; + } + + assert(MO.isExpr() && + "getSimm19Lsl2Encoding expects only expressions or an immediate"); + + const MCExpr *Expr = MO.getExpr(); + Fixups.push_back(MCFixup::Create(0, Expr, + MCFixupKind(Mips::fixup_MIPS_PC19_S2))); + return 0; +} + +unsigned +MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) { + // The immediate is encoded as 'immediate << 3'. 
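// e.g. a byte offset of 64 is emitted as the field value 8; an immediate
// that is not a multiple of 8 trips the assert below.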
+ unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); + assert((Res & 7) == 0); + return Res >> 3; + } + + assert(MO.isExpr() && + "getSimm18Lsl3Encoding expects only expressions or an immediate"); + + const MCExpr *Expr = MO.getExpr(); + Fixups.push_back(MCFixup::Create(0, Expr, + MCFixupKind(Mips::fixup_MIPS_PC18_S3))); + return 0; } #include "MipsGenMCCodeEmitter.inc" diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index 3f7daabfefaa..304167fd03db 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -141,6 +141,10 @@ class MipsMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index cd6be734dfe8..ce4b9a8f9e98 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -64,6 +64,7 @@ class MipsNaClELFStreamer : public MipsELFStreamer { return false; case Mips::JAL: + case Mips::BAL: case Mips::BAL_BR: case Mips::BLTZAL: case Mips::BGEZAL: @@ -137,18 +138,17 @@ class MipsNaClELFStreamer : public MipsELFStreamer { &IsStore); bool IsSPFirstOperand = isStackPointerFirstOperand(Inst); if (IsMemAccess || IsSPFirstOperand) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - bool MaskBefore = (IsMemAccess && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx) .getReg())); bool MaskAfter = IsSPFirstOperand && !IsStore; - if (MaskBefore || MaskAfter) + if (MaskBefore || MaskAfter) { + if (PendingCall) + report_fatal_error("Dangerous instruction in branch delay slot!"); sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter); - else - MipsELFStreamer::EmitInstruction(Inst, STI); - return; + return; + } + // fallthrough } // Sandbox calls by aligning call and branch delay to the bundle end. @@ -203,6 +203,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, case Mips::LWC1: case Mips::LDC1: case Mips::LL: + case Mips::LL_R6: case Mips::LWL: case Mips::LWR: *AddrIdx = 1; @@ -223,6 +224,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, // Store instructions with base address register in position 2. 
case Mips::SC: + case Mips::SC_R6: *AddrIdx = 2; if (IsStore) *IsStore = true; diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index d95f9b07b955..b93017a7400a 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -24,13 +24,13 @@ def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM_MM<0x2f>; def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM_MM<0x2e>; def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, - LWXC1_FM_MM<0x48>; + LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6; def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, - SWXC1_FM_MM<0x88>; + SWXC1_FM_MM<0x88>, INSN_MIPS4_32R2_NOT_32R6_64R6; def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, - LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2; + LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2_NOT_32R6_64R6; def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, - SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2; + SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2_NOT_32R6_64R6; def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM_MM<0>; @@ -38,9 +38,9 @@ def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM_MM<1>; def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>, - BC1F_FM_MM<0x1c>; + BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6; def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>, - BC1F_FM_MM<0x1d>; + BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ROUND_W_FM_MM<0, 0x6c>; diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td index fc656d7beea9..051db7525f6d 100644 --- a/lib/Target/Mips/Mips32r6InstrFormats.td +++ b/lib/Target/Mips/Mips32r6InstrFormats.td @@ -23,13 +23,26 @@ class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, // //===----------------------------------------------------------------------===// -def OPGROUP_COP1 { bits<6> Value = 0b010001; } -def OPGROUP_AUI { bits<6> Value = 0b001111; } -def OPGROUP_DAUI { bits<6> Value = 0b011101; } -def OPGROUP_PCREL { bits<6> Value = 0b111011; } -def OPGROUP_REGIMM { bits<6> Value = 0b000001; } -def OPGROUP_SPECIAL { bits<6> Value = 0b000000; } -def OPGROUP_SPECIAL3 { bits<6> Value = 0b011111; } +class OPGROUP Val> { + bits<6> Value = Val; +} +def OPGROUP_COP1 : OPGROUP<0b010001>; +def OPGROUP_COP2 : OPGROUP<0b010010>; +def OPGROUP_ADDI : OPGROUP<0b001000>; +def OPGROUP_AUI : OPGROUP<0b001111>; +def OPGROUP_BLEZ : OPGROUP<0b000110>; +def OPGROUP_BGTZ : OPGROUP<0b000111>; +def OPGROUP_BLEZL : OPGROUP<0b010110>; +def OPGROUP_BGTZL : OPGROUP<0b010111>; +def OPGROUP_DADDI : OPGROUP<0b011000>; +def OPGROUP_DAUI : OPGROUP<0b011101>; +def OPGROUP_PCREL : OPGROUP<0b111011>; +def OPGROUP_REGIMM : OPGROUP<0b000001>; +def OPGROUP_SPECIAL : OPGROUP<0b000000>; +// The spec occasionally names this value LL, LLD, SC, or SCD. +def OPGROUP_SPECIAL3 : OPGROUP<0b011111>; +// The spec names this constant LWC2, LDC2, SWC2, and SDC2 in different places. 
+def OPGROUP_COP2LDST : OPGROUP<0b010010>; class OPCODE2<bits<2> Val> { bits<2> Value = Val; @@ -38,6 +51,11 @@ def OPCODE2_ADDIUPC : OPCODE2<0b00>; def OPCODE2_LWPC : OPCODE2<0b01>; def OPCODE2_LWUPC : OPCODE2<0b10>; +class OPCODE3<bits<3> Val> { + bits<3> Value = Val; +} +def OPCODE3_LDPC : OPCODE3<0b110>; + class OPCODE5<bits<5> Val> { bits<5> Value = Val; } @@ -45,6 +63,17 @@ def OPCODE5_ALUIPC : OPCODE5<0b11111>; def OPCODE5_AUIPC : OPCODE5<0b11110>; def OPCODE5_DAHI : OPCODE5<0b00110>; def OPCODE5_DATI : OPCODE5<0b11110>; +def OPCODE5_BC1EQZ : OPCODE5<0b01001>; +def OPCODE5_BC1NEZ : OPCODE5<0b01101>; +def OPCODE5_BC2EQZ : OPCODE5<0b01001>; +def OPCODE5_BC2NEZ : OPCODE5<0b01101>; +def OPCODE5_BGEZAL : OPCODE5<0b10001>; +// The next four constants are unnamed in the spec. These names are taken from +// the OPGROUP names they are used with. +def OPCODE5_LDC2 : OPCODE5<0b01110>; +def OPCODE5_LWC2 : OPCODE5<0b01010>; +def OPCODE5_SDC2 : OPCODE5<0b01111>; +def OPCODE5_SWC2 : OPCODE5<0b01011>; class OPCODE6<bits<6> Val> { bits<6> Value = Val; @@ -53,6 +82,19 @@ def OPCODE6_ALIGN : OPCODE6<0b100000>; def OPCODE6_DALIGN : OPCODE6<0b100100>; def OPCODE6_BITSWAP : OPCODE6<0b100000>; def OPCODE6_DBITSWAP : OPCODE6<0b100100>; +def OPCODE6_JALR : OPCODE6<0b001001>; +def OPCODE6_CACHE : OPCODE6<0b100101>; +def OPCODE6_PREF : OPCODE6<0b110101>; +// The next four constants are unnamed in the spec. These names are taken from +// the OPGROUP names they are used with. +def OPCODE6_LL : OPCODE6<0b110110>; +def OPCODE6_LLD : OPCODE6<0b110111>; +def OPCODE6_SC : OPCODE6<0b100110>; +def OPCODE6_SCD : OPCODE6<0b100111>; +def OPCODE6_CLO : OPCODE6<0b010001>; +def OPCODE6_CLZ : OPCODE6<0b010000>; +def OPCODE6_DCLO : OPCODE6<0b010011>; +def OPCODE6_DCLZ : OPCODE6<0b010010>; class FIELD_FMT<bits<5> Val> { bits<5> Value = Val; @@ -86,6 +128,22 @@ class FIELD_CMP_FORMAT<bits<5> Val> { def FIELD_CMP_FORMAT_S : FIELD_CMP_FORMAT<0b10100>; def FIELD_CMP_FORMAT_D : FIELD_CMP_FORMAT<0b10101>; +//===----------------------------------------------------------------------===// +// +// Disambiguators +// +//===----------------------------------------------------------------------===// +// +// Some encodings are ambiguous except by comparing field values. 
+ +class DecodeDisambiguates<string Name> { + string DecoderMethod = !strconcat("Decode", Name); +} + +class DecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> { + string DecoderNamespace = "Mips32r6_64r6_Ambiguous"; +} + //===----------------------------------------------------------------------===// // // Encoding Formats @@ -109,6 +167,17 @@ class DAUI_FM : AUI_FM { let Inst{31-26} = OPGROUP_DAUI.Value; } +class BAL_FM : MipsR6Inst { + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_REGIMM.Value; + let Inst{25-21} = 0b00000; + let Inst{20-16} = OPCODE5_BGEZAL.Value; + let Inst{15-0} = offset; +} + class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst { bits<5> fs; bits<5> fd; @@ -138,6 +207,30 @@ class COP1_3R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst { let Inst{5-0} = funct; } +class COP1_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst { + bits<5> ft; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP1.Value; + let Inst{25-21} = Operation.Value; + let Inst{20-16} = ft; + let Inst{15-0} = offset; +} + +class COP2_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst { + bits<5> ct; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP2.Value; + let Inst{25-21} = Operation.Value; + let Inst{20-16} = ct; + let Inst{15-0} = offset; +} + class PCREL16_FM<OPCODE5 Operation> : MipsR6Inst { bits<5> rs; bits<16> imm; @@ -162,6 +255,18 @@ class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst { let Inst{18-0} = imm; } +class PCREL18_FM<OPCODE3 Operation> : MipsR6Inst { + bits<5> rs; + bits<18> imm; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_PCREL.Value; + let Inst{25-21} = rs; + let Inst{20-18} = Operation.Value; + let Inst{17-0} = imm; +} + class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst { bits<5> rd; bits<5> rt; @@ -176,6 +281,36 @@ class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst { let Inst{5-0} = Operation.Value; } +class SPECIAL3_MEM_FM<OPCODE6 Operation> : MipsR6Inst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-7} = offset; + let Inst{6} = 0; + let Inst{5-0} = Operation.Value; +} + +class SPECIAL_2R_FM<OPCODE6 Operation> : MipsR6Inst { + bits<5> rd; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = 0b00000; + let Inst{15-11} = rd; + let Inst{10-6} = 0b00001; + let Inst{5-0} = Operation.Value; +} + class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst { bits<5> rd; bits<5> rs; @@ -191,25 +326,53 @@ class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst { let Inst{5-0} = funct; } -class CMP_BRANCH_OFF16_FM<bits<6> funct> : MipsR6Inst { +// This class is ambiguous with other branches: +// BEQC/BNEC require that rs > rt +class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst { bits<5> rs; bits<5> rt; bits<16> offset; bits<32> Inst; - let Inst{31-26} = funct; + let Inst{31-26} = funct.Value; let Inst{25-21} = rs; let Inst{20-16} = rt; let Inst{15-0} = offset; } -class CMP_BRANCH_RT_OFF16_FM<bits<6> funct> : CMP_BRANCH_OFF16_FM<funct> { +// This class is ambiguous with other branches: +// BLEZC/BGEZC/BEQZALC/BNEZALC/BGTZALC require that rs == 0 && rt != 0 +// The '1R_RT' in the name means 1 register in the rt field. +class CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP funct> : MipsR6Inst { + bits<5> rt; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = funct.Value; let Inst{25-21} = 0b00000; + let Inst{20-16} = rt; + let Inst{15-0} = offset; +} + +// This class is ambiguous with other branches: +// BLTZC/BGTZC/BLTZALC/BGEZALC require that rs == rt && rt != 0 +// The '1R_BOTH' in the name means 1 register in both the rs and rt fields. 
+class CMP_BRANCH_1R_BOTH_OFF16_FM : MipsR6Inst { + bits<5> rt; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = funct.Value; + let Inst{25-21} = rt; + let Inst{20-16} = rt; + let Inst{15-0} = offset; } class CMP_BRANCH_OFF21_FM funct> : MipsR6Inst { - bits<5> rs; + bits<5> rs; // rs != 0 bits<21> offset; bits<32> Inst; @@ -273,6 +436,23 @@ class SPECIAL3_DALIGN_FM : MipsR6Inst { let Inst{5-0} = Operation.Value; } +class SPECIAL3_LL_SC_FM : MipsR6Inst { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = rt; + let Inst{15-7} = offset; + let Inst{5-0} = Operation.Value; + + string DecoderMethod = "DecodeSpecial3LlSc"; +} + class REGIMM_FM : MipsR6Inst { bits<5> rs; bits<16> imm; @@ -302,3 +482,31 @@ class COP1_CMP_CONDN_FM : MipsR6Inst { + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = 0; + let Inst{15-11} = 0; + let Inst{10} = 1; + let Inst{9-6} = 0; + let Inst{5-0} = Operation.Value; +} + +class COP2LDST_FM : MipsR6Inst { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<11> offset = addr{10-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP2LDST.Value; + let Inst{25-21} = Operation.Value; + let Inst{20-16} = rt; + let Inst{15-11} = base; + let Inst{10-0} = offset; +} diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 4d2d103f88e8..52e12fcf9462 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -14,43 +14,9 @@ include "Mips32r6InstrFormats.td" // Notes about removals/changes from MIPS32r6: -// Unclear: ssnop -// Reencoded: cache, pref -// Reencoded: clo, clz // Reencoded: jr -> jalr // Reencoded: jr.hb -> jalr.hb -// Reencoded: ldc2 -// Reencoded: ll, sc -// Reencoded: lwc2 // Reencoded: sdbbp -// Reencoded: sdc2 -// Reencoded: swc2 -// Removed: /.ps$/, cvt.ps.s, cvt.ps.pw -// Removed: addi -// Removed: bc1any2, bc1any4 -// Removed: bc2[ft] -// Removed: bc2f, bc2t -// Removed: bgezal -// Removed: bltzal -// Removed: c.cond.fmt, bc1[ft] -// Removed: div, divu -// Removed: jalx -// Removed: ldxc1 -// Removed: luxc1 -// Removed: lwl, lwr, lwle, lwre, swl, swr, swle, swre -// Removed: lwxc1 -// Removed: madd.[ds], nmadd.[ds], nmsub.[ds], sub.[ds] -// Removed: mfhi, mflo, mthi, mtlo, madd, maddu, msub, msubu, mul -// Removed: movf, movt -// Removed: movf.fmt, movt.fmt, movn.fmt, movz.fmt -// Removed: movn, movz -// Removed: mult, multu -// Removed: prefx -// Removed: sdxc1 -// Removed: suxc1 -// Removed: swxc1 -// Removed: teqi, tgei, tgeiu, tlti, tltiu, tnei -// Rencoded: [ls][wd]c2 def brtarget21 : Operand { let EncoderMethod = "getBranchTarget21OpValue"; @@ -88,30 +54,61 @@ class ALUIPC_ENC : PCREL16_FM; class AUI_ENC : AUI_FM; class AUIPC_ENC : PCREL16_FM; +class BAL_ENC : BAL_FM; class BALC_ENC : BRANCH_OFF26_FM<0b111010>; class BC_ENC : BRANCH_OFF26_FM<0b110010>; -class BEQC_ENC : CMP_BRANCH_OFF16_FM<0b001000>; -class BEQZALC_ENC : CMP_BRANCH_RT_OFF16_FM<0b001000>; -class BNEC_ENC : CMP_BRANCH_OFF16_FM<0b011000>; -class BNEZALC_ENC : CMP_BRANCH_RT_OFF16_FM<0b011000>; - -class BLTZC_ENC : CMP_BRANCH_OFF16_FM<0b010111>; -class BGEZC_ENC : CMP_BRANCH_OFF16_FM<0b010110>; -class BGTZALC_ENC : CMP_BRANCH_RT_OFF16_FM<0b000111>; - -class BLEZC_ENC : CMP_BRANCH_RT_OFF16_FM<0b010110>; -class BLTZALC_ENC : CMP_BRANCH_OFF16_FM<0b000111>; -class BGTZC_ENC : 
CMP_BRANCH_RT_OFF16_FM<0b010111>; +class BEQC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguates<"AddiGroupBranch">; +class BEQZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"DaddiGroupBranch">; +class BNEC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguates<"DaddiGroupBranch">; +class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"DaddiGroupBranch">; + +class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BgtzlGroupBranch">; +class BGEC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"BlezlGroupBranch">; +class BGEUC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"BlezGroupBranch">; +class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BlezlGroupBranch">; +class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BgtzGroupBranch">; + +class BLTC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"BgtzlGroupBranch">; +class BLTUC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"BgtzGroupBranch">; + +class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BlezlGroupBranch">; +class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BgtzGroupBranch">; +class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BgtzlGroupBranch">; class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>; -class BGEZALC_ENC : CMP_BRANCH_OFF16_FM<0b000110>; +class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BlezGroupBranch">; class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>; +class BC1EQZ_ENC : COP1_BCCZ_FM; +class BC1NEZ_ENC : COP1_BCCZ_FM; +class BC2EQZ_ENC : COP2_BCCZ_FM; +class BC2NEZ_ENC : COP2_BCCZ_FM; + class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>; class JIC_ENC : JMP_IDX_COMPACT_FM<0b110110>; - +class JR_HB_R6_ENC : JR_HB_R6_FM; class BITSWAP_ENC : SPECIAL3_2R_FM; -class BLEZALC_ENC : CMP_BRANCH_RT_OFF16_FM<0b000110>; +class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BlezGroupBranch">; +class BNVC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"DaddiGroupBranch">; +class BOVC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"AddiGroupBranch">; class DIV_ENC : SPECIAL_3R_FM<0b00010, 0b011010>; class DIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011011>; class MOD_ENC : SPECIAL_3R_FM<0b00011, 0b011010>; @@ -155,11 +152,27 @@ class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>; class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>; class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>; -class CMP_CONDN_DESC_BASE { - dag OutOperandList = (outs FGROpnd:$fd); +class CACHE_ENC : SPECIAL3_MEM_FM; +class PREF_ENC : SPECIAL3_MEM_FM; + +class LDC2_R6_ENC : COP2LDST_FM; +class LWC2_R6_ENC : COP2LDST_FM; +class SDC2_R6_ENC : COP2LDST_FM; +class SWC2_R6_ENC : COP2LDST_FM; + +class LL_R6_ENC : SPECIAL3_LL_SC_FM; +class SC_R6_ENC : SPECIAL3_LL_SC_FM; + +class CLO_R6_ENC : SPECIAL_2R_FM; +class CLZ_R6_ENC : SPECIAL_2R_FM; + +class CMP_CONDN_DESC_BASE { + dag OutOperandList = (outs FGRCCOpnd:$fd); dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft); string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft"); - list Pattern = []; + list Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))]; } //===----------------------------------------------------------------------===// @@ -174,25 +187,25 @@ multiclass CMP_CC_M , ISA_MIPS32R6; def CMP_UN_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, ISA_MIPS32R6; def CMP_EQ_#NAME : 
COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, ISA_MIPS32R6; def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, ISA_MIPS32R6; def CMP_OLT_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd, setolt>, ISA_MIPS32R6; def CMP_ULT_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, ISA_MIPS32R6; def CMP_OLE_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd, setole>, ISA_MIPS32R6; def CMP_ULE_#NAME : COP1_CMP_CONDN_FM, - CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, ISA_MIPS32R6; def CMP_SF_#NAME : COP1_CMP_CONDN_FM, CMP_CONDN_DESC_BASE<"sf", Typestr, FGROpnd>, @@ -226,16 +239,17 @@ multiclass CMP_CC_M { +class PCREL_DESC_BASE { dag OutOperandList = (outs GPROpnd:$rs); - dag InOperandList = (ins simm19_lsl2:$imm); + dag InOperandList = (ins ImmOpnd:$imm); string AsmString = !strconcat(instr_asm, "\t$rs, $imm"); list Pattern = []; } -class ADDIUPC_DESC : PCREL19_DESC_BASE<"addiupc", GPR32Opnd>; -class LWPC_DESC: PCREL19_DESC_BASE<"lwpc", GPR32Opnd>; -class LWUPC_DESC: PCREL19_DESC_BASE<"lwupc", GPR32Opnd>; +class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2>; +class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2>; +class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2>; class ALIGN_DESC_BASE { @@ -297,28 +311,34 @@ class CMP_CBR_EQNE_Z_DESC_BASE : BRANCH_DESC_BASE { - dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset); + dag InOperandList = (ins GPROpnd:$rt, opnd:$offset); dag OutOperandList = (outs); string AsmString = !strconcat(instr_asm, "\t$rt, $offset"); list Defs = [AT]; } +class BAL_DESC : BC_DESC_BASE<"bal", brtarget> { + bit isCall = 1; + bit hasDelaySlot = 1; + list Defs = [RA]; +} + class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> { bit isCall = 1; list Defs = [RA]; } class BC_DESC : BC_DESC_BASE<"bc", brtarget26>; +class BGEC_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR32Opnd>; +class BGEUC_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR32Opnd>; class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>; class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>; -class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd> { - string Constraints = "$rs = $rt"; -} +class BLTC_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR32Opnd>; +class BLTUC_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR32Opnd>; -class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd> { - string Constraints = "$rs = $rt"; -} +class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>; +class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>; class BLEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR32Opnd>; class BGTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>; @@ -326,6 +346,29 @@ class BGTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>; class BEQZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR32Opnd>; class BNEZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR32Opnd>; +class COP1_BCCZ_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins FGR64Opnd:$ft, brtarget:$offset); + dag OutOperandList = 
(outs); + string AsmString = instr_asm; + bit hasDelaySlot = 1; +} + +class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">; +class BC1NEZ_DESC : COP1_BCCZ_DESC_BASE<"bc1nez $ft, $offset">; + +class COP2_BCCZ_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins COP2Opnd:$ct, brtarget:$offset); + dag OutOperandList = (outs); + string AsmString = instr_asm; + bit hasDelaySlot = 1; +} + +class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">; +class BC2NEZ_DESC : COP2_BCCZ_DESC_BASE<"bc2nez $ct, $offset">; + +class BOVC_DESC : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>; +class BNVC_DESC : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>; + class JMP_IDX_COMPACT_DESC_BASE { dag InOperandList = (ins GPROpnd:$rt, opnd:$offset); @@ -347,6 +390,14 @@ class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> { list Defs = [AT]; } +class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> { + bit isBranch = 1; + bit isIndirectBranch = 1; + bit hasDelaySlot = 1; + bit isTerminator=1; + bit isBarrier=1; +} + class BITSWAP_DESC_BASE { dag OutOperandList = (outs GPROpnd:$rd); dag InOperandList = (ins GPROpnd:$rt); @@ -356,24 +407,28 @@ class BITSWAP_DESC_BASE { class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>; -class DIVMOD_DESC_BASE { +class DIVMOD_DESC_BASE { dag OutOperandList = (outs GPROpnd:$rd); dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); - list Pattern = []; + list Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))]; + + // This instruction doesn't trap division by zero itself. We must insert + // teq instructions as well. + bit usesCustomInserter = 1; } -class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd>; -class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd>; -class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd>; -class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd>; +class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd, sdiv>; +class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, udiv>; +class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd, srem>; +class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, urem>; class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> { list Defs = [RA]; } class BGEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezalc", brtarget, GPR32Opnd> { - string Constraints = "$rs = $rt"; list Defs = [RA]; } @@ -386,35 +441,41 @@ class BLEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezalc", brtarget, GPR32Opnd> { } class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> { - string Constraints = "$rs = $rt"; list Defs = [RA]; } class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> { list Defs = [RA]; } -class MUL_R6_DESC_BASE { + +class MUL_R6_DESC_BASE { dag OutOperandList = (outs GPROpnd:$rd); dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); - list Pattern = []; + list Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))]; } -class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd>; -class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd>; -class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd>; +class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd, mulhs>; +class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, mulhu>; +class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, mul>; class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>; -class COP1_4R_DESC_BASE { +class COP1_SEL_DESC_BASE { dag OutOperandList = (outs FGROpnd:$fd); - dag 
InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); + dag InOperandList = (ins FGRCCOpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); - list Pattern = []; + list Pattern = [(set FGROpnd:$fd, (select FGRCCOpnd:$fd_in, + FGROpnd:$ft, + FGROpnd:$fs))]; string Constraints = "$fd_in = $fd"; } -class SEL_D_DESC : COP1_4R_DESC_BASE<"sel.d", FGR64Opnd>; -class SEL_S_DESC : COP1_4R_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} +class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; class SELEQNE_Z_DESC_BASE { dag OutOperandList = (outs GPROpnd:$rd); @@ -426,6 +487,14 @@ class SELEQNE_Z_DESC_BASE { class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>; class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>; +class COP1_4R_DESC_BASE { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list Pattern = []; + string Constraints = "$fd_in = $fd"; +} + class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>; class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>; class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>; @@ -472,6 +541,79 @@ class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; +class CACHE_HINT_DESC { + dag OutOperandList = (outs); + dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); + string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); + list Pattern = []; +} + +class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>; +class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd>; + +class COP2LD_DESC_BASE { + dag OutOperandList = (outs COPOpnd:$rt); + dag InOperandList = (ins mem_simm11:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list Pattern = []; + bit mayLoad = 1; +} + +class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>; +class LWC2_R6_DESC : COP2LD_DESC_BASE<"lwc2", COP2Opnd>; + +class COP2ST_DESC_BASE { + dag OutOperandList = (outs); + dag InOperandList = (ins COPOpnd:$rt, mem_simm11:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list Pattern = []; + bit mayStore = 1; +} + +class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>; +class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd>; + +class LL_R6_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list Pattern = []; + bit mayLoad = 1; +} + +class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd>; + +class SC_R6_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$dst); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list Pattern = []; + bit mayStore = 1; + string Constraints = "$rt = $dst"; +} + +class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd>; + +class CLO_CLZ_R6_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); +} + +class CLO_R6_DESC_BASE : + CLO_CLZ_R6_DESC_BASE { + list Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))]; +} + +class CLZ_R6_DESC_BASE : + CLO_CLZ_R6_DESC_BASE { + list 
Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))]; +} + +class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd>; +class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd>; + //===----------------------------------------------------------------------===// // // Instruction Definitions @@ -483,17 +625,18 @@ def ALIGN : ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6; def ALUIPC : ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6; def AUI : AUI_ENC, AUI_DESC, ISA_MIPS32R6; def AUIPC : AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6; +def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6; def BALC : BALC_ENC, BALC_DESC, ISA_MIPS32R6; -def BC1EQZ; -def BC1NEZ; -def BC2EQZ; -def BC2NEZ; +def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6; +def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6; +def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6; +def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6; def BC : BC_ENC, BC_DESC, ISA_MIPS32R6; def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6; def BEQZALC : BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6; def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6; -def BGEC; // Also aliased to blec with operands swapped -def BGEUC; // Also aliased to bleuc with operands swapped +def BGEC : BGEC_ENC, BGEC_DESC, ISA_MIPS32R6; +def BGEUC : BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6; def BGEZALC : BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6; def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6; def BGTZALC : BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6; @@ -501,24 +644,31 @@ def BGTZC : BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6; def BITSWAP : BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6; def BLEZALC : BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6; def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6; -def BLTC; // Also aliased to bgtc with operands swapped -def BLTUC; // Also aliased to bgtuc with operands swapped +def BLTC : BLTC_ENC, BLTC_DESC, ISA_MIPS32R6; +def BLTUC : BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6; def BLTZALC : BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6; def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6; def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6; def BNEZALC : BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6; def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; -def BNVC; -def BOVC; +def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; +def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; +def CACHE_R6 : CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6; def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6; +def CLO_R6 : CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; +def CLZ_R6 : CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M; defm D : CMP_CC_M; def DIV : DIV_ENC, DIV_DESC, ISA_MIPS32R6; def DIVU : DIVU_ENC, DIVU_DESC, ISA_MIPS32R6; def JIALC : JIALC_ENC, JIALC_DESC, ISA_MIPS32R6; def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6; +def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6; +def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6; +def LL_R6 : LL_R6_ENC, LL_R6_DESC, ISA_MIPS32R6; // def LSA; // See MSA +def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6; def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6; @@ -540,13 +690,105 @@ def MUHU : MUHU_ENC, MUHU_DESC, ISA_MIPS32R6; def MUL_R6 : MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 +def PREF_R6 : PREF_ENC, PREF_DESC, ISA_MIPS32R6; def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6; def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6; -def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6; +def SC_R6 : SC_R6_ENC, SC_R6_DESC, 
ISA_MIPS32R6; +def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; +def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6; def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6; -def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6; +def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6; def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6; def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6; def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6; +def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; + +//===----------------------------------------------------------------------===// +// +// Patterns and Pseudo Instructions +// +//===----------------------------------------------------------------------===// + +// f32 comparisons supported via another comparison +def : MipsPat<(setone f32:$lhs, f32:$rhs), + (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; +def : MipsPat<(seto f32:$lhs, f32:$rhs), + (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; +def : MipsPat<(setune f32:$lhs, f32:$rhs), + (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; +def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>, + ISA_MIPS32R6; +def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>, + ISA_MIPS32R6; +def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>, + ISA_MIPS32R6; +def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_OLT_S f32:$lhs, f32:$rhs)>, + ISA_MIPS32R6; +def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_OLE_S f32:$lhs, f32:$rhs)>, + ISA_MIPS32R6; +def : MipsPat<(setne f32:$lhs, f32:$rhs), + (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; + +// f64 comparisons supported via another comparison +def : MipsPat<(setone f64:$lhs, f64:$rhs), + (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +def : MipsPat<(seto f64:$lhs, f64:$rhs), + (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +def : MipsPat<(setune f64:$lhs, f64:$rhs), + (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>, + ISA_MIPS32R6; +def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>, + ISA_MIPS32R6; +def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>, + ISA_MIPS32R6; +def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_OLT_D f64:$lhs, f64:$rhs)>, + ISA_MIPS32R6; +def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_OLE_D f64:$lhs, f64:$rhs)>, + ISA_MIPS32R6; +def : MipsPat<(setne f64:$lhs, f64:$rhs), + (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; + +// i32 selects +def : MipsPat<(select i32:$cond, i32:$t, i32:$f), + (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, + ISA_MIPS32R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f), + (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, + ISA_MIPS32R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f), + (OR (SELNEZ i32:$f, i32:$cond), (SELEQZ i32:$t, i32:$cond))>, + ISA_MIPS32R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), + (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), + (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, + ISA_MIPS32R6; +def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), + (OR (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)), + (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)))>, + 
ISA_MIPS32R6; +def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t, + i32:$f), + (OR (SELNEZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))), + (SELEQZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>, + ISA_MIPS32R6; +def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), + i32:$t, i32:$f), + (OR (SELNEZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))), + (SELEQZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>, + ISA_MIPS32R6; + +def : MipsPat<(select i32:$cond, i32:$t, immz), + (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz), + (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz), + (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select i32:$cond, immz, i32:$f), + (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f), + (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f), + (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index df49aa8e7802..88422ce19dea 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -36,6 +36,9 @@ def immZExt6 : ImmLeaf; def immSExt10_64 : PatLeaf<(i64 imm), [{ return isInt<10>(N->getSExtValue()); }]>; +def immZExt16_64 : PatLeaf<(i64 imm), + [{ return isInt<16>(N->getZExtValue()); }]>; + //===----------------------------------------------------------------------===// // Instructions specific format //===----------------------------------------------------------------------===// @@ -62,7 +65,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in { let DecoderNamespace = "Mips64" in { /// Arithmetic Instructions (ALU Immediate) def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>, - ISA_MIPS3; + ISA_MIPS3_NOT_32R6_64R6; def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU, immSExt16, add>, ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3; @@ -155,17 +158,17 @@ def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>; } def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>, - ISA_MIPS3; + ISA_MIPS3_NOT_32R6_64R6; def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>, - ISA_MIPS3; + ISA_MIPS3_NOT_32R6_64R6; def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>, - ISA_MIPS3; + ISA_MIPS3_NOT_32R6_64R6; def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>, - ISA_MIPS3; + ISA_MIPS3_NOT_32R6_64R6; /// Load-linked, Store-conditional -def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3; -def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3; +def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3_NOT_32R6_64R6; +def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6; /// Jump and Branch Instructions let isCodeGenOnly = 1 in { @@ -183,30 +186,36 @@ def TAILCALL64_R : TailCallReg; /// Multiply and Divide Instructions. 
def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1c>, ISA_MIPS3; + MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6; def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1d>, ISA_MIPS3; + MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDMULT : MultDivPseudo; + II_DMULT>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDMULTu : MultDivPseudo; + II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6; def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1e>, ISA_MIPS3; + MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6; def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1f>, ISA_MIPS3; + MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDSDIV : MultDivPseudo; + II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDUDIV : MultDivPseudo; + II_DDIVU, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6; let isCodeGenOnly = 1 in { -def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>; -def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>; -def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>; -def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>; -def PseudoMFHI64 : PseudoMFLOHI; -def PseudoMFLO64 : PseudoMFLOHI; -def PseudoMTLOHI64 : PseudoMTLOHI; +def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>, + ISA_MIPS3_NOT_32R6_64R6; +def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>, + ISA_MIPS3_NOT_32R6_64R6; +def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>, + ISA_MIPS3_NOT_32R6_64R6; +def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>, + ISA_MIPS3_NOT_32R6_64R6; +def PseudoMFHI64 : PseudoMFLOHI, + ISA_MIPS3_NOT_32R6_64R6; +def PseudoMFLO64 : PseudoMFLOHI, + ISA_MIPS3_NOT_32R6_64R6; +def PseudoMTLOHI64 : PseudoMTLOHI, ISA_MIPS3_NOT_32R6_64R6; /// Sign Ext In Register Instructions. def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>, @@ -216,8 +225,8 @@ def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>, } /// Count Leading -def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64; -def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64; +def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64_NOT_64R6; +def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64_NOT_64R6; /// Double Word Swap Bytes/HalfWords def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2; @@ -245,16 +254,12 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in { "sll\t$rd, $rt, 0", [], II_SLL>; } -// We need the following two pseudo instructions to avoid offset calculation for +// We need the following pseudo instruction to avoid offset calculation for // long branches. See the comment in file MipsLongBranch.cpp for detailed // explanation. -// Expands to: lui $dst, %highest($tgt - $baltgt) -def LONG_BRANCH_LUi64 : PseudoSE<(outs GPR64Opnd:$dst), - (ins brtarget:$tgt, brtarget:$baltgt), []>; - // Expands to: daddiu $dst, $src, %PART($tgt - $baltgt) -// where %PART may be %higher, %hi or %lo, depending on the relocation kind +// where %PART may be %hi or %lo, depending on the relocation kind // that $tgt is annotated with. 
def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst), (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>; @@ -435,13 +440,13 @@ def : MipsInstAlias<"daddu $rs, $rt, $imm", 0>; def : MipsInstAlias<"dadd $rs, $rt, $imm", (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm), - 0>; + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"daddu $rs, $imm", (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), 0>; def : MipsInstAlias<"dadd $rs, $imm", (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), - 0>; + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"add $rs, $imm", (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>; @@ -454,10 +459,22 @@ def : MipsInstAlias<"dsll $rd, $rt, $rs", def : MipsInstAlias<"dsubu $rt, $rs, $imm", (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs, InvertedImOperand64:$imm), 0>; +def : MipsInstAlias<"dsubi $rs, $rt, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, + InvertedImOperand64:$imm), + 0>, ISA_MIPS3_NOT_32R6_64R6; +def : MipsInstAlias<"dsubi $rs, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, + InvertedImOperand64:$imm), + 0>, ISA_MIPS3_NOT_32R6_64R6; +def : MipsInstAlias<"dsub $rs, $rt, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, + InvertedImOperand64:$imm), + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"dsub $rs, $imm", (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, InvertedImOperand64:$imm), - 0>; + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"dsubu $rs, $imm", (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, InvertedImOperand64:$imm), diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td index 0f48784898e7..dbfb9fbbf9bc 100644 --- a/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -13,11 +13,6 @@ // Notes about removals/changes from MIPS32r6: // Reencoded: dclo, dclz -// Reencoded: lld, scd -// Removed: daddi -// Removed: ddiv, ddivu, dmult, dmultu -// Removed: div, divu -// Removed: ldl, ldr, ldle, ldre, sdl, sdr, sdle, sdre //===----------------------------------------------------------------------===// // @@ -30,6 +25,8 @@ class DAUI_ENC : DAUI_FM; class DAHI_ENC : REGIMM_FM; class DATI_ENC : REGIMM_FM; class DBITSWAP_ENC : SPECIAL3_2R_FM; +class DCLO_R6_ENC : SPECIAL_2R_FM; +class DCLZ_R6_ENC : SPECIAL_2R_FM; class DDIV_ENC : SPECIAL_3R_FM<0b00010, 0b011110>; class DDIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011111>; class DMOD_ENC : SPECIAL_3R_FM<0b00011, 0b011110>; @@ -38,6 +35,9 @@ class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b111000>; class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b111001>; class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b111000>; class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b111001>; +class LDPC_ENC : PCREL18_FM; +class LLD_R6_ENC : SPECIAL3_LL_SC_FM; +class SCD_R6_ENC : SPECIAL3_LL_SC_FM; //===----------------------------------------------------------------------===// // @@ -45,19 +45,33 @@ class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b111001>; // //===----------------------------------------------------------------------===// +class AHI_ATI_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins GPROpnd:$rt, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $imm"); + string Constraints = "$rs = $rt"; +} + class DALIGN_DESC : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>; -class DAHI_DESC : AUI_DESC_BASE<"dahi", GPR64Opnd>; -class DATI_DESC : AUI_DESC_BASE<"dati", GPR64Opnd>; +class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>; +class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>; class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd>; class 
DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>; -class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd>; -class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd>; -class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd>; -class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd>; -class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd>; -class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd>; -class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd>; +class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>; +class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>; +class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>; +class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>; +class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>; +class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>; +class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>; +class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, mulhu>; +class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, mul>; class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>; +class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3>; +class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd>; +class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd>; +class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>; +class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>; //===----------------------------------------------------------------------===// // @@ -70,6 +84,8 @@ def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6; +def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6; +def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6; def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6; def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6; // def DLSA; // See MSA @@ -79,4 +95,113 @@ def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6; def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6; def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6; def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6; -def LDPC; +def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6; +def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS32R6; +def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6; +let DecoderNamespace = "Mips32r6_64r6_GP64" in { + def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64; + def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64; +} + +//===----------------------------------------------------------------------===// +// +// Patterns and Pseudo Instructions +// +//===----------------------------------------------------------------------===// + +// i64 selects +def : MipsPat<(select i64:$cond, i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, i64:$cond), + (SELEQZ64 i64:$f, i64:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, i64:$cond), + (SELEQZ64 i64:$f, i64:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$f, i64:$cond), + (SELEQZ64 i64:$t, i64:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)), + (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)), + 
(SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)))>, + ISA_MIPS64R6; +def : MipsPat< + (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, + (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)), + sub_32)), + (SELEQZ64 i64:$f, + (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)), + sub_32)))>, + ISA_MIPS64R6; +def : MipsPat< + (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, + (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)), + sub_32)), + (SELEQZ64 i64:$f, + (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)), + sub_32)))>, + ISA_MIPS64R6; + +def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, immz), + (SELNEZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, immz), + (SELEQZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i64:$cond, immz)), immz, i64:$f), + (SELEQZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immz)), immz, i64:$f), + (SELNEZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6; + +// i64 selects from an i32 comparison +// One complicating factor here is that bits 32-63 of an i32 are undefined. +// FIXME: Ideally, setcc would always produce an i64 on MIPS64 targets. +// This would allow us to remove the sign-extensions here. +def : MipsPat<(select i32:$cond, i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)), + (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)), + (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)), + (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond, + immZExt16:$imm))), + (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond, + immZExt16:$imm))))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond, + immZExt16:$imm))), + (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond, + immZExt16:$imm))))>, + ISA_MIPS64R6; + +def : MipsPat<(select i32:$cond, i64:$t, immz), + (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, immz), + (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, immz), + (SELEQZ64 i64:$t, (SLL64_32 i32:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select i32:$cond, immz, i64:$f), + (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f), + (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f), + (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>, + ISA_MIPS64R6; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 626657e9dff2..6df90aa75a11 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -958,7 +958,6 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) { bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const { return (Opcode == Mips::LONG_BRANCH_LUi || 
Opcode == Mips::LONG_BRANCH_ADDiu - || Opcode == Mips::LONG_BRANCH_LUi64 || Opcode == Mips::LONG_BRANCH_DADDiu); } diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index 13fa546b9eb2..151ef134e1da 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -124,6 +124,7 @@ class MipsCodeEmitter : public MachineFunctionPass { unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getSimm18Lsl3Encoding(const MachineInstr &MI, unsigned OpNo) const; /// Expand pseudo instructions with accumulator register operands. void expandACCInstr(MachineBasicBlock::instr_iterator MI, @@ -273,6 +274,12 @@ unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI, return 0; } +unsigned MipsCodeEmitter::getSimm18Lsl3Encoding(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Unimplemented function."); + return 0; +} + unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const { llvm_unreachable("Unimplemented function."); diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index 7177f6544a39..690f62608504 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -104,136 +104,162 @@ multiclass MovnPats, - ADD_FM<0, 0xa>, INSN_MIPS4_32; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in { def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; } def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>, - ADD_FM<0, 0xb>, INSN_MIPS4_32; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in { def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; } def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>, - CMov_I_F_FM<18, 16>, INSN_MIPS4_32; + CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>, - CMov_I_F_FM<18, 16>, AdditionalRequires<[HasMips64]>; + CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[HasMips64]>; def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>, - CMov_I_F_FM<19, 16>, INSN_MIPS4_32; + CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>, - CMov_I_F_FM<19, 16>, AdditionalRequires<[IsGP64bit]>; + CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[IsGP64bit]>; def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, II_MOVZ_D>, CMov_I_F_FM<18, 17>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; def 
MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, II_MOVN_D>, CMov_I_F_FM<19, 17>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; let DecoderNamespace = "Mips64" in { def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>, - CMov_I_F_FM<18, 17>, INSN_MIPS4_32, FGR_64; + CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>, - CMov_I_F_FM<19, 17>, INSN_MIPS4_32, FGR_64; + CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; let isCodeGenOnly = 1 in { - def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, - II_MOVZ_D>, CMov_I_F_FM<18, 17>, FGR_64; - def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, - II_MOVN_D>, CMov_I_F_FM<19, 17>, FGR_64; + def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>, + CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; + def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>, + CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; } } def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>, - CMov_F_I_FM<1>, INSN_MIPS4_32; + CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>, - CMov_F_I_FM<1>, AdditionalRequires<[IsGP64bit]>; + CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[IsGP64bit]>; def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>, - CMov_F_I_FM<0>, INSN_MIPS4_32; + CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>, - CMov_F_I_FM<0>, AdditionalRequires<[IsGP64bit]>; + CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[IsGP64bit]>; def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>, - CMov_F_F_FM<16, 1>, INSN_MIPS4_32; + CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>, - CMov_F_F_FM<16, 0>, INSN_MIPS4_32; + CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D, MipsCMovFP_T>, CMov_F_F_FM<17, 1>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D, MipsCMovFP_F>, CMov_F_F_FM<17, 0>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; let DecoderNamespace = "Mips64" in { def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>, - CMov_F_F_FM<17, 1>, INSN_MIPS4_32, FGR_64; + CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>, - CMov_F_F_FM<17, 0>, INSN_MIPS4_32, FGR_64; + CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; } // Instantiation of conditional move patterns. 
-defm : MovzPats0; -defm : MovzPats1; -defm : MovzPats2; +defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovzPats1, INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovzPats2, INSN_MIPS4_32_NOT_32R6_64R6; -defm : MovzPats0, GPR_64; +defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; defm : MovzPats0, - GPR_64; + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats2, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats2, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats2, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; + +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6; + +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, + GPR_64; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, + GPR_64; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; -defm : MovzPats1, GPR_64; -defm : MovzPats1, GPR_64; -defm : MovzPats1, GPR_64; -defm : MovzPats2, GPR_64; -defm : MovzPats2, GPR_64; -defm : MovzPats2, GPR_64; - -defm : MovnPats; - -defm : MovnPats, GPR_64; -defm : MovnPats, GPR_64; -defm : MovnPats, GPR_64; -defm : MovzPats0; -defm : MovzPats1; -defm : MovnPats; +defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovzPats1, INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6; defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1, INSN_MIPS4_32_NOT_32R6_64R6, + GPR_64; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; -defm : MovzPats1, GPR_64; -defm : MovnPats, GPR_64; -defm : MovzPats0, FGR_32; -defm : MovzPats1, FGR_32; -defm : MovnPats, FGR_32; +defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; +defm : MovzPats1, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_32; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_32; -defm : MovzPats0, FGR_64; +defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; defm : MovzPats0, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; +defm : MovzPats1, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_64; +defm : MovzPats1, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_64; +defm : MovnPats, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; -defm : MovzPats1, FGR_64; -defm : MovzPats1, FGR_64; -defm : MovnPats, FGR_64; -defm : MovnPats, FGR_64; diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index d6c7cac27300..bcfbc12df01c 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -177,6 +178,13 @@ namespace { for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) Changed |= runOnMachineBasicBlock(*FI); + + // This pass invalidates liveness information when it reorders + // instructions to fill delay slot. Without this, -verify-machineinstrs + // will fail. 
+ if (Changed) + F.getRegInfo().invalidateLiveness(); + return Changed; } diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 268a0ed591c4..375d9b205542 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -12,6 +12,7 @@ #include "MipsISelLowering.h" #include "MipsMachineFunction.h" #include "MipsSubtarget.h" +#include "MipsTargetMachine.h" using namespace llvm; @@ -36,11 +37,11 @@ class MipsFastISel final : public FastISel { /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can /// make the right decision when generating code for different targets. - const MipsSubtarget *Subtarget; Module &M; const TargetMachine &TM; const TargetInstrInfo &TII; const TargetLowering &TLI; + const MipsSubtarget &Subtarget; MipsFunctionInfo *MFI; // Convenience variables to avoid some queries. @@ -54,12 +55,12 @@ class MipsFastISel final : public FastISel { : FastISel(funcInfo, libInfo), M(const_cast(*funcInfo.Fn->getParent())), TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()), - TLI(*TM.getTargetLowering()) { - Subtarget = &TM.getSubtarget(); + TLI(*TM.getTargetLowering()), + Subtarget(TM.getSubtarget()) { MFI = funcInfo.MF->getInfo(); Context = &funcInfo.Fn->getContext(); - TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) && - (Subtarget->hasMips32r2() && (Subtarget->isABI_O32()))); + TargetSupported = ((Subtarget.getRelocationModel() == Reloc::PIC_) && + (Subtarget.hasMips32r2() && (Subtarget.isABI_O32()))); } bool TargetSelectInstruction(const Instruction *I) override; @@ -68,8 +69,11 @@ class MipsFastISel final : public FastISel { bool ComputeAddress(const Value *Obj, Address &Addr); private: + bool EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + unsigned Alignment = 0); bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Alignment = 0); + bool SelectLoad(const Instruction *I); bool SelectRet(const Instruction *I); bool SelectStore(const Instruction *I); @@ -80,6 +84,36 @@ class MipsFastISel final : public FastISel { unsigned MaterializeGV(const GlobalValue *GV, MVT VT); unsigned MaterializeInt(const Constant *C, MVT VT); unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC); + + // for some reason, this default is not generated by tablegen + // so we explicitly generate it here. + // + unsigned FastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, uint64_t imm1, + uint64_t imm2, unsigned Op3, bool Op3IsKill) { + return 0; + } + + MachineInstrBuilder EmitInst(unsigned Opc) { + return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + } + + MachineInstrBuilder EmitInst(unsigned Opc, unsigned DstReg) { + return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + DstReg); + } + + MachineInstrBuilder EmitInstStore(unsigned Opc, unsigned SrcReg, + unsigned MemReg, int64_t MemOffset) { + return EmitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset); + } + + MachineInstrBuilder EmitInstLoad(unsigned Opc, unsigned DstReg, + unsigned MemReg, int64_t MemOffset) { + return EmitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset); + } + +#include "MipsGenFastISel.inc" }; bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) { @@ -100,6 +134,8 @@ bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) { // We will extend this in a later patch: // If this is a type than can be sign or zero-extended to a basic operation // go ahead and accept it now. 
+ if (VT == MVT::i8 || VT == MVT::i16) + return true; return false; } @@ -116,6 +152,45 @@ bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) { return Addr.Base.Reg != 0; } +bool MipsFastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + unsigned Alignment) { + // + // more cases will be handled here in following patches. + // + unsigned Opc; + switch (VT.SimpleTy) { + case MVT::i32: { + ResultReg = createResultReg(&Mips::GPR32RegClass); + Opc = Mips::LW; + break; + } + case MVT::i16: { + ResultReg = createResultReg(&Mips::GPR32RegClass); + Opc = Mips::LHu; + break; + } + case MVT::i8: { + ResultReg = createResultReg(&Mips::GPR32RegClass); + Opc = Mips::LBu; + break; + } + case MVT::f32: { + ResultReg = createResultReg(&Mips::FGR32RegClass); + Opc = Mips::LWC1; + break; + } + case MVT::f64: { + ResultReg = createResultReg(&Mips::AFGR64RegClass); + Opc = Mips::LDC1; + break; + } + default: + return false; + } + EmitInstLoad(Opc, ResultReg, Addr.Base.Reg, Addr.Offset); + return true; +} + // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) { @@ -141,12 +216,49 @@ bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr, // // more cases will be handled here in following patches. // - if (VT != MVT::i32) + unsigned Opc; + switch (VT.SimpleTy) { + case MVT::i8: + Opc = Mips::SB; + break; + case MVT::i16: + Opc = Mips::SH; + break; + case MVT::i32: + Opc = Mips::SW; + break; + case MVT::f32: + Opc = Mips::SWC1; + break; + case MVT::f64: + Opc = Mips::SDC1; + break; + default: + return false; + } + EmitInstStore(Opc, SrcReg, Addr.Base.Reg, Addr.Offset); + return true; +} + +bool MipsFastISel::SelectLoad(const Instruction *I) { + // Atomic loads need special handling. + if (cast(I)->isAtomic()) + return false; + + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getType(), VT)) + return false; + + // See if we can handle this address. 
+ Address Addr; + if (!ComputeAddress(I->getOperand(0), Addr)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::SW)) - .addReg(SrcReg) - .addReg(Addr.Base.Reg) - .addImm(Addr.Offset); + + unsigned ResultReg; + if (!EmitLoad(VT, ResultReg, Addr, cast(I)->getAlignment())) + return false; + UpdateValueMap(I, ResultReg); return true; } @@ -186,8 +298,7 @@ bool MipsFastISel::SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { return false; } - unsigned RetOpc = Mips::RetRA; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc)); + EmitInst(Mips::RetRA); return true; } @@ -197,6 +308,8 @@ bool MipsFastISel::TargetSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; + case Instruction::Load: + return SelectLoad(I); case Instruction::Store: return SelectStore(I); case Instruction::Ret: @@ -207,6 +320,22 @@ bool MipsFastISel::TargetSelectInstruction(const Instruction *I) { } unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) { + int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + if (VT == MVT::f32) { + const TargetRegisterClass *RC = &Mips::FGR32RegClass; + unsigned DestReg = createResultReg(RC); + unsigned TempReg = Materialize32BitInt(Imm, &Mips::GPR32RegClass); + EmitInst(Mips::MTC1, DestReg).addReg(TempReg); + return DestReg; + } else if (VT == MVT::f64) { + const TargetRegisterClass *RC = &Mips::AFGR64RegClass; + unsigned DestReg = createResultReg(RC); + unsigned TempReg1 = Materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass); + unsigned TempReg2 = + Materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass); + EmitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1); + return DestReg; + } return 0; } @@ -221,9 +350,8 @@ unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) { // TLS not supported at this time. if (IsThreadLocal) return 0; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LW), DestReg) - .addReg(MFI->getGlobalBaseReg()) - .addGlobalAddress(GV, 0, MipsII::MO_GOT); + EmitInst(Mips::LW, DestReg).addReg(MFI->getGlobalBaseReg()).addGlobalAddress( + GV, 0, MipsII::MO_GOT); return DestReg; } unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) { @@ -245,15 +373,10 @@ unsigned MipsFastISel::Materialize32BitInt(int64_t Imm, if (isInt<16>(Imm)) { unsigned Opc = Mips::ADDiu; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Mips::ZERO) - .addImm(Imm); + EmitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm); return ResultReg; } else if (isUInt<16>(Imm)) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi), - ResultReg) - .addReg(Mips::ZERO) - .addImm(Imm); + EmitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm); return ResultReg; } unsigned Lo = Imm & 0xFFFF; @@ -261,16 +384,10 @@ unsigned MipsFastISel::Materialize32BitInt(int64_t Imm, if (Lo) { // Both Lo and Hi have nonzero bits. 
unsigned TmpReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi), - TmpReg).addImm(Hi); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi), - ResultReg) - .addReg(TmpReg) - .addImm(Lo); - + EmitInst(Mips::LUi, TmpReg).addImm(Hi); + EmitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo); } else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi), - ResultReg).addImm(Hi); + EmitInst(Mips::LUi, ResultReg).addImm(Hi); } return ResultReg; } diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 4eb9d4356ef5..90cff631931f 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -202,8 +202,9 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { #ifndef NDEBUG case ISD::LOAD: case ISD::STORE: - assert(cast(Node)->getMemoryVT().getSizeInBits() / 8 <= - cast(Node)->getAlignment() && + assert((Subtarget.systemSupportsUnalignedAccess() || + cast(Node)->getMemoryVT().getSizeInBits() / 8 <= + cast(Node)->getAlignment()) && "Unexpected unaligned loads/stores."); break; #endif diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index ff20988159af..605236fee55d 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -287,7 +287,8 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -815,10 +816,10 @@ addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC) return VReg; } -static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI, - MachineBasicBlock &MBB, - const TargetInstrInfo &TII, - bool Is64Bit) { +static MachineBasicBlock *insertDivByZeroTrap(MachineInstr *MI, + MachineBasicBlock &MBB, + const TargetInstrInfo &TII, + bool Is64Bit) { if (NoZeroDivCheck) return &MBB; @@ -836,6 +837,10 @@ static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI, // Clear Divisor's kill flag. Divisor.setIsKill(false); + + // We would normally delete the original instruction here but in this case + // we only needed to inject an additional instruction rather than replace it. 
+ return &MBB; } @@ -918,10 +923,22 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitAtomicCmpSwap(MI, BB, 8); case Mips::PseudoSDIV: case Mips::PseudoUDIV: - return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), false); + case Mips::DIV: + case Mips::DIVU: + case Mips::MOD: + case Mips::MODU: + return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(), + false); case Mips::PseudoDSDIV: case Mips::PseudoDUDIV: - return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), true); + case Mips::DDIV: + case Mips::DDIVU: + case Mips::DMOD: + case Mips::DMODU: + return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(), + true); + case Mips::SEL_D: + return emitSEL_D(MI, BB); } } @@ -941,16 +958,20 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned LL, SC, AND, NOR, ZERO, BEQ; if (Size == 4) { - LL = isMicroMips ? Mips::LL_MM : Mips::LL; - SC = isMicroMips ? Mips::SC_MM : Mips::SC; + if (isMicroMips) { + LL = Mips::LL_MM; + SC = Mips::SC_MM; + } else { + LL = Subtarget->hasMips32r6() ? Mips::LL : Mips::LL_R6; + SC = Subtarget->hasMips32r6() ? Mips::SC : Mips::SC_R6; + } AND = Mips::AND; NOR = Mips::NOR; ZERO = Mips::ZERO; BEQ = Mips::BEQ; - } - else { - LL = Mips::LLD; - SC = Mips::SCD; + } else { + LL = Subtarget->hasMips64r6() ? Mips::LLD : Mips::LLD_R6; + SC = Subtarget->hasMips64r6() ? Mips::SCD : Mips::SCD_R6; AND = Mips::AND64; NOR = Mips::NOR64; ZERO = Mips::ZERO_64; @@ -1012,11 +1033,39 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, return exitMBB; } -MachineBasicBlock * -MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned Size, unsigned BinOpcode, - bool Nand) const { +MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg( + MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg, + unsigned SrcReg) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + if (Subtarget->hasMips32r2() && Size == 1) { + BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg); + return BB; + } + + if (Subtarget->hasMips32r2() && Size == 2) { + BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg); + return BB; + } + + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + const TargetRegisterClass *RC = getRegClassFor(MVT::i32); + unsigned ScrReg = RegInfo.createVirtualRegister(RC); + + assert(Size < 32); + int64_t ShiftImm = 32 - (Size * 8); + + BuildMI(BB, DL, TII->get(Mips::SLL), ScrReg).addReg(SrcReg).addImm(ShiftImm); + BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm); + + return BB; +} + +MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( + MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode, + bool Nand) const { assert((Size == 1 || Size == 2) && "Unsupported size for EmitAtomicBinaryPartial."); @@ -1046,7 +1095,6 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI, unsigned StoreVal = RegInfo.createVirtualRegister(RC); unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC); unsigned SrlRes = RegInfo.createVirtualRegister(RC); - unsigned SllRes = RegInfo.createVirtualRegister(RC); unsigned Success = RegInfo.createVirtualRegister(RC); // insert new blocks after the current block @@ -1152,19 +1200,14 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI, // sinkMBB: // and 
maskedoldval1,oldval,mask // srl srlres,maskedoldval1,shiftamt - // sll sllres,srlres,24 - // sra dest,sllres,24 + // sign_extend dest,srlres BB = sinkMBB; - int64_t ShiftImm = (Size == 1) ? 24 : 16; BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1) .addReg(OldVal).addReg(Mask); BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes) .addReg(MaskedOldVal1).addReg(ShiftAmt); - BuildMI(BB, DL, TII->get(Mips::SLL), SllRes) - .addReg(SrlRes).addImm(ShiftImm); - BuildMI(BB, DL, TII->get(Mips::SRA), Dest) - .addReg(SllRes).addImm(ShiftImm); + BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes); MI->eraseFromParent(); // The instruction is gone now. @@ -1285,7 +1328,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI, unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC); unsigned StoreVal = RegInfo.createVirtualRegister(RC); unsigned SrlRes = RegInfo.createVirtualRegister(RC); - unsigned SllRes = RegInfo.createVirtualRegister(RC); unsigned Success = RegInfo.createVirtualRegister(RC); // insert new blocks after the current block @@ -1382,23 +1424,44 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI, // sinkMBB: // srl srlres,maskedoldval0,shiftamt - // sll sllres,srlres,24 - // sra dest,sllres,24 + // sign_extend dest,srlres BB = sinkMBB; - int64_t ShiftImm = (Size == 1) ? 24 : 16; BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes) .addReg(MaskedOldVal0).addReg(ShiftAmt); - BuildMI(BB, DL, TII->get(Mips::SLL), SllRes) - .addReg(SrlRes).addImm(ShiftImm); - BuildMI(BB, DL, TII->get(Mips::SRA), Dest) - .addReg(SllRes).addImm(ShiftImm); + BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes); MI->eraseFromParent(); // The instruction is gone now. return exitMBB; } +MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock::iterator II(MI); + + unsigned Fc = MI->getOperand(1).getReg(); + const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID); + + unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass); + + BuildMI(*BB, II, DL, TII->get(Mips::SUBREG_TO_REG), Fc2) + .addImm(0) + .addReg(Fc) + .addImm(Mips::sub_lo); + + // We don't erase the original instruction, we just replace the condition + // register with the 64-bit super-register. + MI->getOperand(1).setReg(Fc2); + + return BB; +} + //===----------------------------------------------------------------------===// // Misc Lower Operation implementation //===----------------------------------------------------------------------===// @@ -1439,6 +1502,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); SDLoc DL(Op); + assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); SDValue CondRes = createFPCmp(DAG, Op.getOperand(1)); // Return if flag is not set by a floating point comparison. @@ -1458,6 +1522,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue MipsTargetLowering:: lowerSELECT(SDValue Op, SelectionDAG &DAG) const { + assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op.getOperand(0)); // Return if flag is not set by a floating point comparison. 
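[Editor's note, not part of the patch: emitSignExtendToI32InReg, introduced a few hunks above and used in the atomic sinkMBB blocks here, picks seb/seh on MIPS32r2 and otherwise falls back to an sll/sra pair shifted by 32 - 8*Size. A minimal host-side sketch of that choice with hypothetical helper names; only the shift arithmetic and the seb/seh semantics mirror the patch. It relies on two's-complement narrowing and arithmetic right shift of signed values, which is guaranteed since C++20 and is what practice delivers everywhere MIPS is hosted.]

#include <cassert>
#include <cstdint>
#include <cstdio>

// The sll/sra fallback: shift left then arithmetic-shift right by
// (32 - 8*SizeInBytes) to sign-extend the low SizeInBytes bytes in-register.
static int32_t signExtendViaShifts(uint32_t Src, unsigned SizeInBytes) {
  assert(SizeInBytes == 1 || SizeInBytes == 2);
  const unsigned ShiftImm = 32 - SizeInBytes * 8; // 24 for bytes, 16 for halfwords
  return static_cast<int32_t>(Src << ShiftImm) >> ShiftImm;
}

// What seb/seh (available from MIPS32r2) do in a single instruction.
static int32_t signExtendViaSebSeh(uint32_t Src, unsigned SizeInBytes) {
  if (SizeInBytes == 1)
    return static_cast<int8_t>(Src);  // seb
  return static_cast<int16_t>(Src);   // seh
}

int main() {
  const uint32_t Values[] = {0x00000080u, 0x0000007Fu, 0x0000FFFFu,
                             0x00008000u, 0xDEADBEEFu};
  for (uint32_t V : Values)
    for (unsigned Size : {1u, 2u})
      assert(signExtendViaShifts(V, Size) == signExtendViaSebSeh(V, Size));
  std::puts("shift-pair and seb/seh agree");
  return 0;
}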
@@ -1483,6 +1548,7 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { + assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); assert(Cond.getOpcode() == MipsISD::FPCmp && @@ -1941,6 +2007,9 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *LD = cast(Op); EVT MemVT = LD->getMemoryVT(); + if (Subtarget->systemSupportsUnalignedAccess()) + return Op; + // Return if load is aligned or if MemVT is neither i32 nor i64. if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) || ((MemVT != MVT::i32) && (MemVT != MVT::i64))) @@ -2064,7 +2133,8 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = SD->getMemoryVT(); // Lower unaligned integer stores. - if ((SD->getAlignment() < MemVT.getSizeInBits() / 8) && + if (!Subtarget->systemSupportsUnalignedAccess() && + (SD->getAlignment() < MemVT.getSizeInBits() / 8) && ((MemVT == MVT::i32) || (MemVT == MVT::i64))) return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle()); @@ -3485,21 +3555,22 @@ passByValArg(SDValue Chain, SDLoc DL, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, const MipsCC &CC, const ByValArgInfo &ByVal, const ISD::ArgFlagsTy &Flags, bool isLittle) const { - unsigned ByValSize = Flags.getByValSize(); - unsigned Offset = 0; // Offset in # of bytes from the beginning of struct. - unsigned RegSize = CC.regSize(); - unsigned Alignment = std::min(Flags.getByValAlign(), RegSize); - EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSize * 8); + unsigned ByValSizeInBytes = Flags.getByValSize(); + unsigned OffsetInBytes = 0; // From beginning of struct + unsigned RegSizeInBytes = CC.regSize(); + unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes); + EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); if (ByVal.NumRegs) { const MCPhysReg *ArgRegs = CC.intArgRegs(); - bool LeftoverBytes = (ByVal.NumRegs * RegSize > ByValSize); + bool LeftoverBytes = (ByVal.NumRegs * RegSizeInBytes > ByValSizeInBytes); unsigned I = 0; // Copy words to registers. - for (; I < ByVal.NumRegs - LeftoverBytes; ++I, Offset += RegSize) { + for (; I < ByVal.NumRegs - LeftoverBytes; + ++I, OffsetInBytes += RegSizeInBytes) { SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); + DAG.getConstant(OffsetInBytes, PtrTy)); SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr, MachinePointerInfo(), false, false, false, Alignment); @@ -3509,38 +3580,38 @@ passByValArg(SDValue Chain, SDLoc DL, } // Return if the struct has been fully copied. - if (ByValSize == Offset) + if (ByValSizeInBytes == OffsetInBytes) return; // Copy the remainder of the byval argument with sub-word loads and shifts. 
if (LeftoverBytes) { - assert((ByValSize > Offset) && (ByValSize < Offset + RegSize) && - "Size of the remainder should be smaller than RegSize."); + assert((ByValSizeInBytes > OffsetInBytes) && + (ByValSizeInBytes < OffsetInBytes + RegSizeInBytes) && + "Size of the remainder should be smaller than RegSizeInBytes."); SDValue Val; - for (unsigned LoadSize = RegSize / 2, TotalSizeLoaded = 0; - Offset < ByValSize; LoadSize /= 2) { - unsigned RemSize = ByValSize - Offset; + for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0; + OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) { + unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes; - if (RemSize < LoadSize) + if (RemainingSizeInBytes < LoadSizeInBytes) continue; // Load subword. SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); - SDValue LoadVal = - DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, - MachinePointerInfo(), MVT::getIntegerVT(LoadSize * 8), - false, false, Alignment); + DAG.getConstant(OffsetInBytes, PtrTy)); + SDValue LoadVal = DAG.getExtLoad( + ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(), + MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, Alignment); MemOpChains.push_back(LoadVal.getValue(1)); // Shift the loaded value. unsigned Shamt; if (isLittle) - Shamt = TotalSizeLoaded; + Shamt = TotalBytesLoaded * 8; else - Shamt = (RegSize - (TotalSizeLoaded + LoadSize)) * 8; + Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8; SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal, DAG.getConstant(Shamt, MVT::i32)); @@ -3550,9 +3621,9 @@ passByValArg(SDValue Chain, SDLoc DL, else Val = Shift; - Offset += LoadSize; - TotalSizeLoaded += LoadSize; - Alignment = std::min(Alignment, LoadSize); + OffsetInBytes += LoadSizeInBytes; + TotalBytesLoaded += LoadSizeInBytes; + Alignment = std::min(Alignment, LoadSizeInBytes); } unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I]; @@ -3562,9 +3633,9 @@ passByValArg(SDValue Chain, SDLoc DL, } // Copy remainder of byval arg to it with memcpy. - unsigned MemCpySize = ByValSize - Offset; + unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes; SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); + DAG.getConstant(OffsetInBytes, PtrTy)); SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr, DAG.getIntPtrConstant(ByVal.Address)); Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy), diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 4ac33bf1142c..b006aba380ea 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -598,6 +598,12 @@ namespace llvm { unsigned getJumpTableEncoding() const override; + /// Emit a sign-extension using sll/sra, seb, or seh appropriately. + MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, unsigned DstReg, + unsigned SrcRec) const; + MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode, bool Nand = false) const; MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI, @@ -607,6 +613,7 @@ namespace llvm { MachineBasicBlock *BB, unsigned Size) const; MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size) const; + MachineBasicBlock *emitSEL_D(MachineInstr *MI, MachineBasicBlock *BB) const; }; /// Create MipsTargetLowering objects. 
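[Editor's note, not part of the patch: the byval renaming above (OffsetInBytes, RegSizeInBytes, TotalBytesLoaded) makes the bytes-versus-bits distinction explicit; the loads advance in bytes while the shift amount fed to SHL is in bits, TotalBytesLoaded * 8 on little-endian targets. A small standalone sketch of the leftover-bytes loop under that assumption, using hypothetical names and assuming a little-endian host so that memcpy into an integer matches the target's register packing.]

#include <cassert>
#include <cstdint>
#include <cstring>
#include <cstdio>

// Assemble the trailing, sub-register-sized bytes of a by-value argument into
// one register-sized integer, mirroring the halving load sizes and the
// bit-based shift amounts of passByValArg (little-endian branch only).
static uint32_t copyLeftoverBytesLE(const uint8_t *Arg, unsigned OffsetInBytes,
                                    unsigned ByValSizeInBytes,
                                    unsigned RegSizeInBytes) {
  assert(ByValSizeInBytes > OffsetInBytes &&
         ByValSizeInBytes < OffsetInBytes + RegSizeInBytes);
  uint32_t Val = 0;
  unsigned TotalBytesLoaded = 0;
  for (unsigned LoadSizeInBytes = RegSizeInBytes / 2;
       OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
    unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;
    if (RemainingSizeInBytes < LoadSizeInBytes)
      continue; // remainder too small, try the next smaller load size

    // Zero-extending sub-word load (little-endian host assumed).
    uint32_t LoadVal = 0;
    std::memcpy(&LoadVal, Arg + OffsetInBytes, LoadSizeInBytes);

    // Shift amount is expressed in bits, not bytes.
    unsigned Shamt = TotalBytesLoaded * 8;
    Val |= LoadVal << Shamt;

    OffsetInBytes += LoadSizeInBytes;
    TotalBytesLoaded += LoadSizeInBytes;
  }
  return Val;
}

int main() {
  // A 7-byte struct whose first 4 bytes went to a full register word,
  // leaving 3 leftover bytes at offset 4: a halfword load then a byte load.
  const uint8_t Struct[7] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
  uint32_t Reg = copyLeftoverBytesLE(Struct, 4, 7, 4);
  assert(Reg == 0x00776655u); // bytes 4..6, packed little-endian
  std::printf("leftover register value: 0x%08x\n", Reg);
  return 0;
}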
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 32cda3b278a5..aa8f053327f3 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -153,6 +153,15 @@ class MTC1_FT; +class MTC1_64_FT : + InstSE<(outs DstRC:$fs), (ins DstRC:$fs_in, SrcRC:$rt), + !strconcat(opstr, "\t$rt, $fs"), [], Itin, FrmFR, opstr> { + // $fs_in is part of a white lie to work around a widespread bug in the FPU + // implementation. See expandBuildPairF64 for details. + let Constraints = "$fs = $fs_in"; +} + class LW_FT : InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"), @@ -249,11 +258,11 @@ multiclass C_COND_M fmt, def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM; } -defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>; -defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, +defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>, ISA_MIPS1_NOT_32R6_64R6; +defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[NotFP64bit]>; let DecoderNamespace = "Mips64" in -defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, +defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[IsFP64bit]>; //===----------------------------------------------------------------------===// @@ -355,8 +364,12 @@ def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>, MFC1_FM<3>, ISA_MIPS32R2; -def MTHC1 : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>, - MFC1_FM<7>, ISA_MIPS32R2; +def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>, + MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>; +def MTHC1_D64 : MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>, + MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; +} def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1, bitconvert>, MFC1_FM<1>, ISA_MIPS3; def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1, @@ -390,56 +403,64 @@ def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>, // Cop2 Memory Instructions // FIXME: These aren't really FPU instructions and as such don't belong in this // file -def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>; -def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>; -def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, ISA_MIPS2; -def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, ISA_MIPS2; +def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>, + ISA_MIPS1_NOT_32R6_64R6; +def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>, + ISA_MIPS1_NOT_32R6_64R6; +def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, + ISA_MIPS2_NOT_32R6_64R6; +def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, + ISA_MIPS2_NOT_32R6_64R6; // Cop3 Memory Instructions // FIXME: These aren't really FPU instructions and as such don't belong in this // file -def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>; -def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>; -def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, ISA_MIPS2; -def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, ISA_MIPS2; +let DecoderNamespace = "COP3_" in { + def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>; + def SWC3 : SW_FT<"swc3", COP3Opnd, 
NoItinerary, store>, LW_FM<0x3b>; + def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, + ISA_MIPS2; + def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, + ISA_MIPS2; +} // Indexed loads and stores. // Base register + offset register addressing mode (indicated by "x" in the // instruction mnemonic) is disallowed under NaCl. let AdditionalPredicates = [IsNotNaCl] in { def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, - INSN_MIPS4_32R2; + INSN_MIPS4_32R2_NOT_32R6_64R6; def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, - INSN_MIPS4_32R2; + INSN_MIPS4_32R2_NOT_32R6_64R6; } let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in { def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, - INSN_MIPS4_32R2, FGR_32; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32; def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, - INSN_MIPS4_32R2, FGR_32; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32; } let DecoderNamespace="Mips64" in { def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, - INSN_MIPS4_32R2, FGR_64; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64; def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, - INSN_MIPS4_32R2, FGR_64; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64; } // Load/store doubleword indexed unaligned. let AdditionalPredicates = [IsNotNaCl] in { def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, - INSN_MIPS5_32R2, FGR_32; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, - INSN_MIPS5_32R2, FGR_32; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; } let DecoderNamespace="Mips64" in { def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, - INSN_MIPS5_32R2, FGR_64; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64; def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, - INSN_MIPS5_32R2, FGR_64; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64; } /// Floating-point Aritmetic @@ -457,42 +478,42 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>, defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>; def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>, - MADDS_FM<4, 0>, ISA_MIPS32R2; + MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6; def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>, - MADDS_FM<5, 0>, ISA_MIPS32R2; + MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6; let AdditionalPredicates = [NoNaNsFPMath] in { def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>, - MADDS_FM<6, 0>, ISA_MIPS32R2; + MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6; def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>, - MADDS_FM<7, 0>, ISA_MIPS32R2; + MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6; } def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>, - MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>, - MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; let AdditionalPredicates = [NoNaNsFPMath] in { def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>, - MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>, - MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, 
FGR_32; } let isCodeGenOnly=1 in { def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>, - MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>, - MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; } let AdditionalPredicates = [NoNaNsFPMath], isCodeGenOnly=1 in { def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>, - MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>, - MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; } //===----------------------------------------------------------------------===// @@ -504,9 +525,9 @@ def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>, - BC1F_FM<0, 0>; + BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6; def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>, - BC1F_FM<0, 1>; + BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Floating Point Flag Conditions @@ -531,12 +552,13 @@ def MIPS_FCOND_LE : PatLeaf<(i32 14)>; def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare -def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>; +def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>, + ISA_MIPS1_NOT_32R6_64R6; def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>, - AdditionalRequires<[NotFP64bit]>; + ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[NotFP64bit]>; let DecoderNamespace = "Mips64" in def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>, - AdditionalRequires<[IsFP64bit]>; + ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[IsFP64bit]>; //===----------------------------------------------------------------------===// // Floating Point Pseudo-Instructions @@ -569,8 +591,10 @@ def ExtractElementF64_64 : ExtractElementF64Base, //===----------------------------------------------------------------------===// // InstAliases. 
//===----------------------------------------------------------------------===// -def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>; -def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>; +def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>, + ISA_MIPS1_NOT_32R6_64R6; +def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>, + ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Floating Point Patterns diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 0377eabed78d..a708e307f3f2 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -844,6 +844,34 @@ class BARRIER_FM op> : StdArch { let Inst{5-0} = 0; // SLL } +class JR_HB_FM op> : StdArch{ + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0; // SPECIAL + let Inst{25-21} = rs; + let Inst{20-11} = 0; + let Inst{10} = 1; + let Inst{9-6} = 0; + let Inst{5-0} = op; +} + +class JALR_HB_FM op> : StdArch { + bits<5> rd; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0; // SPECIAL + let Inst{25-21} = rs; + let Inst{20-16} = 0; + let Inst{15-11} = rd; + let Inst{10} = 1; + let Inst{9-6} = 0; + let Inst{5-0} = op; +} + class COP0_TLB_FM op> : StdArch { bits<32> Inst; @@ -852,3 +880,17 @@ class COP0_TLB_FM op> : StdArch { let Inst{24-6} = 0; let Inst{5-0} = op; // Operation } + +class CACHEOP_FM op> : StdArch { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-0} = offset; +} diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 78cba614d6cf..b1b455769477 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -166,6 +166,8 @@ def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; def HasMips32r6 : Predicate<"Subtarget.hasMips32r6()">, AssemblerPredicate<"FeatureMips32r6">; +def NotMips32r6 : Predicate<"!Subtarget.hasMips32r6()">, + AssemblerPredicate<"!FeatureMips32r6">; def IsGP64bit : Predicate<"Subtarget.isGP64bit()">, AssemblerPredicate<"FeatureGP64Bit">; def IsGP32bit : Predicate<"!Subtarget.isGP64bit()">, @@ -176,6 +178,8 @@ def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; def HasMips64r6 : Predicate<"Subtarget.hasMips64r6()">, AssemblerPredicate<"FeatureMips64r6">; +def NotMips64r6 : Predicate<"!Subtarget.hasMips64r6()">, + AssemblerPredicate<"!FeatureMips64r6">; def IsN64 : Predicate<"Subtarget.isABI_N64()">, AssemblerPredicate<"FeatureN64">; def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">, @@ -211,11 +215,34 @@ class GPR_64 { list GPRPredicates = [IsGP64bit]; } // They are mutually exclusive. //===----------------------------------------------------------------------===// +// FIXME: I'd prefer to use additive predicates to build the instruction sets +// but we are short on assembler feature bits at the moment. Using a +// subtractive predicate will hopefully keep us under the 32 predicate +// limit long enough to develop an alternative way to handle P1||P2 +// predicates. 
+class ISA_MIPS1_NOT_32R6_64R6 { + list InsnPredicates = [NotMips32r6, NotMips64r6]; +} class ISA_MIPS2 { list InsnPredicates = [HasMips2]; } +class ISA_MIPS2_NOT_32R6_64R6 { + list InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6]; +} class ISA_MIPS3 { list InsnPredicates = [HasMips3]; } +class ISA_MIPS3_NOT_32R6_64R6 { + list InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6]; +} class ISA_MIPS32 { list InsnPredicates = [HasMips32]; } +class ISA_MIPS32_NOT_32R6_64R6 { + list InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6]; +} class ISA_MIPS32R2 { list InsnPredicates = [HasMips32r2]; } +class ISA_MIPS32R2_NOT_32R6_64R6 { + list InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6]; +} class ISA_MIPS64 { list InsnPredicates = [HasMips64]; } +class ISA_MIPS64_NOT_64R6 { + list InsnPredicates = [HasMips64, NotMips64r6]; +} class ISA_MIPS64R2 { list InsnPredicates = [HasMips64r2]; } class ISA_MIPS32R6 { list InsnPredicates = [HasMips32r6]; } class ISA_MIPS64R6 { list InsnPredicates = [HasMips64r6]; } @@ -223,17 +250,32 @@ class ISA_MIPS64R6 { list InsnPredicates = [HasMips64r6]; } // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32 { list InsnPredicates = [HasMips3_32]; } +// The portions of MIPS-III that were also added to MIPS32 but were removed in +// MIPS32r6 and MIPS64r6. +class INSN_MIPS3_32_NOT_32R6_64R6 { + list InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6]; +} + // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32R2 { list InsnPredicates = [HasMips3_32r2]; } -// The portions of MIPS-IV that were also added to MIPS32 -class INSN_MIPS4_32 { list InsnPredicates = [HasMips4_32]; } +// The portions of MIPS-IV that were also added to MIPS32 but were removed in +// MIPS32r6 and MIPS64r6. +class INSN_MIPS4_32_NOT_32R6_64R6 { + list InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6]; +} -// The portions of MIPS-IV that were also added to MIPS32R2 -class INSN_MIPS4_32R2 { list InsnPredicates = [HasMips4_32r2]; } +// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in +// MIPS32r6 and MIPS64r6. +class INSN_MIPS4_32R2_NOT_32R6_64R6 { + list InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6]; +} -// The portions of MIPS-V that were also added to MIPS32R2 -class INSN_MIPS5_32R2 { list InsnPredicates = [HasMips5_32r2]; } +// The portions of MIPS-V that were also added to MIPS32r2 but were removed in +// MIPS32r6 and MIPS64r6. 
+class INSN_MIPS5_32R2_NOT_32R6_64R6 { + list InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6]; +} //===----------------------------------------------------------------------===// @@ -310,7 +352,9 @@ def calltarget : Operand { let ParserMatchClass = MipsJumpTargetAsmOperand; } +def simm9 : Operand; def simm10 : Operand; +def simm11 : Operand; def simm16 : Operand { let DecoderMethod= "DecodeSimm16"; @@ -319,6 +363,13 @@ def simm16 : Operand { def simm19_lsl2 : Operand { let EncoderMethod = "getSimm19Lsl2Encoding"; let DecoderMethod = "DecodeSimm19Lsl2"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + +def simm18_lsl3 : Operand { + let EncoderMethod = "getSimm18Lsl3Encoding"; + let DecoderMethod = "DecodeSimm18Lsl3"; + let ParserMatchClass = MipsJumpTargetAsmOperand; } def simm20 : Operand { @@ -368,6 +419,15 @@ def MipsMemAsmOperand : AsmOperandClass { let ParserMethod = "parseMemOperand"; } +def MipsMemSimm11AsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm11"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffset<11>"; + //let DiagnosticType = "Simm11"; +} + def MipsInvertedImmoperand : AsmOperandClass { let Name = "InvNum"; let RenderMethod = "addImmOperands"; @@ -399,6 +459,17 @@ def mem_msa : mem_generic { let EncoderMethod = "getMSAMemEncoding"; } +def mem_simm9 : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm9); + let EncoderMethod = "getMemEncoding"; +} + +def mem_simm11 : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm11); + let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm11AsmOperand; +} + def mem_ea : Operand { let PrintMethod = "printMemOperandEA"; let MIOperandInfo = (ops ptr_rc, simm16); @@ -1000,7 +1071,8 @@ def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst), def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16, add>, ADDI_FM<0x9>, IsAsCheapAsAMove; -def ADDi : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>; +def ADDi : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>, + ISA_MIPS1_NOT_32R6_64R6; def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>, SLTI_FM<0xa>; def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>, @@ -1023,7 +1095,7 @@ def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>, ADD_FM<0, 0x23>; let Defs = [HI0, LO0] in def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>, - ADD_FM<0x1c, 2>, ISA_MIPS32; + ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6; def ADD : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>; def SUB : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>; def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>; @@ -1074,13 +1146,17 @@ def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>; /// load/store left/right let EncodingPredicates = [], // FIXME: Lack of HasStdEnc is probably a bug AdditionalPredicates = [NotInMicroMips] in { -def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>; -def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>; -def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>; -def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>; +def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>, + ISA_MIPS1_NOT_32R6_64R6; +def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>, + ISA_MIPS1_NOT_32R6_64R6; +def SWL : 
StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>, + ISA_MIPS1_NOT_32R6_64R6; +def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>, + ISA_MIPS1_NOT_32R6_64R6; } -def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM; +def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>; def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>; def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>; @@ -1088,12 +1164,18 @@ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>; def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>; def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>; -def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>, ISA_MIPS2; -def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>, ISA_MIPS2; -def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>, ISA_MIPS2; -def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>, ISA_MIPS2; -def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>, ISA_MIPS2; -def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>, ISA_MIPS2; +def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>, + ISA_MIPS2_NOT_32R6_64R6; +def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>, + ISA_MIPS2_NOT_32R6_64R6; +def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>, + ISA_MIPS2_NOT_32R6_64R6; +def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>, + ISA_MIPS2_NOT_32R6_64R6; +def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>, + ISA_MIPS2_NOT_32R6_64R6; +def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>, + ISA_MIPS2_NOT_32R6_64R6; def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>; def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>; @@ -1110,8 +1192,8 @@ let EncodingPredicates = [], // FIXME: Lack of HasStdEnc is probably def WAIT : WAIT_FT<"wait">, WAIT_FM; /// Load-linked, Store-conditional -def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2; -def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2; +def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2_NOT_32R6_64R6; +def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2_NOT_32R6_64R6; } /// Jump and Branch Instructions @@ -1132,12 +1214,16 @@ def B : UncondBranch; def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>; let AdditionalPredicates = [NotInMicroMips] in { -def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM; -def JALRPseudo : JumpLinkRegPseudo; + def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM; + def JALRPseudo : JumpLinkRegPseudo; } -def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>; -def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>; -def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>; + +// FIXME: JALX really requires either MIPS16 or microMIPS in addition to MIPS32. +def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>, ISA_MIPS32_NOT_32R6_64R6; +def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>, + ISA_MIPS1_NOT_32R6_64R6; +def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>, + ISA_MIPS1_NOT_32R6_64R6; def BAL_BR : BAL_BR_Pseudo; def TAILCALL : TailCall; def TAILCALL_R : TailCallReg; @@ -1167,20 +1253,24 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in { /// Multiply and Divide Instructions. 
def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x18>; + MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6; def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x19>; + MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6; def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x1a>; + MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6; def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x1b>; + MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6; -def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>; -def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>; +def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>, + ISA_MIPS1_NOT_32R6_64R6; +def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>, + ISA_MIPS1_NOT_32R6_64R6; let EncodingPredicates = [], // FIXME: Lack of HasStdEnc is probably a bug AdditionalPredicates = [NotInMicroMips] in { -def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>; -def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>; +def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>, + ISA_MIPS1_NOT_32R6_64R6; +def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>, + ISA_MIPS1_NOT_32R6_64R6; } /// Sign Ext In Register Instructions. @@ -1190,8 +1280,10 @@ def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM<0x18, 0x20>, ISA_MIPS32R2; /// Count Leading -def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, ISA_MIPS32; -def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32; +def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, + ISA_MIPS32_NOT_32R6_64R6; +def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, + ISA_MIPS32_NOT_32R6_64R6; /// Word Swap Bytes Within Halfwords def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2; @@ -1206,27 +1298,37 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>; // MADD*/MSUB* -def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, ISA_MIPS32; -def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, ISA_MIPS32; -def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, ISA_MIPS32; -def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, ISA_MIPS32; +def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, + ISA_MIPS32_NOT_32R6_64R6; +def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, + ISA_MIPS32_NOT_32R6_64R6; +def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, + ISA_MIPS32_NOT_32R6_64R6; +def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, + ISA_MIPS32_NOT_32R6_64R6; let AdditionalPredicates = [NotDSP] in { -def PseudoMULT : MultDivPseudo; -def PseudoMULTu : MultDivPseudo; -def PseudoMFHI : PseudoMFLOHI; -def PseudoMFLO : PseudoMFLOHI; -def PseudoMTLOHI : PseudoMTLOHI; -def PseudoMADD : MAddSubPseudo; -def PseudoMADDU : MAddSubPseudo; -def PseudoMSUB : MAddSubPseudo; -def PseudoMSUBU : MAddSubPseudo; +def PseudoMULT : MultDivPseudo, + ISA_MIPS1_NOT_32R6_64R6; +def PseudoMULTu : MultDivPseudo, + ISA_MIPS1_NOT_32R6_64R6; +def PseudoMFHI : PseudoMFLOHI, ISA_MIPS1_NOT_32R6_64R6; +def PseudoMFLO : PseudoMFLOHI, ISA_MIPS1_NOT_32R6_64R6; +def PseudoMTLOHI : PseudoMTLOHI, ISA_MIPS1_NOT_32R6_64R6; +def PseudoMADD : MAddSubPseudo, + ISA_MIPS32_NOT_32R6_64R6; +def PseudoMADDU : 
MAddSubPseudo, + ISA_MIPS32_NOT_32R6_64R6; +def PseudoMSUB : MAddSubPseudo, + ISA_MIPS32_NOT_32R6_64R6; +def PseudoMSUBU : MAddSubPseudo, + ISA_MIPS32_NOT_32R6_64R6; } def PseudoSDIV : MultDivPseudo; + 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def PseudoUDIV : MultDivPseudo; + 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def RDHWR : ReadHardware, RDHWR_FM; @@ -1245,6 +1347,46 @@ def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>; def EHB : Barrier<"ehb">, BARRIER_FM<3>; def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; +// JR_HB and JALR_HB are defined here using the new style naming +// scheme because some of this code is shared with Mips32r6InstrInfo.td +// and because of that it doesn't follow the naming convention of the +// rest of the file. To avoid a mixture of old vs new style, the new +// style was chosen. +class JR_HB_DESC_BASE { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rs"); + list Pattern = []; +} + +class JALR_HB_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); + list Pattern = []; +} + +class JR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>, + JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> { + let isBranch=1; + let isIndirectBranch=1; + let hasDelaySlot=1; + let isTerminator=1; + let isBarrier=1; +} + +class JALR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>, + JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> { + let isIndirectBranch=1; + let hasDelaySlot=1; +} + +class JR_HB_ENC : JR_HB_FM<8>; +class JALR_HB_ENC : JALR_HB_FM<9>; + +def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6; +def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32; + class TLB : InstSE<(outs), (ins), asmstr, [], NoItinerary, FrmOther>; def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>; @@ -1252,6 +1394,15 @@ def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>; def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>; def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>; +class CacheOp : + InstSE<(outs), (ins MemOpnd:$addr, uimm5:$hint), + !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther>; + +def CACHE : CacheOp<"cache", mem, GPR32Opnd>, CACHEOP_FM<0b101111>, + INSN_MIPS3_32_NOT_32R6_64R6; +def PREF : CacheOp<"pref", mem, GPR32Opnd>, CACHEOP_FM<0b110011>, + INSN_MIPS3_32_NOT_32R6_64R6; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// @@ -1260,19 +1411,23 @@ def : MipsInstAlias<"move $dst, $src", GPR_32 { let AdditionalPredicates = [NotInMicroMips]; } -def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>; +def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>, + ISA_MIPS1_NOT_32R6_64R6; def : MipsInstAlias<"addu $rs, $rt, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; def : MipsInstAlias<"add $rs, $rt, $imm", (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; def : MipsInstAlias<"and $rs, $rt, $imm", (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; +def : MipsInstAlias<"and $rs, $imm", + (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>; def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>; let Predicates = [NotInMicroMips] in { def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>; } def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>; def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : 
MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32; def : MipsInstAlias<"not $rt, $rs", (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>; def : MipsInstAlias<"neg $rt, $rs", @@ -1289,6 +1444,8 @@ def : MipsInstAlias<"xor $rs, $rt, $imm", (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; def : MipsInstAlias<"or $rs, $rt, $imm", (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; +def : MipsInstAlias<"or $rs, $imm", + (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>; def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>; def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; @@ -1331,6 +1488,8 @@ def : MipsInstAlias<"sra $rd, $rt, $rs", (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; def : MipsInstAlias<"srl $rd, $rt, $rs", (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"sync", + (SYNC 0), 1>, ISA_MIPS2; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1383,6 +1542,10 @@ let AdditionalPredicates = [NotDSP] in { (ADDiu GPR32:$src, imm:$imm)>; } +// SYNC +def : MipsPat<(MipsSync (i32 immz)), + (SYNC 0)>, ISA_MIPS2; + // Call def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), (JAL tglobaladdr:$dst)>; diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index a8fd39176eb6..c6838a37be28 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -15,6 +15,7 @@ #include "Mips.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "MCTargetDesc/MipsMCNaCl.h" #include "MipsTargetMachine.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -64,7 +65,8 @@ namespace { : MachineFunctionPass(ID), TM(tm), IsPIC(TM.getRelocationModel() == Reloc::PIC_), ABI(TM.getSubtarget().getTargetABI()), - LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {} + LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 : + (!TM.getSubtarget().isTargetNaCl() ? 9 : 10))) {} const char *getPassName() const override { return "Mips Long Branch"; @@ -264,6 +266,13 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { LongBrMBB->addSuccessor(BalTgtMBB); BalTgtMBB->addSuccessor(TgtMBB); + // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal + // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an + // pseudo-instruction wrapping BGEZAL). + + const MipsSubtarget &Subtarget = TM.getSubtarget(); + unsigned BalOp = Subtarget.hasMips32r6() ? 
Mips::BAL : Mips::BAL_BR; + if (ABI != MipsSubtarget::N64) { // $longbr: // addiu $sp, $sp, -8 @@ -305,9 +314,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT) .addMBB(TgtMBB).addMBB(BalTgtMBB); MIBundleBuilder(*LongBrMBB, Pos) - .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB)) - .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT) - .addReg(Mips::AT).addMBB(TgtMBB).addMBB(BalTgtMBB)); + .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB)) + .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT) + .addReg(Mips::AT) + .addMBB(TgtMBB) + .addMBB(BalTgtMBB)); Pos = BalTgtMBB->begin(); @@ -316,18 +327,28 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA) .addReg(Mips::SP).addImm(0); - MIBundleBuilder(*BalTgtMBB, Pos) - .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT)) - .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP) - .addReg(Mips::SP).addImm(8)); + if (!TM.getSubtarget().isTargetNaCl()) { + MIBundleBuilder(*BalTgtMBB, Pos) + .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT)) + .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP).addImm(8)); + } else { + // In NaCl, modifying the sp is not allowed in branch delay slot. + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP).addImm(8); + + MIBundleBuilder(*BalTgtMBB, Pos) + .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT)) + .append(BuildMI(*MF, DL, TII->get(Mips::NOP))); + + // Bundle-align the target of indirect branch JR. + TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN); + } } else { // $longbr: // daddiu $sp, $sp, -16 // sd $ra, 0($sp) - // lui64 $at, %highest($tgt - $baltgt) - // daddiu $at, $at, %higher($tgt - $baltgt) - // dsll $at, $at, 16 - // daddiu $at, $at, %hi($tgt - $baltgt) + // daddiu $at, $zero, %hi($tgt - $baltgt) // dsll $at, $at, 16 // bal $baltgt // daddiu $at, $at, %lo($tgt - $baltgt) @@ -339,10 +360,20 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { // $fallthrough: // - // TODO: %highest and %higher can have non-zero values only when the - // offset is greater than 4GB, which is highly unlikely. Replace - // them (and the following instructon that shifts $at by 16) with the - // instruction that sets $at to zero. + // We assume the branch is within-function, and that offset is within + // +/- 2GB. High 32 bits will therefore always be zero. + + // Note that this will work even if the offset is negative, because + // of the +1 modification that's added in that case. For example, if the + // offset is -1MB (0xFFFFFFFFFFF00000), the computation for %higher is + // + // 0xFFFFFFFFFFF00000 + 0x80008000 = 0x000000007FF08000 + // + // and the bits [47:32] are zero. For %highest + // + // 0xFFFFFFFFFFF00000 + 0x800080008000 = 0x000080007FF08000 + // + // and the bits [63:48] are zero. 
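
The shortened N64 sequence above materializes $tgt-$baltgt from a single %hi/%lo pair. As a sanity check of the arithmetic the comment describes, here is a small standalone sketch (helper name and test values invented, not part of the patch) modelling the value the daddiu/dsll/daddiu triple leaves in $at:

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  // daddiu $at, $zero, %hi(Offset); dsll $at, $at, 16; daddiu $at, $at, %lo(Offset)
  static int64_t materializeOffset(int64_t Offset) {
    int64_t Lo = static_cast<int16_t>(Offset & 0xffff);          // %lo, sign-extended by daddiu
    int64_t Hi = static_cast<int16_t>((Offset - Lo) / 0x10000);  // %hi, compensates for %lo's sign
    return Hi * 0x10000 + Lo;                                    // exact for offsets within ~+/-2GB
  }

  int main() {
    for (int64_t Off : {0x12345678LL, -0x100000LL, 0x7ffe8000LL})
      assert(materializeOffset(Off) == Off);
  }
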
Pos = LongBrMBB->begin(); @@ -350,25 +381,19 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP_64).addImm(-16); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64) .addReg(Mips::SP_64).addImm(0); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi64), - Mips::AT_64).addMBB(TgtMBB).addMBB(BalTgtMBB); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu), - Mips::AT_64).addReg(Mips::AT_64).addMBB(TgtMBB, MipsII::MO_HIGHER) - .addMBB(BalTgtMBB); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64) - .addReg(Mips::AT_64).addImm(16); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu), - Mips::AT_64).addReg(Mips::AT_64).addMBB(TgtMBB, MipsII::MO_ABS_HI) - .addMBB(BalTgtMBB); + Mips::AT_64).addReg(Mips::ZERO_64) + .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64) .addReg(Mips::AT_64).addImm(16); MIBundleBuilder(*LongBrMBB, Pos) - .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB)) - .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), - Mips::AT_64).addReg(Mips::AT_64) - .addMBB(TgtMBB, MipsII::MO_ABS_LO) - .addMBB(BalTgtMBB)); + .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB)) + .append( + BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64) + .addReg(Mips::AT_64) + .addMBB(TgtMBB, MipsII::MO_ABS_LO) + .addMBB(BalTgtMBB)); Pos = BalTgtMBB->begin(); @@ -450,9 +475,18 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { continue; int ShVal = TM.getSubtarget().inMicroMipsMode() ? 2 : 4; + int64_t Offset = computeOffset(I->Br) / ShVal; + + if (TM.getSubtarget().isTargetNaCl()) { + // The offset calculation does not include sandboxing instructions + // that will be added later in the MC layer. Since at this point we + // don't know the exact amount of code that "sandboxing" will add, we + // conservatively estimate that code will not grow more than 100%. + Offset *= 2; + } // Check if offset fits into 16-bit immediate field of branches. - if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / ShVal)) + if (!ForceLongBranch && isInt<16>(Offset)) continue; I->HasLongBranch = true; diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 85f786746611..821392e1d45f 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -162,16 +162,16 @@ MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1, } void MipsMCInstLower:: -lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI, int Opcode, - MCSymbolRefExpr::VariantKind Kind) const { - OutMI.setOpcode(Opcode); +lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(Mips::LUi); // Lower register operand. OutMI.addOperand(LowerOperand(MI->getOperand(0))); - // Create %hi($tgt-$baltgt) or %highest($tgt-$baltgt). + // Create %hi($tgt-$baltgt). OutMI.addOperand(createSub(MI->getOperand(1).getMBB(), - MI->getOperand(2).getMBB(), Kind)); + MI->getOperand(2).getMBB(), + MCSymbolRefExpr::VK_Mips_ABS_HI)); } void MipsMCInstLower:: @@ -185,7 +185,7 @@ lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode, OutMI.addOperand(LowerOperand(MO)); } - // Create %lo($tgt-$baltgt), %hi($tgt-$baltgt) or %higher($tgt-$baltgt). + // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt). 
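
With %higher/%highest gone, the long-branch lowering above only ever has to map two operand target flags onto relocation variant kinds. A reduced sketch of that dispatch (the enums here are invented stand-ins for MipsII::MO_* and MCSymbolRefExpr::VK_Mips_*):

  enum OperandFlag { MO_ABS_HI, MO_ABS_LO };
  enum VariantKind { VK_Mips_ABS_HI, VK_Mips_ABS_LO, VK_None };

  // Mirrors the trimmed LONG_BRANCH_DADDiu handling: every surviving case is
  // either the %hi or the %lo half of $tgt-$baltgt.
  VariantKind variantForFlag(OperandFlag Flag) {
    switch (Flag) {
    case MO_ABS_HI: return VK_Mips_ABS_HI;
    case MO_ABS_LO: return VK_Mips_ABS_LO;
    }
    return VK_None;
  }
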
OutMI.addOperand(createSub(MI->getOperand(2).getMBB(), MI->getOperand(3).getMBB(), Kind)); } @@ -196,11 +196,7 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI, default: return false; case Mips::LONG_BRANCH_LUi: - lowerLongBranchLUi(MI, OutMI, Mips::LUi, MCSymbolRefExpr::VK_Mips_ABS_HI); - return true; - case Mips::LONG_BRANCH_LUi64: - lowerLongBranchLUi(MI, OutMI, Mips::LUi64, - MCSymbolRefExpr::VK_Mips_HIGHEST); + lowerLongBranchLUi(MI, OutMI); return true; case Mips::LONG_BRANCH_ADDiu: lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, @@ -208,10 +204,7 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI, return true; case Mips::LONG_BRANCH_DADDiu: unsigned TargetFlags = MI->getOperand(2).getTargetFlags(); - if (TargetFlags == MipsII::MO_HIGHER) - lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, - MCSymbolRefExpr::VK_Mips_HIGHER); - else if (TargetFlags == MipsII::MO_ABS_HI) + if (TargetFlags == MipsII::MO_ABS_HI) lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MCSymbolRefExpr::VK_Mips_ABS_HI); else if (TargetFlags == MipsII::MO_ABS_LO) diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index 6971f49ff233..269190ffc065 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -39,8 +39,7 @@ class LLVM_LIBRARY_VISIBILITY MipsMCInstLower { MachineOperandType MOTy, unsigned Offset) const; MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2, MCSymbolRefExpr::VariantKind Kind) const; - void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI, - int Opcode, MCSymbolRefExpr::VariantKind Kind) const; + void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const; void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode, MCSymbolRefExpr::VariantKind Kind) const; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 875a596eb4aa..b5897af52cdc 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -348,6 +348,10 @@ def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>, def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>, Unallocatable; +// MIPS32r6/MIPS64r6 store FPU condition codes in normal FGR registers. +// This class allows us to represent this in codegen patterns. +def FGRCC : RegisterClass<"Mips", [i32], 32, (sequence "F%u", 0, 31)>; + def MSA128B: RegisterClass<"Mips", [v16i8], 128, (sequence "W%u", 0, 31)>; def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128, @@ -512,6 +516,12 @@ def FGR32Opnd : RegisterOperand { let ParserMatchClass = FGR32AsmOperand; } +def FGRCCOpnd : RegisterOperand { + // The assembler doesn't use register classes so we can re-use + // FGR32AsmOperand. 
+ let ParserMatchClass = FGR32AsmOperand; +} + def FGRH32Opnd : RegisterOperand { let ParserMatchClass = FGRH32AsmOperand; } diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index eb9a819aa3cb..cc7ed714beb5 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -152,6 +152,76 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::STORE, MVT::f64, Custom); } + if (Subtarget->hasMips32r6()) { + // MIPS32r6 replaces the accumulator-based multiplies with a three register + // instruction + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::MUL, MVT::i32, Legal); + setOperationAction(ISD::MULHS, MVT::i32, Legal); + setOperationAction(ISD::MULHU, MVT::i32, Legal); + + // MIPS32r6 replaces the accumulator-based division/remainder with separate + // three register division and remainder instructions. + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, Legal); + setOperationAction(ISD::UDIV, MVT::i32, Legal); + setOperationAction(ISD::SREM, MVT::i32, Legal); + setOperationAction(ISD::UREM, MVT::i32, Legal); + + // MIPS32r6 replaces conditional moves with an equivalent that removes the + // need for three GPR read ports. + setOperationAction(ISD::SETCC, MVT::i32, Legal); + setOperationAction(ISD::SELECT, MVT::i32, Legal); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + + setOperationAction(ISD::SETCC, MVT::f32, Legal); + setOperationAction(ISD::SELECT, MVT::f32, Legal); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + + assert(Subtarget->isFP64bit() && "FR=1 is required for MIPS32r6"); + setOperationAction(ISD::SETCC, MVT::f64, Legal); + setOperationAction(ISD::SELECT, MVT::f64, Legal); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + + setOperationAction(ISD::BRCOND, MVT::Other, Legal); + + // Floating point > and >= are supported via < and <= + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETOGT, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); + } + + if (Subtarget->hasMips64r6()) { + // MIPS64r6 replaces the accumulator-based multiplies with a three register + // instruction + setOperationAction(ISD::MUL, MVT::i64, Legal); + setOperationAction(ISD::MULHS, MVT::i64, Legal); + setOperationAction(ISD::MULHU, MVT::i64, Legal); + + // MIPS32r6 replaces the accumulator-based division/remainder with separate + // three register division and remainder instructions. + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SDIV, MVT::i64, Legal); + setOperationAction(ISD::UDIV, MVT::i64, Legal); + setOperationAction(ISD::SREM, MVT::i64, Legal); + setOperationAction(ISD::UREM, MVT::i64, Legal); + + // MIPS64r6 replaces conditional moves with an equivalent that removes the + // need for three GPR read ports. 
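
The SETOGT/SETOGE/SETUGT/SETUGE expansions a few lines up lean on the usual operand-swap identity, since MIPS32r6 keeps only the less-than/less-equal family of FP compares. A tiny plain-C++ illustration of the identity the legalizer gets to use (not LLVM code; ordered compares only):

  #include <cassert>
  #include <cmath>

  // "a > b" (ordered) is exactly "b < a", and likewise for >=; both sides are
  // false whenever a NaN is involved, so the swap is safe for every input.
  bool ogt(float a, float b) { return b < a; }
  bool oge(float a, float b) { return b <= a; }

  int main() {
    float nan = std::nanf("");
    assert(ogt(2.0f, 1.0f) && !ogt(1.0f, 2.0f) && !ogt(nan, 1.0f));
    assert(oge(2.0f, 2.0f) && !oge(1.0f, 2.0f) && !oge(1.0f, nan));
  }
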
+ setOperationAction(ISD::SETCC, MVT::i64, Legal); + setOperationAction(ISD::SELECT, MVT::i64, Legal); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + } + computeRegisterProperties(); } @@ -254,6 +324,16 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy; + if (Subtarget->systemSupportsUnalignedAccess()) { + // MIPS32r6/MIPS64r6 is required to support unaligned access. It's + // implementation defined whether this is handled by hardware, software, or + // a hybrid of the two but it's expected that most implementations will + // handle the majority of cases in hardware. + if (Fast) + *Fast = true; + return true; + } + switch (SVT) { case MVT::i64: case MVT::i32: @@ -439,8 +519,8 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize()) return SDValue(); - if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 && - selectMADD(N, &DAG)) + if (Subtarget->hasMips32() && !Subtarget->hasMips32r6() && + N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) return SDValue(N, 0); return SDValue(); @@ -1168,6 +1248,9 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi, SelectionDAG &DAG) const { + // MIPS32r6/MIPS64r6 removed accumulator based multiplies. + assert(!Subtarget->hasMips32r6()); + EVT Ty = Op.getOperand(0).getValueType(); SDLoc DL(Op); SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped, diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index f6f364f1a361..e82c8cff7225 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -542,20 +542,31 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1); DebugLoc dl = I->getDebugLoc(); const TargetRegisterInfo &TRI = getRegisterInfo(); + bool HasMTHC1 = TM.getSubtarget().hasMips32r2() || + TM.getSubtarget().hasMips32r6(); - // For FP32 mode: - // mtc1 Lo, $fp - // mtc1 Hi, $fp + 1 - // For FP64 mode: + // When mthc1 is available, use: // mtc1 Lo, $fp // mthc1 Hi, $fp + // + // Otherwise, for FP64: + // spill + reload via ldc1 + // This has not been implemented since FP64 on MIPS32 and earlier is not + // supported. + // + // Otherwise, for FP32: + // mtc1 Lo, $fp + // mtc1 Hi, $fp + 1 BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo)) .addReg(LoReg); - if (FP64) { - // FIXME: The .addReg(DstReg, RegState::Implicit) is a white lie used to - // temporarily work around a widespread bug in the -mfp64 support. + if (HasMTHC1 || FP64) { + assert(TM.getSubtarget().hasMips32r2() && + "MTHC1 requires MIPS32r2"); + + // FIXME: The .addReg(DstReg) is a white lie used to temporarily work + // around a widespread bug in the -mfp64 support. // The problem is that none of the 32-bit fpu ops mention the fact // that they clobber the upper 32-bits of the 64-bit FPR. Fixing that // requires a major overhaul of the FPU implementation which can't @@ -565,9 +576,9 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, // We therefore pretend that it reads the bottom 32-bits to // artificially create a dependency and prevent the scheduler // changing the behaviour of the code. - BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi)) - .addReg(HiReg) - .addReg(DstReg, RegState::Implicit); + BuildMI(MBB, I, dl, get(FP64 ? 
Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg) + .addReg(DstReg) + .addReg(HiReg); } else BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi)) .addReg(HiReg); diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp index 0d4398e2fe95..c8e995a991da 100644 --- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp +++ b/lib/Target/Mips/MipsSelectionDAGInfo.cpp @@ -17,8 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-selectiondag-info" MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} + : TargetSelectionDAGInfo(TM.getDataLayout()) {} MipsSelectionDAGInfo::~MipsSelectionDAGInfo() { } diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index d57e67831457..bff656ff1257 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -158,13 +158,21 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool hasMips3() const { return MipsArchVersion >= Mips3; } bool hasMips4_32() const { return HasMips4_32; } bool hasMips4_32r2() const { return HasMips4_32r2; } - bool hasMips32() const { return MipsArchVersion >= Mips32; } - bool hasMips32r2() const { return MipsArchVersion == Mips32r2 || - MipsArchVersion == Mips64r2; } - bool hasMips32r6() const { return MipsArchVersion == Mips32r6 || - MipsArchVersion == Mips64r6; } + bool hasMips32() const { + return MipsArchVersion >= Mips32 && MipsArchVersion != Mips3 && + MipsArchVersion != Mips4 && MipsArchVersion != Mips5; + } + bool hasMips32r2() const { + return MipsArchVersion == Mips32r2 || MipsArchVersion == Mips32r6 || + MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6; + } + bool hasMips32r6() const { + return MipsArchVersion == Mips32r6 || MipsArchVersion == Mips64r6; + } bool hasMips64() const { return MipsArchVersion >= Mips64; } - bool hasMips64r2() const { return MipsArchVersion == Mips64r2; } + bool hasMips64r2() const { + return MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6; + } bool hasMips64r6() const { return MipsArchVersion == Mips64r6; } bool hasCnMips() const { return HasCnMips; } @@ -234,7 +242,12 @@ class MipsSubtarget : public MipsGenSubtargetInfo { /// \brief Reset the subtarget for the Mips target. void resetSubtarget(MachineFunction *MF); - + /// Does the system support unaligned memory access. + /// + /// MIPS32r6/MIPS64r6 require full unaligned access support but does not + /// specify which component of the system provides it. Hardware, software, and + /// hybrid implementations are all valid. 
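
The reworked MipsSubtarget predicates above encode an implication lattice rather than one-to-one equality checks. A standalone sketch of the intended subsumption (the enum is invented for illustration and the sets are spelled out as explicit membership tests; the header expresses the same sets with ordered comparisons):

  enum Arch { Mips1, Mips2, Mips3, Mips4, Mips5,
              Mips32, Mips32r2, Mips32r6, Mips64, Mips64r2, Mips64r6 };

  bool hasMips32(Arch A) {         // the 64-bit MIPS III/IV/V ISAs stay excluded
    return A == Mips32 || A == Mips32r2 || A == Mips32r6 ||
           A == Mips64 || A == Mips64r2 || A == Mips64r6;
  }
  bool hasMips32r2(Arch A) {       // r6 now implies r2, on both widths
    return A == Mips32r2 || A == Mips32r6 || A == Mips64r2 || A == Mips64r6;
  }
  bool hasMips32r6(Arch A) { return A == Mips32r6 || A == Mips64r6; }
  bool hasMips64r2(Arch A) { return A == Mips64r2 || A == Mips64r6; }
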
+ bool systemSupportsUnalignedAccess() const { return hasMips32r6(); } }; } // End llvm namespace diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 4ec575ff34c9..195b3c0fe9ed 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1356,7 +1356,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Skip meta data if (GVar->hasSection()) { - if (GVar->getSection() == "llvm.metadata") + if (GVar->getSection() == StringRef("llvm.metadata")) return; } @@ -1438,7 +1438,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "linear"; break; case 2: - assert(0 && "Anisotropic filtering is not supported"); + llvm_unreachable("Anisotropic filtering is not supported"); default: O << "nearest"; break; @@ -1562,7 +1562,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } } @@ -1682,7 +1682,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "]"; break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } return; } diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index c3d9f120e996..f3a095d829e4 100644 --- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -63,7 +63,7 @@ class NVPTXFavorNonGenericAddrSpaces : public FunctionPass { static char ID; NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {} - virtual bool runOnFunction(Function &F) override; + bool runOnFunction(Function &F) override; /// Optimizes load/store instructions. Idx is the index of the pointer operand /// (0 for load, and 1 for store). Returns true if it changes anything. diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index b0943befa88f..0e3e0d50adee 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -130,7 +130,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); // Operations not directly supported by NVPTX. - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i1, Expand); diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index 397f4bccfe2b..a98fb37f6e25 100644 --- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -146,7 +146,7 @@ bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) { void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) { // We implement "poor man's DCE" here to make sure any code that is no longer // live is actually unreachable and can be trivially eliminated by the - // unreachable block elimiation pass. + // unreachable block elimination pass. 
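
The assert(0 && ...) to llvm_unreachable(...) conversions in the NVPTX printer above are more than a style fix: an assert compiles away under NDEBUG and control simply falls into the next case, while llvm_unreachable still reports in asserts builds and marks the path dead for the optimizer in release builds. A rough standalone equivalent of what the macro amounts to (simplified; the real definition is in llvm/Support/ErrorHandling.h):

  #include <cstdio>
  #include <cstdlib>

  [[noreturn]] inline void unreachable_sketch(const char *Msg) {
  #ifndef NDEBUG
    std::fprintf(stderr, "UNREACHABLE executed: %s\n", Msg);
    std::abort();                // asserts build: report loudly and die
  #else
    __builtin_unreachable();     // release build: pure optimizer hint (GCC/Clang)
  #endif
  }
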
for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ++UI) { if (BranchInst *BI = dyn_cast(*UI)) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 26a4f840520e..30583b0ff8c1 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -83,7 +83,7 @@ NVPTXTargetMachine::NVPTXTargetMachine( CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), + InstrInfo(*this), TLInfo(*this), TSInfo(&DL), FrameLowering( *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { initAsmInfo(); diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h index 45cc0b8b67f2..02c5a94c3d03 100644 --- a/lib/Target/NVPTX/cl_common_defines.h +++ b/lib/Target/NVPTX/cl_common_defines.h @@ -1,5 +1,5 @@ -#ifndef __CL_COMMON_DEFINES_H__ -#define __CL_COMMON_DEFINES_H__ +#ifndef CL_COMMON_DEFINES_H +#define CL_COMMON_DEFINES_H // This file includes defines that are common to both kernel code and // the NVPTX back-end. @@ -119,4 +119,4 @@ typedef enum clk_sampler_type { #define CLK_LOCAL_MEM_FENCE (1 << 0) #define CLK_GLOBAL_MEM_FENCE (1 << 1) -#endif // __CL_COMMON_DEFINES_H__ +#endif // CL_COMMON_DEFINES_H diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 3ac037dc0fc4..2f562ca7891a 100644 --- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -238,7 +238,7 @@ class PPCAsmParser : public MCTargetAsmParser { bool ParseExpression(const MCExpr *&EVal); bool ParseDarwinExpression(const MCExpr *&EVal); - bool ParseOperand(SmallVectorImpl &Operands); + bool ParseOperand(OperandVector &Operands); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveTC(unsigned Size, SMLoc L); @@ -246,12 +246,11 @@ class PPCAsmParser : public MCTargetAsmParser { bool ParseDarwinDirectiveMachine(SMLoc L); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; - void ProcessInstruction(MCInst &Inst, - const SmallVectorImpl &Ops); + void ProcessInstruction(MCInst &Inst, const OperandVector &Ops); /// @name Auto-generated Match Functions /// { @@ -276,13 +275,12 @@ class PPCAsmParser : public MCTargetAsmParser { setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } - bool ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; const MCExpr *applyModifierToExpr(const MCExpr *E, @@ -548,8 +546,9 @@ struct PPCOperand : public MCParsedAsmOperand { void print(raw_ostream &OS) const override; - static PPCOperand *CreateToken(StringRef Str, SMLoc S, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(Token); + static std::unique_ptr CreateToken(StringRef Str, SMLoc S, + bool IsPPC64) { + auto Op = make_unique(Token); Op->Tok.Data = Str.data(); 
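
The PPCAsmParser changes here and below move the PPCOperand factories from raw pointers to std::unique_ptr, so the operand list owns its elements and no error path can leak them. A reduced sketch of the ownership pattern (types are invented stand-ins; the real list is an OperandVector of MCParsedAsmOperand, and the patch goes through LLVM's make_unique helper):

  #include <memory>
  #include <vector>

  struct ParsedOperand { int Kind = 0; long Imm = 0; };   // stand-in for MCParsedAsmOperand
  using OperandList = std::vector<std::unique_ptr<ParsedOperand>>;

  static std::unique_ptr<ParsedOperand> createImm(long Val) {
    auto Op = std::make_unique<ParsedOperand>();
    Op->Imm = Val;
    return Op;                            // caller receives sole ownership
  }

  static void parseImmediate(OperandList &Operands, long Val) {
    Operands.push_back(createImm(Val));   // ownership moves into the list;
  }                                       // nothing to delete on any path
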
Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -558,22 +557,27 @@ struct PPCOperand : public MCParsedAsmOperand { return Op; } - static PPCOperand *CreateTokenWithStringCopy(StringRef Str, SMLoc S, - bool IsPPC64) { + static std::unique_ptr + CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) { // Allocate extra memory for the string and copy it. + // FIXME: This is incorrect, Operands are owned by unique_ptr with a default + // deleter which will destroy them by simply using "delete", not correctly + // calling operator delete on this extra memory after calling the dtor + // explicitly. void *Mem = ::operator new(sizeof(PPCOperand) + Str.size()); - PPCOperand *Op = new (Mem) PPCOperand(Token); - Op->Tok.Data = (const char *)(Op + 1); + std::unique_ptr Op(new (Mem) PPCOperand(Token)); + Op->Tok.Data = (const char *)(Op.get() + 1); Op->Tok.Length = Str.size(); - std::memcpy((char *)(Op + 1), Str.data(), Str.size()); + std::memcpy((void *)Op->Tok.Data, Str.data(), Str.size()); Op->StartLoc = S; Op->EndLoc = S; Op->IsPPC64 = IsPPC64; return Op; } - static PPCOperand *CreateImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(Immediate); + static std::unique_ptr CreateImm(int64_t Val, SMLoc S, SMLoc E, + bool IsPPC64) { + auto Op = make_unique(Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -581,9 +585,9 @@ struct PPCOperand : public MCParsedAsmOperand { return Op; } - static PPCOperand *CreateExpr(const MCExpr *Val, - SMLoc S, SMLoc E, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(Expression); + static std::unique_ptr CreateExpr(const MCExpr *Val, SMLoc S, + SMLoc E, bool IsPPC64) { + auto Op = make_unique(Expression); Op->Expr.Val = Val; Op->Expr.CRVal = EvaluateCRExpr(Val); Op->StartLoc = S; @@ -592,9 +596,9 @@ struct PPCOperand : public MCParsedAsmOperand { return Op; } - static PPCOperand *CreateTLSReg(const MCSymbolRefExpr *Sym, - SMLoc S, SMLoc E, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(TLSRegister); + static std::unique_ptr + CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) { + auto Op = make_unique(TLSRegister); Op->TLSReg.Sym = Sym; Op->StartLoc = S; Op->EndLoc = E; @@ -602,8 +606,8 @@ struct PPCOperand : public MCParsedAsmOperand { return Op; } - static PPCOperand *CreateFromMCExpr(const MCExpr *Val, - SMLoc S, SMLoc E, bool IsPPC64) { + static std::unique_ptr + CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) { if (const MCConstantExpr *CE = dyn_cast(Val)) return CreateImm(CE->getValue(), S, E, IsPPC64); @@ -634,10 +638,8 @@ void PPCOperand::print(raw_ostream &OS) const { } } - -void PPCAsmParser:: -ProcessInstruction(MCInst &Inst, - const SmallVectorImpl &Operands) { +void PPCAsmParser::ProcessInstruction(MCInst &Inst, + const OperandVector &Operands) { int Opcode = Inst.getOpcode(); switch (Opcode) { case PPC::LAx: { @@ -917,11 +919,10 @@ ProcessInstruction(MCInst &Inst, } } -bool PPCAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { @@ -942,7 +943,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for 
instruction"); - ErrorLoc = ((PPCOperand*)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -1216,12 +1217,10 @@ ParseDarwinExpression(const MCExpr *&EVal) { /// ParseOperand /// This handles registers in the form 'NN', '%rNN' for ELF platforms and /// rNN for MachO. -bool PPCAsmParser:: -ParseOperand(SmallVectorImpl &Operands) { +bool PPCAsmParser::ParseOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); const MCExpr *EVal; - PPCOperand *Op; // Attempt to parse the next token as an immediate switch (getLexer().getKind()) { @@ -1233,8 +1232,7 @@ ParseOperand(SmallVectorImpl &Operands) { int64_t IntVal; if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { Parser.Lex(); // Eat the identifier token. - Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); return false; } return Error(S, "invalid register name"); @@ -1249,8 +1247,7 @@ ParseOperand(SmallVectorImpl &Operands) { int64_t IntVal; if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { Parser.Lex(); // Eat the identifier token. - Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); return false; } } @@ -1272,8 +1269,7 @@ ParseOperand(SmallVectorImpl &Operands) { } // Push the parsed operand into the list of operands - Op = PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64())); // Check whether this is a TLS call expression bool TLSCall = false; @@ -1292,8 +1288,7 @@ ParseOperand(SmallVectorImpl &Operands) { E = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'. - Op = PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64())); } // Otherwise, check for D-form memory operands @@ -1340,17 +1335,15 @@ ParseOperand(SmallVectorImpl &Operands) { E = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'. - Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); } return false; } /// Parse an instruction mnemonic followed by its operands. -bool PPCAsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) { +bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { // The first operand is the token for the instruction name. // If the next character is a '+' or '-', we need to add it to the // instruction name, to match what TableGen is doing. @@ -1554,7 +1547,7 @@ extern "C" void LLVMInitializePowerPCAsmParser() { // Define this matcher function after the auto-generated include so we // have the match class enum definitions. -unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, +unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, unsigned Kind) { // If the kind is a token for a literal immediate, check if our asm // operand matches. 
This is for InstAliases which have a fixed-value @@ -1568,8 +1561,8 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, default: return Match_InvalidOperand; } - PPCOperand *Op = static_cast(AsmOp); - if (Op->isImm() && Op->getImm() == ImmVal) + PPCOperand &Op = static_cast(AsmOp); + if (Op.isImm() && Op.getImm() == ImmVal) return Match_Success; return Match_InvalidOperand; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a4983ad67235..435a93f78c1d 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -102,17 +102,45 @@ class PPCMCCodeEmitter : public MCCodeEmitter { // Output the constant in big/little endian byte order. unsigned Size = Desc.getSize(); - if (IsLittleEndian) { - for (unsigned i = 0; i != Size; ++i) { - OS << (char)Bits; - Bits >>= 8; + switch (Size) { + case 4: + if (IsLittleEndian) { + OS << (char)(Bits); + OS << (char)(Bits >> 8); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 24); + } else { + OS << (char)(Bits >> 24); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 8); + OS << (char)(Bits); } - } else { - int ShiftValue = (Size * 8) - 8; - for (unsigned i = 0; i != Size; ++i) { - OS << (char)(Bits >> ShiftValue); - Bits <<= 8; + break; + case 8: + // If we emit a pair of instructions, the first one is + // always in the top 32 bits, even on little-endian. + if (IsLittleEndian) { + OS << (char)(Bits >> 32); + OS << (char)(Bits >> 40); + OS << (char)(Bits >> 48); + OS << (char)(Bits >> 56); + OS << (char)(Bits); + OS << (char)(Bits >> 8); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 24); + } else { + OS << (char)(Bits >> 56); + OS << (char)(Bits >> 48); + OS << (char)(Bits >> 40); + OS << (char)(Bits >> 32); + OS << (char)(Bits >> 24); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 8); + OS << (char)(Bits); } + break; + default: + llvm_unreachable ("Invalid instruction size"); } ++MCNumEmitted; // Keep track of the # of mi's emitted. diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 2174b18715f1..fd044d951fcc 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -365,8 +365,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Transform %Xd = ADDIStocHA %X2, LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); - // Change the opcode to ADDIS8. If the global address is external, - // has common linkage, is a function address, or is a jump table + // Change the opcode to ADDIS8. If the global address is external, has + // common linkage, is a non-local function address, or is a jump table // address, then generate a TOC entry and reference that. Otherwise // reference the symbol directly. TmpInst.setOpcode(PPC::ADDIS8); @@ -375,27 +375,25 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for ADDIStocHA!"); MCSymbol *MOSymbol = nullptr; bool IsExternal = false; - bool IsFunction = false; + bool IsNonLocalFunction = false; bool IsCommon = false; bool IsAvailExt = false; if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = GAlias ? 
GAlias->getAliasee() : GValue; - MOSymbol = getSymbol(RealGValue); - const GlobalVariable *GVar = dyn_cast(RealGValue); - IsExternal = GVar && !GVar->hasInitializer(); - IsCommon = GVar && RealGValue->hasCommonLinkage(); - IsFunction = !GVar; - IsAvailExt = GVar && RealGValue->hasAvailableExternallyLinkage(); + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + IsExternal = GV->isDeclaration(); + IsCommon = GV->hasCommonLinkage(); + IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker()); + IsAvailExt = GV->hasAvailableExternallyLinkage(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); else if (MO.isJTI()) MOSymbol = GetJTISymbol(MO.getIndex()); - if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI() || - TM.getCodeModel() == CodeModel::Large) + if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt || + MO.isJTI() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); const MCExpr *Exp = @@ -427,13 +425,10 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } else if (MO.isGlobal()) { const GlobalValue *GValue = MO.getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = GAlias ? GAlias->getAliasee() : GValue; - MOSymbol = getSymbol(RealGValue); - const GlobalVariable *GVar = dyn_cast(RealGValue); - - if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() || - RealGValue->hasAvailableExternallyLinkage() || + MOSymbol = getSymbol(GValue); + if (GValue->getType()->getElementType()->isFunctionTy() || + GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } @@ -457,20 +452,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); MCSymbol *MOSymbol = nullptr; bool IsExternal = false; - bool IsFunction = false; + bool IsNonLocalFunction = false; if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = GAlias ? 
GAlias->getAliasee() : GValue; - MOSymbol = getSymbol(RealGValue); - const GlobalVariable *GVar = dyn_cast(RealGValue); - IsExternal = GVar && !GVar->hasInitializer(); - IsFunction = !GVar; + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + IsExternal = GV->isDeclaration(); + IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker()); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); - if (IsFunction || IsExternal || TM.getCodeModel() == CodeModel::Large) + if (IsNonLocalFunction || IsExternal || + TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); const MCExpr *Exp = diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index c0c495fa9aec..924a07c6cff3 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -86,7 +86,7 @@ class PPCFastISel final : public FastISel { const TargetMachine &TM; const TargetInstrInfo &TII; const TargetLowering &TLI; - const PPCSubtarget &PPCSubTarget; + const PPCSubtarget *PPCSubTarget; LLVMContext *Context; public: @@ -96,9 +96,7 @@ class PPCFastISel final : public FastISel { TM(FuncInfo.MF->getTarget()), TII(*TM.getInstrInfo()), TLI(*TM.getTargetLowering()), - PPCSubTarget( - *((static_cast(&TM))->getSubtargetImpl()) - ), + PPCSubTarget(&TM.getSubtarget()), Context(&FuncInfo.Fn->getContext()) { } // Backend specific FastISel code. @@ -740,7 +738,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, return false; MVT SrcVT = SrcEVT.getSimpleVT(); - if (SrcVT == MVT::i1 && PPCSubTarget.useCRBits()) + if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits()) return false; // See if operand 2 is an immediate encodeable in the compare. @@ -901,7 +899,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, if (!IsSigned) { LoadOpc = PPC::LFIWZX; Addr.Offset = 4; - } else if (PPCSubTarget.hasLFIWAX()) { + } else if (PPCSubTarget->hasLFIWAX()) { LoadOpc = PPC::LFIWAX; Addr.Offset = 4; } @@ -942,7 +940,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { // We can only lower an unsigned convert if we have the newer // floating-point conversion operations. - if (!IsSigned && !PPCSubTarget.hasFPCVT()) + if (!IsSigned && !PPCSubTarget->hasFPCVT()) return false; // FIXME: For now we require the newer floating-point conversion operations @@ -950,7 +948,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { // to single-precision float. Otherwise we have to generate a lot of // fiddly code to avoid double rounding. If necessary, the fiddly code // can be found in PPCTargetLowering::LowerINT_TO_FP(). - if (DstVT == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT()) return false; // Extend the input if necessary. @@ -1065,7 +1063,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (IsSigned) Opc = PPC::FCTIWZ; else - Opc = PPCSubTarget.hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ; + Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ; else Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ; @@ -1860,16 +1858,9 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // FIXME: Jump tables are not yet required because fast-isel doesn't // handle switches; if that changes, we need them as well. For now, // what follows assumes everything's a generic (or TLS) global address. 
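
The ADDIStocHA/ADDItocL and fast-isel hunks around here all converge on the same question, now asked of the GlobalValue itself instead of a GlobalVariable fished out from behind an alias: is the symbol's address guaranteed to be locally resolvable, or must it be loaded from a TOC entry? A condensed sketch of that classification (struct and field names invented to keep it self-contained):

  struct GVInfo {                         // facts queried from a GlobalValue
    bool IsFunction;
    bool IsDeclaration;
    bool IsWeakForLinker;
    bool HasCommonLinkage;
    bool HasAvailableExternallyLinkage;
  };

  // True when the final address may not be link-time local, so the access has
  // to go through a TOC entry (and always for the large code model).
  bool needsTOCIndirection(const GVInfo &GV, bool LargeCodeModel) {
    bool NonLocalFunction =
        GV.IsFunction && (GV.IsDeclaration || GV.IsWeakForLinker);
    return LargeCodeModel || NonLocalFunction || GV.IsDeclaration ||
           GV.HasCommonLinkage || GV.HasAvailableExternallyLinkage;
  }
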
- const GlobalVariable *GVar = dyn_cast(GV); - if (!GVar) { - // If GV is an alias, use the aliasee for determining thread-locality. - if (const GlobalAlias *GA = dyn_cast(GV)) - GVar = dyn_cast_or_null(GA->getAliasee()); - } // FIXME: We don't yet handle the complexity of TLS. - bool IsTLS = GVar && GVar->isThreadLocal(); - if (IsTLS) + if (GV->isThreadLocal()) return 0; // For small code model, generate a simple TOC load. @@ -1879,8 +1870,8 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { .addGlobalAddress(GV) .addReg(PPC::X2); else { - // If the address is an externally defined symbol, a symbol with - // common or externally available linkage, a function address, or a + // If the address is an externally defined symbol, a symbol with common + // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code // for large code model, we generate: // LDtocL(GV, ADDIStocHA(%X2, GV)) @@ -1891,12 +1882,13 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); - // !GVar implies a function address. An external variable is one - // without an initializer. // If/when switches are implemented, jump tables should be handled // on the "if" path here. - if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() || - GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage()) + if (CModel == CodeModel::Large || + (GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker())) || + GV->isDeclaration() || GV->hasCommonLinkage() || + GV->hasAvailableExternallyLinkage()) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); else @@ -2002,7 +1994,7 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) { // If we're using CR bit registers for i1 values, handle that as a special // case first. - if (VT == MVT::i1 && PPCSubTarget.useCRBits()) { + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { const ConstantInt *CI = cast(C); unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2176,7 +2168,7 @@ unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) { // If we're using CR bit registers for i1 values, handle that as a special // case first. - if (VT == MVT::i1 && PPCSubTarget.useCRBits()) { + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Imm == 0 ? 
PPC::CRUNSET : PPC::CRSET), ImmReg); diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index e2941568f1e2..9c5e588f2fc2 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -15,6 +15,7 @@ #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -35,6 +36,191 @@ static const uint16_t VRRegNo[] = { PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31 }; +PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, + (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0), + Subtarget(STI) {} + +unsigned PPCFrameLowering::getMinCallArgumentsSize(bool isPPC64, + bool isDarwinABI) { + // For the Darwin ABI / 64-bit SVR4 ABI: + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if its varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + if (isDarwinABI || isPPC64) + return 8 * (isPPC64 ? 8 : 4); + + // 32-bit SVR4 ABI: + // There is no default stack allocated for the 8 first GPR arguments. + return 0; +} + +/// getMinCallFrameSize - Return the minimum size a call frame can be using +/// the PowerPC ABI. +unsigned PPCFrameLowering::getMinCallFrameSize(bool isPPC64, bool isDarwinABI) { + // The call frame needs to be at least big enough for linkage and 8 args. + return PPCFrameLowering::getLinkageSize(isPPC64, isDarwinABI) + + PPCFrameLowering::getMinCallArgumentsSize(isPPC64, isDarwinABI); +} + +// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. +const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( + unsigned &NumEntries) const { + if (Subtarget.isDarwinABI()) { + NumEntries = 1; + if (Subtarget.isPPC64()) { + static const SpillSlot darwin64Offsets = {PPC::X31, -8}; + return &darwin64Offsets; + } else { + static const SpillSlot darwinOffsets = {PPC::R31, -4}; + return &darwinOffsets; + } + } + + // Early exit if not using the SVR4 ABI. + if (!Subtarget.isSVR4ABI()) { + NumEntries = 0; + return nullptr; + } + + // Note that the offsets here overlap, but this is fixed up in + // processFunctionBeforeFrameFinalized. + + static const SpillSlot Offsets[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::R31, -4}, + {PPC::R30, -8}, + {PPC::R29, -12}, + {PPC::R28, -16}, + {PPC::R27, -20}, + {PPC::R26, -24}, + {PPC::R25, -28}, + {PPC::R24, -32}, + {PPC::R23, -36}, + {PPC::R22, -40}, + {PPC::R21, -44}, + {PPC::R20, -48}, + {PPC::R19, -52}, + {PPC::R18, -56}, + {PPC::R17, -60}, + {PPC::R16, -64}, + {PPC::R15, -68}, + {PPC::R14, -72}, + + // CR save area offset. 
We map each of the nonvolatile CR fields + // to the slot for CR2, which is the first of the nonvolatile CR + // fields to be assigned, so that we only allocate one save slot. + // See PPCRegisterInfo::hasReservedSpillSlot() for more information. + {PPC::CR2, -4}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + static const SpillSlot Offsets64[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::X31, -8}, + {PPC::X30, -16}, + {PPC::X29, -24}, + {PPC::X28, -32}, + {PPC::X27, -40}, + {PPC::X26, -48}, + {PPC::X25, -56}, + {PPC::X24, -64}, + {PPC::X23, -72}, + {PPC::X22, -80}, + {PPC::X21, -88}, + {PPC::X20, -96}, + {PPC::X19, -104}, + {PPC::X18, -112}, + {PPC::X17, -120}, + {PPC::X16, -128}, + {PPC::X15, -136}, + {PPC::X14, -144}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + if (Subtarget.isPPC64()) { + NumEntries = array_lengthof(Offsets64); + + return Offsets64; + } else { + NumEntries = array_lengthof(Offsets); + + return Offsets; + } +} + /// RemoveVRSaveCode - We have found that this function does not need any code /// to manipulate the VRSAVE register, even though it uses vector registers. /// This can happen when the only registers used are known to be live in or out diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 94e9b67338b4..ca1ca56d08fa 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -14,23 +14,18 @@ #define POWERPC_FRAMEINFO_H #include "PPC.h" -#include "PPCSubtarget.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { - class PPCSubtarget; +class PPCSubtarget; class PPCFrameLowering: public TargetFrameLowering { const PPCSubtarget &Subtarget; public: - PPCFrameLowering(const PPCSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, - (sti.hasQPX() || sti.isBGQ()) ? 32 : 16, 0), - Subtarget(sti) { - } + PPCFrameLowering(const PPCSubtarget &STI); unsigned determineFrameLayout(MachineFunction &MF, bool UpdateMF = true, @@ -79,6 +74,12 @@ class PPCFrameLowering: public TargetFrameLowering { return isPPC64 ? 16 : 4; } + /// getTOCSaveOffset - Return the previous frame offset to save the + /// TOC register -- 64-bit SVR4 ABI only. + static unsigned getTOCSaveOffset(void) { + return 40; + } + /// getFramePointerSaveOffset - Return the previous frame offset to save the /// frame pointer. 
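
The call-frame helpers being moved out of this header compose in a simple way. A sketch with the linkage-area size passed in rather than hard-coded (the concrete linkage sizes are not shown in this hunk, so the 48- and 8-byte figures in the example are assumptions about the 64-bit and 32-bit SVR4 ABIs):

  unsigned minCallArgumentsSize(bool IsPPC64, bool IsDarwinABI) {
    if (IsDarwinABI || IsPPC64)
      return 8 * (IsPPC64 ? 8 : 4);   // room for the callee to home the 8 GPR args
    return 0;                         // 32-bit SVR4: no default GPR home area
  }

  unsigned minCallFrameSize(unsigned LinkageSize, bool IsPPC64, bool IsDarwinABI) {
    return LinkageSize + minCallArgumentsSize(IsPPC64, IsDarwinABI);
  }

  // e.g. minCallFrameSize(48, /*IsPPC64=*/true,  /*IsDarwinABI=*/false) == 112
  //      minCallFrameSize( 8, /*IsPPC64=*/false, /*IsDarwinABI=*/false) == 8
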
static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) { @@ -116,188 +117,11 @@ class PPCFrameLowering: public TargetFrameLowering { /// getMinCallArgumentsSize - Return the size of the minium PowerPC ABI /// argument area. - static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) { - // For the Darwin ABI / 64-bit SVR4 ABI: - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - if (isDarwinABI || isPPC64) - return 8 * (isPPC64 ? 8 : 4); - - // 32-bit SVR4 ABI: - // There is no default stack allocated for the 8 first GPR arguments. - return 0; - } - - /// getMinCallFrameSize - Return the minimum size a call frame can be using - /// the PowerPC ABI. - static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI) { - // The call frame needs to be at least big enough for linkage and 8 args. - return getLinkageSize(isPPC64, isDarwinABI) + - getMinCallArgumentsSize(isPPC64, isDarwinABI); - } - - // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. + static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI); const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const override { - if (Subtarget.isDarwinABI()) { - NumEntries = 1; - if (Subtarget.isPPC64()) { - static const SpillSlot darwin64Offsets = {PPC::X31, -8}; - return &darwin64Offsets; - } else { - static const SpillSlot darwinOffsets = {PPC::R31, -4}; - return &darwinOffsets; - } - } - - // Early exit if not using the SVR4 ABI. - if (!Subtarget.isSVR4ABI()) { - NumEntries = 0; - return nullptr; - } - - // Note that the offsets here overlap, but this is fixed up in - // processFunctionBeforeFrameFinalized. - - static const SpillSlot Offsets[] = { - // Floating-point register save area offsets. - {PPC::F31, -8}, - {PPC::F30, -16}, - {PPC::F29, -24}, - {PPC::F28, -32}, - {PPC::F27, -40}, - {PPC::F26, -48}, - {PPC::F25, -56}, - {PPC::F24, -64}, - {PPC::F23, -72}, - {PPC::F22, -80}, - {PPC::F21, -88}, - {PPC::F20, -96}, - {PPC::F19, -104}, - {PPC::F18, -112}, - {PPC::F17, -120}, - {PPC::F16, -128}, - {PPC::F15, -136}, - {PPC::F14, -144}, - - // General register save area offsets. - {PPC::R31, -4}, - {PPC::R30, -8}, - {PPC::R29, -12}, - {PPC::R28, -16}, - {PPC::R27, -20}, - {PPC::R26, -24}, - {PPC::R25, -28}, - {PPC::R24, -32}, - {PPC::R23, -36}, - {PPC::R22, -40}, - {PPC::R21, -44}, - {PPC::R20, -48}, - {PPC::R19, -52}, - {PPC::R18, -56}, - {PPC::R17, -60}, - {PPC::R16, -64}, - {PPC::R15, -68}, - {PPC::R14, -72}, - - // CR save area offset. We map each of the nonvolatile CR fields - // to the slot for CR2, which is the first of the nonvolatile CR - // fields to be assigned, so that we only allocate one save slot. - // See PPCRegisterInfo::hasReservedSpillSlot() for more information. - {PPC::CR2, -4}, - - // VRSAVE save area offset. - {PPC::VRSAVE, -4}, - - // Vector register save area - {PPC::V31, -16}, - {PPC::V30, -32}, - {PPC::V29, -48}, - {PPC::V28, -64}, - {PPC::V27, -80}, - {PPC::V26, -96}, - {PPC::V25, -112}, - {PPC::V24, -128}, - {PPC::V23, -144}, - {PPC::V22, -160}, - {PPC::V21, -176}, - {PPC::V20, -192} - }; - - static const SpillSlot Offsets64[] = { - // Floating-point register save area offsets. 
- {PPC::F31, -8}, - {PPC::F30, -16}, - {PPC::F29, -24}, - {PPC::F28, -32}, - {PPC::F27, -40}, - {PPC::F26, -48}, - {PPC::F25, -56}, - {PPC::F24, -64}, - {PPC::F23, -72}, - {PPC::F22, -80}, - {PPC::F21, -88}, - {PPC::F20, -96}, - {PPC::F19, -104}, - {PPC::F18, -112}, - {PPC::F17, -120}, - {PPC::F16, -128}, - {PPC::F15, -136}, - {PPC::F14, -144}, - - // General register save area offsets. - {PPC::X31, -8}, - {PPC::X30, -16}, - {PPC::X29, -24}, - {PPC::X28, -32}, - {PPC::X27, -40}, - {PPC::X26, -48}, - {PPC::X25, -56}, - {PPC::X24, -64}, - {PPC::X23, -72}, - {PPC::X22, -80}, - {PPC::X21, -88}, - {PPC::X20, -96}, - {PPC::X19, -104}, - {PPC::X18, -112}, - {PPC::X17, -120}, - {PPC::X16, -128}, - {PPC::X15, -136}, - {PPC::X14, -144}, - - // VRSAVE save area offset. - {PPC::VRSAVE, -4}, - - // Vector register save area - {PPC::V31, -16}, - {PPC::V30, -32}, - {PPC::V29, -48}, - {PPC::V28, -64}, - {PPC::V27, -80}, - {PPC::V26, -96}, - {PPC::V25, -112}, - {PPC::V24, -128}, - {PPC::V23, -144}, - {PPC::V22, -160}, - {PPC::V21, -176}, - {PPC::V20, -192} - }; - - if (Subtarget.isPPC64()) { - NumEntries = array_lengthof(Offsets64); - - return Offsets64; - } else { - NumEntries = array_lengthof(Offsets); - - return Offsets; - } - } + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI); }; - } // End llvm namespace #endif diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 7ca706b81c3e..d0315f94834f 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -258,8 +258,8 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { // 3. Handling of the esoteric cases in "Resource-based Instruction Grouping". // -PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetMachine &TM) - : TM(TM) { +PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG) + : DAG(DAG) { EndDispatchGroup(); } @@ -278,7 +278,7 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode, bool &isFirst, bool &isSingle, bool &isCracked, bool &isLoad, bool &isStore) { - const MCInstrDesc &MCID = TM.getInstrInfo()->get(Opcode); + const MCInstrDesc &MCID = DAG.TII->get(Opcode); isLoad = MCID.mayLoad(); isStore = MCID.mayStore(); diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index cf4332cffa8c..23f76c16d138 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -54,7 +54,7 @@ class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer { /// setting the CTR register then branching through it within a dispatch group), /// or storing then loading from the same address within a dispatch group. class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { - const TargetMachine &TM; + const ScheduleDAG &DAG; unsigned NumIssued; // Number of insts issued, including advanced cycles. 
@@ -75,7 +75,7 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { unsigned NumStores; public: - PPCHazardRecognizer970(const TargetMachine &TM); + PPCHazardRecognizer970(const ScheduleDAG &DAG); virtual HazardType getHazardType(SUnit *SU, int Stalls) override; virtual void EmitInstruction(SUnit *SU) override; virtual void AdvanceCycle() override; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 63dac61f4cdc..4881b3fbb7ac 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -51,23 +51,25 @@ namespace { /// class PPCDAGToDAGISel : public SelectionDAGISel { const PPCTargetMachine &TM; - const PPCTargetLowering &PPCLowering; - const PPCSubtarget &PPCSubTarget; + const PPCTargetLowering *PPCLowering; + const PPCSubtarget *PPCSubTarget; unsigned GlobalBaseReg; public: explicit PPCDAGToDAGISel(PPCTargetMachine &tm) : SelectionDAGISel(tm), TM(tm), - PPCLowering(*TM.getTargetLowering()), - PPCSubTarget(*TM.getSubtargetImpl()) { + PPCLowering(TM.getTargetLowering()), + PPCSubTarget(TM.getSubtargetImpl()) { initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override { // Make sure we re-emit a set of the global base reg if necessary GlobalBaseReg = 0; + PPCLowering = TM.getTargetLowering(); + PPCSubTarget = TM.getSubtargetImpl(); SelectionDAGISel::runOnMachineFunction(MF); - if (!PPCSubTarget.isSVR4ABI()) + if (!PPCSubTarget->isSVR4ABI()) InsertVRSaveCode(MF); return true; @@ -89,7 +91,7 @@ namespace { /// getSmallIPtrImm - Return a target constant of pointer type. inline SDValue getSmallIPtrImm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy()); + return CurDAG->getTargetConstant(Imm, PPCLowering->getPointerTy()); } /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s @@ -122,7 +124,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -142,20 +144,20 @@ namespace { /// represented as an indexed [r+r] operation. Returns false if it can /// be represented by [r+imm], which are preferred. bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) { - return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG); + return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG); } /// SelectAddrIdxOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) { - return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG); + return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG); } /// SelectAddrImmX4 - Returns true if the address N can be represented by /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); } // Select an address into a single register. 
@@ -272,7 +274,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc dl; - if (PPCLowering.getPointerTy() == MVT::i32) { + if (PPCLowering->getPointerTy() == MVT::i32) { GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); @@ -283,7 +285,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } } return CurDAG->getRegister(GlobalBaseReg, - PPCLowering.getPointerTy()).getNode(); + PPCLowering->getPointerTy()).getNode(); } /// isIntS16Immediate - This method tests to see if the node is either a 32-bit @@ -580,7 +582,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, Opc = PPC::FCMPUS; } else { assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); - Opc = PPCSubTarget.hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; + Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; } return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); } @@ -746,7 +748,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); bool isPPC64 = (PtrVT == MVT::i64); - if (!PPCSubTarget.useCRBits() && + if (!PPCSubTarget->useCRBits() && isInt32Immediate(N->getOperand(1), Imm)) { // We can codegen setcc op, imm very efficiently compared to a brcond. // Check for those cases here. @@ -828,7 +830,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { if (LHS.getValueType().isVector()) { EVT VecVT = LHS.getValueType(); MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy; - unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget.hasVSX()); + unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget->hasVSX()); switch (CC) { case ISD::SETEQ: @@ -839,7 +841,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { case ISD::SETONE: case ISD::SETUNE: { SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); - return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLNOR : + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, VecVT, VCmp, VCmp); } @@ -861,9 +863,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS); } else { SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); - unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX()); + unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX()); SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0); - return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR : + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLOR : PPC::VOR, VecVT, VCmpGT, VCmpEQ); } @@ -872,9 +874,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { case ISD::SETOLE: case ISD::SETULE: { SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0); - unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX()); + unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX()); SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0); - return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR : + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? 
PPC::XXLOR : PPC::VOR, VecVT, VCmpLE, VCmpEQ); } @@ -883,7 +885,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { } } - if (PPCSubTarget.useCRBits()) + if (PPCSubTarget->useCRBits()) return nullptr; bool Inv; @@ -1101,7 +1103,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Offset, Base, Chain }; return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), - PPCLowering.getPointerTy(), + PPCLowering->getPointerTy(), MVT::Other, Ops); } else { unsigned Opcode; @@ -1136,7 +1138,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Base, Offset, Chain }; return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), - PPCLowering.getPointerTy(), + PPCLowering->getPointerTy(), MVT::Other, Ops); } } @@ -1267,7 +1269,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { bool isPPC64 = (PtrVT == MVT::i64); // If this is a select of i1 operands, we'll pattern match it. - if (PPCSubTarget.useCRBits() && + if (PPCSubTarget->useCRBits() && N->getOperand(0).getValueType() == MVT::i1) break; @@ -1338,14 +1340,14 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops); } case ISD::VSELECT: - if (PPCSubTarget.hasVSX()) { + if (PPCSubTarget->hasVSX()) { SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) }; return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); } break; case ISD::VECTOR_SHUFFLE: - if (PPCSubTarget.hasVSX() && (N->getValueType(0) == MVT::v2f64 || + if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast(N); @@ -1383,7 +1385,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { break; case PPCISD::BDNZ: case PPCISD::BDZ: { - bool IsPPC64 = PPCSubTarget.isPPC64(); + bool IsPPC64 = PPCSubTarget->isPPC64(); SDValue Ops[] = { N->getOperand(1), N->getOperand(0) }; return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : @@ -1443,7 +1445,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); } case PPCISD::TOC_ENTRY: { - assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI"); + assert (PPCSubTarget->isPPC64() && "Only supported for 64-bit ABI"); // For medium and large code model, we generate two instructions as // described below. Otherwise we allow SelectCodeCommon to handle this, @@ -1452,10 +1454,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (CModel != CodeModel::Medium && CModel != CodeModel::Large) break; - // The first source operand is a TargetGlobalAddress or a - // TargetJumpTable. If it is an externally defined symbol, a symbol - // with common linkage, a function address, or a jump table address, - // or if we are generating code for large code model, we generate: + // The first source operand is a TargetGlobalAddress or a TargetJumpTable. + // If it is an externally defined symbol, a symbol with common linkage, + // a non-local function address, or a jump table address, or if we are + // generating code for large code model, we generate: // LDtocL(, ADDIStocHA(%X2, )) // Otherwise we generate: // ADDItocL(ADDIStocHA(%X2, ), ) @@ -1470,17 +1472,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (GlobalAddressSDNode *G = dyn_cast(GA)) { const GlobalValue *GValue = G->getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = GAlias ? 
GAlias->getAliasee() : GValue; - const GlobalVariable *GVar = dyn_cast(RealGValue); - assert((GVar || isa(RealGValue)) && - "Unexpected global value subclass!"); - - // An external variable is one without an initializer. For these, - // for variables with common linkage, and for Functions, generate - // the LDtocL form. - if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() || - RealGValue->hasAvailableExternallyLinkage()) + if ((GValue->getType()->getElementType()->isFunctionTy() && + (GValue->isDeclaration() || GValue->isWeakForLinker())) || + GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage()) return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0)); } @@ -1583,7 +1578,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() { // containing zero. bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { // If we're not using isel, then this does not matter. - if (!PPCSubTarget.hasISEL()) + if (!PPCSubTarget->hasISEL()) return false; for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); @@ -2045,7 +2040,7 @@ void PPCDAGToDAGISel::PeepholeCROps() { void PPCDAGToDAGISel::PeepholePPC64() { // These optimizations are currently supported only for 64-bit SVR4. - if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64()) + if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64()) return; SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 214c86920609..f6884d5a2716 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -19,6 +19,7 @@ #include "PPCTargetObjectFile.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -50,20 +51,18 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; -static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { - if (TM.getSubtargetImpl()->isDarwin()) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + // If it isn't a Mach-O file then it's going to be a linux ELF + // object file. + if (TT.isOSDarwin()) return new TargetLoweringObjectFileMachO(); - if (TM.getSubtargetImpl()->isSVR4ABI()) - return new PPC64LinuxTargetObjectFile(); - - return new TargetLoweringObjectFileELF(); + return new PPC64LinuxTargetObjectFile(); } PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) - : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) { - const PPCSubtarget *Subtarget = &TM.getSubtarget(); - + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))), + Subtarget(*TM.getSubtargetImpl()) { setPow2DivIsCheap(); // Use _setjmp/_longjmp instead of setjmp/longjmp. @@ -72,7 +71,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. - bool isPPC64 = Subtarget->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. 
@@ -98,10 +97,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); - if (Subtarget->useCRBits()) { + if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (isPPC64 || Subtarget->hasFPCVT()) { + if (isPPC64 || Subtarget.hasFPCVT()) { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); @@ -176,17 +175,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget->hasFSQRT() && + if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && - Subtarget->hasFRSQRTE() && Subtarget->hasFRE())) + Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); - if (!Subtarget->hasFSQRT() && + if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && - Subtarget->hasFRSQRTES() && Subtarget->hasFRES())) + Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); - if (Subtarget->hasFCPSGN()) { + if (Subtarget.hasFCPSGN()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); } else { @@ -194,7 +193,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); } - if (Subtarget->hasFPRND()) { + if (Subtarget.hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); @@ -216,7 +215,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - if (Subtarget->hasPOPCNTD()) { + if (Subtarget.hasPOPCNTD()) { setOperationAction(ISD::CTPOP, MVT::i32 , Legal); setOperationAction(ISD::CTPOP, MVT::i64 , Legal); } else { @@ -228,7 +227,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); - if (!Subtarget->useCRBits()) { + if (!Subtarget.useCRBits()) { // PowerPC does not have Select setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); @@ -241,11 +240,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); // PowerPC wants to optimize integer setcc a bit - if (!Subtarget->useCRBits()) + if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); // PowerPC does not have BRCOND which requires SetCC - if (!Subtarget->useCRBits()) + if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -297,7 +296,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (Subtarget->isSVR4ABI()) { + if (Subtarget.isSVR4ABI()) { if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. 
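As context for the VAARG promotion configured in the next few lines, here is a minimal standalone sketch (illustrative only, not part of the patch; the function name is made up): on 64-bit SVR4 every variadic argument occupies a full doubleword slot, so the smaller integer types are promoted and each va_arg still advances the list by 8 bytes.

#include <stdarg.h>

/* Hypothetical example: each int fetched below is read out of an 8-byte
 * doubleword slot on 64-bit SVR4, which is why the lowering promotes the
 * smaller VAARG types to i64 rather than handling them natively. */
int sum_ints(int n, ...) {
  va_list ap;
  int i, s = 0;
  va_start(ap, n);
  for (i = 0; i < n; ++i)
    s += va_arg(ap, int);   /* advances ap by one doubleword per argument */
  va_end(ap);
  return s;
}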
setOperationAction(ISD::VAARG, MVT::i1, Promote); @@ -317,7 +316,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } else setOperationAction(ISD::VAARG, MVT::Other, Expand); - if (Subtarget->isSVR4ABI() && !isPPC64) + if (Subtarget.isSVR4ABI() && !isPPC64) // VACOPY is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VACOPY , MVT::Other, Custom); else @@ -350,7 +349,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - if (Subtarget->has64BitSupport()) { + if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); @@ -360,7 +359,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - if (PPCSubTarget.hasLFIWAX() || Subtarget->isPPC64()) + if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. @@ -368,8 +367,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } // With the instructions enabled under FPCVT, we can do everything. - if (PPCSubTarget.hasFPCVT()) { - if (Subtarget->has64BitSupport()) { + if (Subtarget.hasFPCVT()) { + if (Subtarget.has64BitSupport()) { setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); @@ -382,7 +381,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); } - if (Subtarget->use64BitRegs()) { + if (Subtarget.use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or @@ -398,7 +397,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } - if (Subtarget->hasAltivec()) { + if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -488,7 +487,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::XOR , MVT::v4i32, Legal); setOperationAction(ISD::LOAD , MVT::v4i32, Legal); setOperationAction(ISD::SELECT, MVT::v4i32, - Subtarget->useCRBits() ? Legal : Expand); + Subtarget.useCRBits() ? 
Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); @@ -507,7 +506,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); - if (TM.Options.UnsafeFPMath || Subtarget->hasVSX()) { + if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } @@ -535,7 +534,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); - if (Subtarget->hasVSX()) { + if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); @@ -613,7 +612,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } } - if (Subtarget->has64BitSupport()) { + if (Subtarget.has64BitSupport()) { setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); } @@ -642,7 +641,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); - if (Subtarget->useCRBits()) + if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -651,7 +650,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - if (Subtarget->useCRBits()) { + if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::SELECT_CC); @@ -664,7 +663,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } // Darwin long double math library functions have $LDBL128 appended. - if (Subtarget->isDarwin()) { + if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); @@ -679,21 +678,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. - if (Subtarget->useCRBits()) + if (Subtarget.useCRBits()) setHasMultipleConditionRegisters(); setMinFunctionAlignment(2); - if (PPCSubTarget.isDarwin()) + if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); - if (isPPC64 && Subtarget->isJITCodeModel()) + if (isPPC64 && Subtarget.isJITCodeModel()) // Temporary workaround for the inability of PPC64 JIT to handle jump // tables. setSupportJumpTables(false); setInsertFencesForAtomic(true); - if (Subtarget->enableMachineScheduler()) + if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); else setSchedulingPreference(Sched::Hybrid); @@ -702,8 +701,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // The Freescale cores does better with aggressive inlining of memcpy and // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). 
- if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc || - Subtarget->getDarwinDirective() == PPC::DIR_E5500) { + if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || + Subtarget.getDarwinDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; MaxStoresPerMemsetOptSize = 16; MaxStoresPerMemcpy = 32; @@ -747,14 +746,14 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, /// function arguments in the caller parameter area. unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { // Darwin passes everything on 4 byte boundary. - if (PPCSubTarget.isDarwin()) + if (Subtarget.isDarwin()) return 4; // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. - unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4; - if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX()) - getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16); + unsigned Align = Subtarget.isPPC64() ? 8 : 4; + if (Subtarget.hasAltivec() || Subtarget.hasQPX()) + getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16); return Align; } @@ -774,7 +773,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::TOC_RESTORE: return "PPCISD::TOC_RESTORE"; case PPCISD::LOAD: return "PPCISD::LOAD"; case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; @@ -826,7 +824,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) - return PPCSubTarget.useCRBits() ? MVT::i1 : MVT::i32; + return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; return VT.changeVectorElementTypeToInteger(); } @@ -855,15 +853,17 @@ static bool isConstantOrUndef(int Op, int Val) { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. -bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { +bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG) { + unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1; if (!isUnary) { for (unsigned i = 0; i != 16; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+j)) return false; } else { for (unsigned i = 0; i != 8; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+1) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+1)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) return false; } return true; @@ -871,18 +871,27 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. 
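To make the endian-dependent index arithmetic in the VPKUHUM predicate above and the VPKUWUM predicate that follows easier to follow, here is a sketch of the byte masks they accept for the binary, two-input form (illustrative only, not part of the patch; the helper name is made up).

// vpkuhum keeps the low-order byte of each of the 16 halfwords formed by
// concatenating the two inputs, and vpkuwum keeps the low-order halfword of
// each of the 8 words, so the expected shuffle indices are:
//
//   vpkuhum, big endian    : 1, 3, 5, ..., 29, 31         (j = 1)
//   vpkuhum, little endian : 0, 2, 4, ..., 28, 30         (j = 0)
//   vpkuwum, big endian    : 2,3,  6,7,  10,11, ..., 30,31  (j = 2, k = 3)
//   vpkuwum, little endian : 0,1,  4,5,  8,9,   ..., 28,29  (j = 0, k = 1)
//
// A self-contained check of the vpkuhum rule, mirroring isConstantOrUndef's
// treatment of -1 (undef) mask entries:
static bool looksLikeBinaryVPKUHUMMask(const int Mask[16], bool IsLE) {
  unsigned j = IsLE ? 0 : 1;
  for (unsigned i = 0; i != 16; ++i)
    if (Mask[i] >= 0 && Mask[i] != int(i * 2 + j))
      return false;
  return true;
}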
-bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { +bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG) { + unsigned j, k; + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + j = 0; + k = 1; + } else { + j = 2; + k = 3; + } if (!isUnary) { for (unsigned i = 0; i != 16; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+k)) return false; } else { for (unsigned i = 0; i != 8; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+3) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+9), i*2+3)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+k) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+k)) return false; } return true; @@ -909,27 +918,39 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, } /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for -/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). +/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary) { - if (!isUnary) - return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + bool isUnary, SelectionDAG &DAG) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (!isUnary) + return isVMerge(N, UnitSize, 0, 16); + return isVMerge(N, UnitSize, 0, 0); + } else { + if (!isUnary) + return isVMerge(N, UnitSize, 8, 24); + return isVMerge(N, UnitSize, 8, 8); + } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for -/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). +/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary) { - if (!isUnary) - return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + bool isUnary, SelectionDAG &DAG) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (!isUnary) + return isVMerge(N, UnitSize, 8, 24); + return isVMerge(N, UnitSize, 8, 8); + } else { + if (!isUnary) + return isVMerge(N, UnitSize, 0, 16); + return isVMerge(N, UnitSize, 0, 0); + } } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { +int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; @@ -946,18 +967,38 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; - ShiftAmt -= i; - if (!isUnary) { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) - return -1; - } else { - // Check the rest of the elements to see if they are consecutive. 
- for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) - return -1; + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + + ShiftAmt += i; + + if (!isUnary) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i)) + return -1; + } else { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15)) + return -1; + } + + } else { // Big Endian + + ShiftAmt -= i; + + if (!isUnary) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + } else { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) + return -1; + } } return ShiftAmt; } @@ -1010,10 +1051,14 @@ bool PPC::isAllNegativeZeroVector(SDNode *N) { /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. -unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) { +unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); - return SVOp->getMaskElt(0) / EltSize; + if (DAG.getTarget().getDataLayout()->isLittleEndian()) + return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); + else + return SVOp->getMaskElt(0) / EltSize; } /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed @@ -1299,7 +1344,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; } @@ -1350,7 +1395,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, } // Otherwise, do it the hard way, using R0 as the base register. - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true; @@ -1497,7 +1542,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1518,7 +1563,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1555,7 +1600,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(); - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); @@ -1646,7 +1691,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -2094,8 +2139,8 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { - if (PPCSubTarget.isSVR4ABI()) { - if (PPCSubTarget.isPPC64()) + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); else @@ -2184,7 +2229,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::F4RCRegClass; break; case MVT::f64: - if (PPCSubTarget.hasVSX()) + if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else RC = &PPC::F8RCRegClass; @@ -2589,7 +2634,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else - VReg = MF.addLiveIn(FPR[FPR_idx], PPCSubTarget.hasVSX() ? + VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass); @@ -3280,7 +3325,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, SDLoc dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. - EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32; + EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; LROpOut = getReturnAddrFrameIndex(DAG); LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(), false, false, false, 0); @@ -3373,10 +3418,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall, SmallVectorImpl > &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, - const PPCSubtarget &PPCSubTarget) { + const PPCSubtarget &Subtarget) { - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isSVR4ABI = PPCSubTarget.isSVR4ABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); NodeTys.push_back(MVT::Other); // Returns a chain @@ -3385,11 +3430,12 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned CallOpc = PPCISD::CALL; bool needIndirectCall = true; - if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { - // If this is an absolute destination address, use the munged value. - Callee = SDValue(Dest, 0); - needIndirectCall = false; - } + if (!isSVR4ABI || !isPPC64) + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { + // If this is an absolute destination address, use the munged value. 
+ Callee = SDValue(Dest, 0); + needIndirectCall = false; + } if (GlobalAddressSDNode *G = dyn_cast(Callee)) { // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201 @@ -3398,8 +3444,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (!DAG.getTarget().getSubtarget().isJITCodeModel()) { unsigned OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (PPCSubTarget.getTargetTriple().isMacOSX() && - PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || G->getGlobal()->isWeakForLinker())) { // PC-relative references to external symbols should go through $stub, @@ -3422,8 +3468,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned char OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (PPCSubTarget.getTargetTriple().isMacOSX() && - PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -3497,8 +3543,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // additional register being allocated and an unnecessary move instruction // being generated. VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TOCOff = DAG.getIntPtrConstant(8); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, - Callee, InFlag); + AddTOC, InFlag); Chain = LoadTOCPtr.getValue(0); InFlag = LoadTOCPtr.getValue(1); @@ -3613,10 +3661,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, isTailCall, RegsToPass, Ops, NodeTys, - PPCSubTarget); + Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls - if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) + if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); // When performing tail call optimization the callee pops its arguments off @@ -3657,7 +3705,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // same TOC), the NOP will remain unchanged. bool needsTOCRestore = false; - if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) { + if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. 
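For reference, the two fixed offsets used around these call-lowering hunks come from the 64-bit SVR4 (ELFv1) ABI; a rough sketch of the relevant layouts, descriptive only and not part of the patch:

// Function descriptor (what a 64-bit SVR4 function pointer points at):
//   +0   entry-point address             -> branched to via CTR for the
//                                           indirect call
//   +8   callee's TOC pointer            -> hence LOAD_TOC(Callee + 8) above
//   +16  environment pointer (unused by C/C++)
//
// Caller's linkage area, offsets from the stack pointer (r1):
//   +0 back chain   +8 CR save   +16 LR save   +24/+32 reserved
//   +40 TOC save slot                    -> PPCFrameLowering::getTOCSaveOffset()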
@@ -3682,7 +3730,12 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, if (needsTOCRestore) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); InFlag = Chain.getValue(1); } @@ -3718,8 +3771,8 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); - if (PPCSubTarget.isSVR4ABI()) { - if (PPCSubTarget.isPPC64()) + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, Outs, OutVals, Ins, dl, DAG, InVals); @@ -4337,19 +4390,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // pointers in the 64-bit SVR4 ABI. if (!isTailCall && !dyn_cast(Callee) && - !dyn_cast(Callee) && - !isBLACompatibleAddress(Callee, DAG)) { + !dyn_cast(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. - SDValue PtrOff = DAG.getIntPtrConstant(40); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), false, false, 0); - // R12 must contain the address of an indirect callee. This does not - // mean the MTCTR instruction must use R12; it's easier to model this - // as an extra parameter, so do that. - RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain @@ -4818,8 +4867,8 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -4842,8 +4891,8 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -5063,12 +5112,12 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ : - (PPCSubTarget.hasFPCVT() ? PPCISD::FCTIWUZ : + (Subtarget.hasFPCVT() ? 
PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()) && + assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, @@ -5077,8 +5126,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, } // Convert the FP value to an int value through memory. - bool i32Stack = Op.getValueType() == MVT::i32 && PPCSubTarget.hasSTFIWX() && - (Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()); + bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && + (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); @@ -5120,17 +5169,17 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, DAG.getConstantFP(1.0, Op.getValueType()), DAG.getConstantFP(0.0, Op.getValueType())); - assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) && + assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. - unsigned FCFOp = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); - MVT FCFTy = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { @@ -5146,7 +5195,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // However, if -enable-unsafe-fp-math is in effect, accept double // rounding to avoid the extra overhead. if (Op.getValueType() == MVT::f32 && - !PPCSubTarget.hasFPCVT() && + !Subtarget.hasFPCVT() && !DAG.getTarget().Options.UnsafeFPMath) { // Twiddle input to make sure the low 11 bits are zero. (If this @@ -5184,7 +5233,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); - if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; @@ -5201,7 +5250,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue Ld; - if (PPCSubTarget.hasLFIWAX() || PPCSubTarget.hasFPCVT()) { + if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -5220,7 +5269,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); } else { - assert(PPCSubTarget.isPPC64() && + assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); @@ -5242,7 +5291,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // FCFID it and return it. 
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); - if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; } @@ -5531,10 +5580,14 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // we convert to a pseudo that will be expanded later into one of // the above forms. SDValue Elt = DAG.getConstant(SextVal, MVT::i32); - EVT VT = Op.getValueType(); - int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4); - SDValue EltSize = DAG.getConstant(Size, MVT::i32); - return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + EVT VT = (SplatSize == 1 ? MVT::v16i8 : + (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); + SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32); + SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + if (VT == Op.getValueType()) + return RetVal; + else + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is @@ -5553,6 +5606,22 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } + // The remaining cases assume either big endian element order or + // a splat-size that equates to the element size of the vector + // to be built. An example that doesn't work for little endian is + // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits + // and a vector element size of 16 bits. The code below will + // produce the vector in big endian element order, which for little + // endian is {-1, 0, -1, 0, -1, 0, -1, 0}. + + // For now, just avoid these optimizations in that case. + // FIXME: Develop correct optimizations for LE with mismatched + // splat and element sizes. + + if (Subtarget.isLittleEndian() && + SplatSize != Op.getValueType().getVectorElementType().getSizeInBits()) + return SDValue(); + // Check to see if this is a wide variety of vsplti*, binop self cases. 
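A brief illustration (not part of the patch) of the "vspltisw + vslw" trick mentioned a few lines above, since it is easy to miss why it yields 0x8000_0000: vspltisw can only splat a 5-bit sign-extended immediate, but shifting such a splat by itself manufactures the sign bit.

//   vspltisw vX, -1      // vX = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
//   vslw     vX, vX, vX  // each word is shifted left by its own low 5 bits,
//                        // i.e. by 31, giving { 0x80000000, ... } in every lane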
static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, @@ -5721,6 +5790,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast(Op); EVT VT = Op.getValueType(); + bool isLittleEndian = Subtarget.isLittleEndian(); // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be @@ -5729,15 +5799,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || - PPC::isVPKUWUMShuffleMask(SVOp, true) || - PPC::isVPKUHUMShuffleMask(SVOp, true) || - PPC::isVSLDOIShuffleMask(SVOp, true) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, true) || - PPC::isVMRGLShuffleMask(SVOp, 2, true) || - PPC::isVMRGLShuffleMask(SVOp, 4, true) || - PPC::isVMRGHShuffleMask(SVOp, 1, true) || - PPC::isVMRGHShuffleMask(SVOp, 2, true) || - PPC::isVMRGHShuffleMask(SVOp, 4, true)) { + PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) { return Op; } } @@ -5745,15 +5815,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. - if (PPC::isVPKUWUMShuffleMask(SVOp, false) || - PPC::isVPKUHUMShuffleMask(SVOp, false) || - PPC::isVSLDOIShuffleMask(SVOp, false) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, false) || - PPC::isVMRGLShuffleMask(SVOp, 2, false) || - PPC::isVMRGLShuffleMask(SVOp, 4, false) || - PPC::isVMRGHShuffleMask(SVOp, 1, false) || - PPC::isVMRGHShuffleMask(SVOp, 2, false) || - PPC::isVMRGHShuffleMask(SVOp, 4, false)) + if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG)) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our @@ -5787,7 +5857,9 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this shuffle can be expressed as a shuffle of 4-byte elements, use the // perfect shuffle vector to determine if it is cost effective to do this as // discrete instructions, or whether we should use a vperm. - if (isFourElementShuffle) { + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle && !isLittleEndian) { // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; @@ -5816,6 +5888,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. 
Convert now. + + // For little endian, the order of the input vectors is reversed, and + // the permutation mask is complemented with respect to 31. This is + // necessary to produce proper semantics with the big-endian-biased vperm + // instruction. EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; @@ -5824,13 +5901,22 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) - ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, - MVT::i32)); + if (isLittleEndian) + ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j), + MVT::i32)); + else + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, + MVT::i32)); } SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, ResultMask); - return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); + if (isLittleEndian) + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V2, V1, VPermMask); + else + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V1, V2, VPermMask); } /// getAltivecCompareInfo - Given an intrinsic, return false if it is not an @@ -6023,6 +6109,7 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); + bool isLittleEndian = Subtarget.isLittleEndian(); // Multiply the even 8-bit parts, producing 16-bit sums. SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, @@ -6034,13 +6121,24 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { LHS, RHS, DAG, dl, MVT::v8i16); OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); - // Merge the results together. + // Merge the results together. Because vmuleub and vmuloub are + // instructions with a big-endian bias, we must reverse the + // element numbering and reverse the meaning of "odd" and "even" + // when generating little endian code. 
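One aside on the little-endian VPERM rewrite earlier in this hunk (illustrative only, not part of the patch; the helper name is made up): because vperm consumes only the low five bits of each control byte, "complement with respect to 31" is equivalent to a plain bitwise complement of the control vector, which is why some lowerings fold the adjustment into a single vnor. A trivial check of that identity:

#include <cassert>

static void checkVPermComplementIdentity() {
  // For every possible 5-bit control value m, 31 - m == (~m & 31).
  for (unsigned m = 0; m != 32; ++m)
    assert(31 - m == (~m & 31u));
}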
int Ops[16]; for (unsigned i = 0; i != 8; ++i) { - Ops[i*2 ] = 2*i+1; - Ops[i*2+1] = 2*i+1+16; + if (isLittleEndian) { + Ops[i*2 ] = 2*i; + Ops[i*2+1] = 2*i+16; + } else { + Ops[i*2 ] = 2*i+1; + Ops[i*2+1] = 2*i+1+16; + } } - return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); + if (isLittleEndian) + return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); + else + return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); } else { llvm_unreachable("Unknown mul to lower!"); } @@ -6060,17 +6158,17 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::VASTART: - return LowerVASTART(Op, DAG, PPCSubTarget); + return LowerVASTART(Op, DAG, Subtarget); case ISD::VAARG: - return LowerVAARG(Op, DAG, PPCSubTarget); + return LowerVAARG(Op, DAG, Subtarget); case ISD::VACOPY: - return LowerVACOPY(Op, DAG, PPCSubTarget); + return LowerVACOPY(Op, DAG, Subtarget); - case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget); + case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: - return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget); + return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); @@ -6140,7 +6238,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); if (VT == MVT::i64) { - SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, PPCSubTarget); + SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget); Results.push_back(NewNode); Results.push_back(NewNode.getValue(1)); @@ -6251,7 +6349,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -6446,7 +6544,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned LabelReg = MRI.createVirtualRegister(PtrRC); unsigned BufReg = MI->getOperand(1).getReg(); - if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) { + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) @@ -6459,12 +6557,12 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned BaseReg; if (MF->getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::Naked)) - BaseReg = PPCSubTarget.isPPC64() ? PPC::X1 : PPC::R1; + BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else - BaseReg = PPCSubTarget.isPPC64() ? PPC::BP8 : PPC::BP; + BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; MIB = BuildMI(*thisMBB, MI, DL, - TII->get(PPCSubTarget.isPPC64() ? PPC::STD : PPC::STW)) + TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) .addReg(BufReg); @@ -6488,10 +6586,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // mainMBB: // mainDstReg = 0 MIB = BuildMI(mainMBB, DL, - TII->get(PPCSubTarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + TII->get(Subtarget.isPPC64() ? 
PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP - if (PPCSubTarget.isPPC64()) { + if (Subtarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) .addImm(LabelOffset) @@ -6603,7 +6701,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC - if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) { + if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); @@ -6641,7 +6739,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); - if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8 || MI->getOpcode() == PPC::SELECT_I4 || MI->getOpcode() == PPC::SELECT_I8)) { @@ -6858,7 +6956,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // We must use 64-bit registers for addresses when targeting 64-bit, // since we're actually doing arithmetic on them. Other registers // can be 32-bit. - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; unsigned dest = MI->getOperand(0).getReg(); @@ -7066,10 +7164,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, EVT VT = Op.getValueType(); - if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) || - (VT == MVT::f64 && PPCSubTarget.hasFRE()) || - (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) || - (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) { + if ((VT == MVT::f32 && Subtarget.hasFRES()) || + (VT == MVT::f64 && Subtarget.hasFRE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal, we need to find the zero of the function: @@ -7082,7 +7180,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, // correct after every iteration. The minimum architected relative // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. - int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++Iterations; @@ -7129,10 +7227,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, EVT VT = Op.getValueType(); - if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) || - (VT == MVT::f64 && PPCSubTarget.hasFRSQRTE()) || - (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) || - (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) { + if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || + (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal sqrt, we need to find the zero of the function: @@ -7145,7 +7243,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, // correct after every iteration. The minimum architected relative // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. - int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + int Iterations = Subtarget.hasRecipPrec() ? 
1 : 3; if (VT.getScalarType() == MVT::f64) ++Iterations; @@ -7308,7 +7406,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); - assert(PPCSubTarget.useCRBits() && + assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) @@ -7606,9 +7704,9 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, return SDValue(); if (!((N->getOperand(0).getValueType() == MVT::i1 && - PPCSubTarget.useCRBits()) || + Subtarget.useCRBits()) || (N->getOperand(0).getValueType() == MVT::i32 && - PPCSubTarget.isPPC64()))) + Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && @@ -7926,8 +8024,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DCI.AddToWorklist(RV.getNode()); RV = DAGCombineFastRecip(RV, DCI); if (RV.getNode()) { - // Unfortunately, RV is now NaN if the input was exactly 0. Select out - // this case and force the answer to 0. + // Unfortunately, RV is now NaN if the input was exactly 0. Select out + // this case and force the answer to 0. EVT VT = RV.getValueType(); @@ -8047,6 +8145,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // This is a type-legal unaligned Altivec load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); + bool isLittleEndian = Subtarget.isLittleEndian(); // This implements the loading of unaligned vectors as described in // the venerable Apple Velocity Engine overview. Specifically: @@ -8054,25 +8153,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html // // The general idea is to expand a sequence of one or more unaligned - // loads into a alignment-based permutation-control instruction (lvsl), - // a series of regular vector loads (which always truncate their - // input address to an aligned address), and a series of permutations. - // The results of these permutations are the requested loaded values. - // The trick is that the last "extra" load is not taken from the address - // you might suspect (sizeof(vector) bytes after the last requested - // load), but rather sizeof(vector) - 1 bytes after the last - // requested vector. The point of this is to avoid a page fault if the - // base address happened to be aligned. This works because if the base - // address is aligned, then adding less than a full vector length will - // cause the last vector in the sequence to be (re)loaded. Otherwise, - // the next vector will be fetched as you might suspect was necessary. + // loads into an alignment-based permutation-control instruction (lvsl + // or lvsr), a series of regular vector loads (which always truncate + // their input address to an aligned address), and a series of + // permutations. The results of these permutations are the requested + // loaded values. The trick is that the last "extra" load is not taken + // from the address you might suspect (sizeof(vector) bytes after the + // last requested load), but rather sizeof(vector) - 1 bytes after the + // last requested vector. The point of this is to avoid a page fault if + // the base address happened to be aligned. This works because if the + // base address is aligned, then adding less than a full vector length + // will cause the last vector in the sequence to be (re)loaded. + // Otherwise, the next vector will be fetched as you might suspect was + // necessary. 
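// The DAGCombineFastRecip / DAGCombineFastRecipFSQRT combines above refine a
// hardware estimate with Newton-Raphson steps.  Convergence is quadratic, so
// a 2^-5-accurate estimate needs three steps to cover the 23 fraction bits of
// an f32, while one step suffices once the estimate is 2^-14-accurate
// (hasRecipPrec()); f64 takes one extra step.  A scalar model of the two
// refinements (plain C++, for illustration only):

float refineRecip(float X, float Est, int Iterations) {
  for (int i = 0; i < Iterations; ++i)
    Est = Est + Est * (1.0f - X * Est);        // e' = e * (2 - x*e)
  return Est;
}

float refineRSqrt(float X, float Est, int Iterations) {
  for (int i = 0; i < Iterations; ++i)
    Est = Est * (1.5f - 0.5f * X * Est * Est); // e' = e * (1.5 - 0.5*x*e^2)
  return Est;
}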
// We might be able to reuse the permutation generation from // a different base address offset from this one by an aligned amount. // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this // optimization later. - SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr, - DAG, dl, MVT::v16i8); + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8); // Refine the alignment of the original load (a "new" load created here // which was identical to the first except for the alignment would be @@ -8121,8 +8223,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (ExtraLoad.getValueType() != MVT::v4i32) ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad); - SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, - BaseLoad, ExtraLoad, PermCntl, DAG, dl); + // Because vperm has a big-endian bias, we must reverse the order + // of the input vectors and complement the permute control vector + // when generating little endian code. We have already handled the + // latter by using lvsr instead of lvsl, so just reverse BaseLoad + // and ExtraLoad here. + SDValue Perm; + if (isLittleEndian) + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + ExtraLoad, BaseLoad, PermCntl, DAG, dl); + else + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != MVT::v4i32) Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm); @@ -8162,9 +8274,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } break; - case ISD::INTRINSIC_WO_CHAIN: - if (cast(N->getOperand(0))->getZExtValue() == - Intrinsic::ppc_altivec_lvsl && + case ISD::INTRINSIC_WO_CHAIN: { + bool isLittleEndian = Subtarget.isLittleEndian(); + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + if (cast(N->getOperand(0))->getZExtValue() == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); @@ -8176,8 +8291,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast(UI->getOperand(0))->getZExtValue() == - Intrinsic::ppc_altivec_lvsl) { - // We've found another LVSL, and this address if an aligned + Intr) { + // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. @@ -8186,6 +8301,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } } + } break; case ISD::BSWAP: @@ -8533,11 +8649,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 - if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 - if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); case 'f': @@ -8569,7 +8685,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // register. // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use // the AsmName field from *RegisterInfo.td, then this would not be necessary. 
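// A scalar model of the unaligned-load expansion handled above: lvsl (or lvsr
// on little-endian targets, paired with swapped vperm operands) encodes the
// low four address bits as a byte select, and two aligned loads plus that
// select reconstruct the unaligned vector.  The "extra" load is taken
// sizeof(vector)-1 bytes past the requested address so that an already
// aligned base re-reads the same block instead of touching the next page.
// Sketch in plain C++ (big-endian byte numbering; not the LLVM API):

#include <cstdint>

void loadUnalignedV16(const uint8_t *Addr, uint8_t Out[16]) {
  uintptr_t P = reinterpret_cast<uintptr_t>(Addr);
  const uint8_t *Lo = reinterpret_cast<const uint8_t *>(P & ~uintptr_t(15));
  const uint8_t *Hi = reinterpret_cast<const uint8_t *>((P + 15) & ~uintptr_t(15));
  unsigned Shift = P & 15;        // what lvsl folds into its control vector
  for (unsigned i = 0; i != 16; ++i)
    Out[i] = (Shift + i < 16) ? Lo[Shift + i] : Hi[Shift + i - 16];
}

// Only Addr & 15 matters here, which is also why the INTRINSIC_WO_CHAIN
// combine above may reuse one lvsl/lvsr result for any address that differs
// from it by a multiple of 16.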
- if (R.first && VT == MVT::i64 && PPCSubTarget.isPPC64() && + if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) { const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); return std::make_pair(TRI->getMatchingSuperReg(R.first, @@ -8703,8 +8819,8 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, // the stack. PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setLRStoreRequired(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -8758,8 +8874,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // this table could be generated automatically from RegInfo. unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT) const { - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) @@ -8800,7 +8916,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { - if (this->PPCSubTarget.isPPC64()) { + if (Subtarget.isPPC64()) { return MVT::i64; } else { return MVT::i32; @@ -8859,7 +8975,7 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return false; if (VT.getSimpleVT().isVector()) { - if (PPCSubTarget.hasVSX()) { + if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64) return false; } else { @@ -8903,7 +9019,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( } Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { - if (DisableILPPref || PPCSubTarget.enableMachineScheduler()) + if (DisableILPPref || Subtarget.enableMachineScheduler()) return TargetLowering::getSchedulingPreference(N); return Sched::ILP; diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 080ef5d0f76f..2b69208fea7e 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -18,7 +18,6 @@ #include "PPC.h" #include "PPCInstrInfo.h" #include "PPCRegisterInfo.h" -#include "PPCSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" @@ -71,19 +70,14 @@ namespace llvm { TOC_ENTRY, - /// The following three target-specific nodes are used for calls through + /// The following two target-specific nodes are used for calls through /// function pointers in the 64-bit SVR4 ABI. - /// Restore the TOC from the TOC save area of the current stack frame. - /// This is basically a hard coded load instruction which additionally - /// takes/produces a flag. - TOC_RESTORE, - /// Like a regular LOAD but additionally taking/producing a flag. LOAD, - /// LOAD into r2 (also taking/producing a flag). Like TOC_RESTORE, this is - /// a hard coded load instruction. + /// Like LOAD (taking/producing a flag), but using r2 as hard-coded + /// destination. LOAD_TOC, /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) @@ -303,25 +297,27 @@ namespace llvm { namespace PPC { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. 
- bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary); + bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG); /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. - bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary); + bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG); /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary); + bool isUnary, SelectionDAG &DAG); /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary); + bool isUnary, SelectionDAG &DAG); /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. - int isVSLDOIShuffleMask(SDNode *N, bool isUnary); + int isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG); /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to @@ -334,7 +330,7 @@ namespace llvm { /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. - unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize); + unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG); /// get_VSPLTI_elt - If this is a build_vector of constants which can be /// formed by using a vspltis[bhw] instruction of the specified element @@ -343,8 +339,9 @@ namespace llvm { SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); } + class PPCSubtarget; class PPCTargetLowering : public TargetLowering { - const PPCSubtarget &PPCSubTarget; + const PPCSubtarget &Subtarget; public: explicit PPCTargetLowering(PPCTargetMachine &TM); diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index b71c09ea8e12..9318f70f4305 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -802,17 +802,11 @@ def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), [(set i64:$rD, (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64; -let hasSideEffects = 1, isCodeGenOnly = 1 in { -let RST = 2, DS = 2 in -def LDinto_toc: DSForm_1a<58, 0, (outs), (ins g8rc:$reg), - "ld 2, 8($reg)", IIC_LdStLD, - [(PPCload_toc i64:$reg)]>, isPPC64; - -let RST = 2, DS = 10, RA = 1 in -def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins), - "ld 2, 40(1)", IIC_LdStLD, - [(PPCtoc_restore)]>, isPPC64; -} +let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2 in +def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src), + "ld 2, $src", IIC_LdStLD, + [(PPCload_toc ixaddr:$src)]>, isPPC64; + def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src), "ldx $rD, $src", IIC_LdStLD, [(set i64:$rD, (load xaddr:$src))]>, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 2fd4a3eeae7d..dce46d84e6e1 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -22,111 +22,127 @@ def vnot_ppc : PatFrag<(ops node:$in), def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return 
PPC::isVPKUHUMShuffleMask(cast(N), false); + return PPC::isVPKUHUMShuffleMask(cast(N), false, + *CurDAG); }]>; def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUWUMShuffleMask(cast(N), false); + return PPC::isVPKUWUMShuffleMask(cast(N), false, + *CurDAG); }]>; def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUHUMShuffleMask(cast(N), true); + return PPC::isVPKUHUMShuffleMask(cast(N), true, + *CurDAG); }]>; def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUWUMShuffleMask(cast(N), true); + return PPC::isVPKUWUMShuffleMask(cast(N), true, + *CurDAG); }]>; def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast(N), 1, false); + return PPC::isVMRGLShuffleMask(cast(N), 1, false, + *CurDAG); }]>; def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast(N), 2, false); + return PPC::isVMRGLShuffleMask(cast(N), 2, false, + *CurDAG); }]>; def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast(N), 4, false); + return PPC::isVMRGLShuffleMask(cast(N), 4, false, + *CurDAG); }]>; def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast(N), 1, false); + return PPC::isVMRGHShuffleMask(cast(N), 1, false, + *CurDAG); }]>; def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast(N), 2, false); + return PPC::isVMRGHShuffleMask(cast(N), 2, false, + *CurDAG); }]>; def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast(N), 4, false); + return PPC::isVMRGHShuffleMask(cast(N), 4, false, + *CurDAG); }]>; def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast(N), 1, true); + return PPC::isVMRGLShuffleMask(cast(N), 1, true, + *CurDAG); }]>; def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast(N), 2, true); + return PPC::isVMRGLShuffleMask(cast(N), 2, true, + *CurDAG); }]>; def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast(N), 4, true); + return PPC::isVMRGLShuffleMask(cast(N), 4, true, + *CurDAG); }]>; def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast(N), 1, true); + return PPC::isVMRGHShuffleMask(cast(N), 1, true, + *CurDAG); }]>; def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast(N), 2, true); + return PPC::isVMRGHShuffleMask(cast(N), 2, true, + *CurDAG); }]>; def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast(N), 4, true); + return PPC::isVMRGHShuffleMask(cast(N), 4, true, + *CurDAG); }]>; def VSLDOI_get_imm : SDNodeXForm; def 
vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVSLDOIShuffleMask(N, false) != -1; + return PPC::isVSLDOIShuffleMask(N, false, *CurDAG) != -1; }], VSLDOI_get_imm>; /// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into /// vector_shuffle(X,undef,mask) by the dag combiner. def VSLDOI_unary_get_imm : SDNodeXForm; def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVSLDOIShuffleMask(N, true) != -1; + return PPC::isVSLDOIShuffleMask(N, true, *CurDAG) != -1; }], VSLDOI_unary_get_imm>; // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. def VSPLTB_get_imm : SDNodeXForm; def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast(N), 1); }], VSPLTB_get_imm>; def VSPLTH_get_imm : SDNodeXForm; def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast(N), 2); }], VSPLTH_get_imm>; def VSPLTW_get_imm : SDNodeXForm; def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ @@ -223,7 +239,7 @@ class VX2_Int_Ty2 xo, string opc, Intrinsic IntID, ValueType OutTy, //===----------------------------------------------------------------------===// // Instruction Definitions. -def HasAltivec : Predicate<"PPCSubTarget.hasAltivec()">; +def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">; let Predicates = [HasAltivec] in { let isCodeGenOnly = 1 in { diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 7fed2c65da73..1e4396cd1017 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -360,20 +360,6 @@ class DSForm_1 opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, let Inst{30-31} = xo; } -class DSForm_1a opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list pattern> - : I { - bits<5> RST; - bits<14> DS; - bits<5> RA; - - let Pattern = pattern; - - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-29} = DS; - let Inst{30-31} = xo; -} // 1.7.6 X-Form class XForm_base_r3xo opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 0c15e2d02320..431cfd754997 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" @@ -60,23 +61,25 @@ cl::Hidden); // Pin the vtable to this file. void PPCInstrInfo::anchor() {} -PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) - : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), - TM(tm), RI(*TM.getSubtargetImpl()) {} +PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) + : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + Subtarget(STI), RI(STI) {} /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for /// this target when scheduling the DAG. 
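// The *CurDAG argument threaded through the shuffle PatFrags above exists so
// that the PPC::is*ShuffleMask() predicates can ask whether the target is
// little endian: the set of legal vpkuhum/vpkuwum/vmrg* masks differs by
// endianness.  A sketch of one way such a predicate can reach that fact from
// the DAG (hedged: the real checks live in PPCISelLowering.cpp and may query
// the subtarget through a different path):

static bool isLittleEndianQuery(SelectionDAG &DAG) {
  return DAG.getTarget().getDataLayout()->isLittleEndian();
}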
-ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( - const TargetMachine *TM, - const ScheduleDAG *DAG) const { - unsigned Directive = TM->getSubtarget().getDarwinDirective(); +ScheduleHazardRecognizer * +PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { + unsigned Directive = + static_cast(STI)->getDarwinDirective(); if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 || Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) { - const InstrItineraryData *II = TM->getInstrItineraryData(); + const InstrItineraryData *II = + &static_cast(STI)->getInstrItineraryData(); return new ScoreboardHazardRecognizer(II, DAG); } - return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG); + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); } /// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer @@ -84,7 +87,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( const InstrItineraryData *II, const ScheduleDAG *DAG) const { - unsigned Directive = TM.getSubtarget().getDarwinDirective(); + unsigned Directive = + DAG->TM.getSubtarget().getDarwinDirective(); if (Directive == PPC::DIR_PWR7) return new PPCDispatchGroupSBHazardRecognizer(II, DAG); @@ -92,9 +96,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( // Most subtargets use a PPC970 recognizer. if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 && Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) { - assert(TM.getInstrInfo() && "No InstrInfo?"); + assert(DAG->TII && "No InstrInfo?"); - return new PPCHazardRecognizer970(TM); + return new PPCHazardRecognizer970(*DAG); } return new ScoreboardHazardRecognizer(II, DAG); @@ -129,7 +133,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // On some cores, there is an additional delay between writing to a condition // register, and using it from a branch. - unsigned Directive = TM.getSubtarget().getDarwinDirective(); + unsigned Directive = Subtarget.getDarwinDirective(); switch (Directive) { default: break; case PPC::DIR_7400: @@ -313,7 +317,7 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { // This function is used for scheduling, and the nop wanted here is the type // that terminates dispatch groups on the POWER cores. - unsigned Directive = TM.getSubtarget().getDarwinDirective(); + unsigned Directive = Subtarget.getDarwinDirective(); unsigned Opcode; switch (Directive) { default: Opcode = PPC::NOP; break; @@ -332,7 +336,7 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); @@ -538,7 +542,7 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, assert((Cond.size() == 2 || Cond.size() == 0) && "PPC branch conditions have two components!"); - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); // One-way branch. 
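// The two CreateTarget*HazardRecognizer hooks above encode a small policy
// table; condensed here with the LLVM plumbing stripped out (the enums below
// stand in for the getDarwinDirective() values and recognizer classes, they
// are not the real types):

enum class ToyCore { PWR7, PPC440, PPCA2, E500mc, E5500, Other };
enum class HazardModel { Scoreboard, DispatchGroupSB, PPC970, GenericDefault };

// Pre-RA: in-order embedded cores get a scoreboard over the itineraries,
// everything else defers to TargetInstrInfo's default recognizer.
HazardModel pickPreRARecognizer(ToyCore Core) {
  if (Core == ToyCore::PPC440 || Core == ToyCore::PPCA2 ||
      Core == ToyCore::E500mc || Core == ToyCore::E5500)
    return HazardModel::Scoreboard;
  return HazardModel::GenericDefault;
}

// Post-RA: POWER7 gets the dispatch-group recognizer, the same embedded cores
// keep the scoreboard, and most other subtargets use the PPC970 model.
HazardModel pickPostRARecognizer(ToyCore Core) {
  if (Core == ToyCore::PWR7)
    return HazardModel::DispatchGroupSB;
  if (Core == ToyCore::PPC440 || Core == ToyCore::PPCA2 ||
      Core == ToyCore::E500mc || Core == ToyCore::E5500)
    return HazardModel::Scoreboard;
  return HazardModel::PPC970;
}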
if (!FBB) { @@ -579,7 +583,7 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { - if (!TM.getSubtargetImpl()->hasISEL()) + if (!Subtarget.hasISEL()) return false; if (Cond.size() != 2) @@ -623,7 +627,7 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, assert(Cond.size() == 2 && "PPC branch conditions have two components!"); - assert(TM.getSubtargetImpl()->hasISEL() && + assert(Subtarget.hasISEL() && "Cannot insert select on target without ISEL support"); // Get the register classes. @@ -826,7 +830,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, FrameIdx)); NonRI = true; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { - assert(TM.getSubtargetImpl()->isDarwin() && + assert(Subtarget.isDarwin() && "VRSAVE only needs spill/restore on Darwin"); NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE)) .addReg(SrcReg, @@ -921,7 +925,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, FrameIdx)); NonRI = true; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { - assert(TM.getSubtargetImpl()->isDarwin() && + assert(Subtarget.isDarwin() && "VRSAVE only needs spill/restore on Darwin"); NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::RESTORE_VRSAVE), @@ -1035,7 +1039,7 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned ZeroReg; if (UseInfo->isLookupPtrRegClass()) { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO; } else { ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ? @@ -1102,7 +1106,7 @@ bool PPCInstrInfo::PredicateInstruction( unsigned OpC = MI->getOpcode(); if (OpC == PPC::BLR) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); MI->setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR))); @@ -1124,7 +1128,7 @@ bool PPCInstrInfo::PredicateInstruction( return true; } else if (OpC == PPC::B) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); MI->setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (isPPC64 ? PPC::BDZ8 : PPC::BDZ))); @@ -1162,7 +1166,7 @@ bool PPCInstrInfo::PredicateInstruction( llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8; - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) : @@ -1323,7 +1327,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, // for equality checks (as those don't depend on the sign). On PPC64, // we are restricted to equality for unsigned 64-bit comparisons and for // signed 32-bit comparisons the applicability is more restricted. 
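// The applicability comment above is the crux of optimizeCompareInstr: a
// record-form (dot) instruction sets CR0 from a *signed* compare of its
// result with zero, so when the original compare was unsigned only the EQ bit
// can be trusted.  A small standalone demonstration of why ordering breaks
// but equality holds:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0x80000000u;               // negative if reinterpreted signed
  bool RecordFormLT = (int32_t)A < 0;     // what CR0.LT would report
  bool UnsignedLT = false;                // "A <u 0" is never true
  assert(RecordFormLT != UnsignedLT);     // ordering bits disagree...
  assert(((int32_t)A == 0) == (A == 0));  // ...but EQ means the same thing
  return 0;
}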
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); bool is32BitSignedCompare = OpC == PPC::CMPWI || OpC == PPC::CMPW; bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW; bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD; @@ -1813,9 +1817,14 @@ namespace { public: bool runOnMachineFunction(MachineFunction &MF) override { + TM = static_cast(&MF.getTarget()); + // If we don't have VSX then go ahead and return without doing + // anything. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; + LIS = &getAnalysis(); - TM = static_cast(&MF.getTarget()); TII = TM->getInstrInfo(); bool Changed = false; @@ -1966,6 +1975,9 @@ namespace { public: bool runOnMachineFunction(MachineFunction &MF) override { TM = static_cast(&MF.getTarget()); + // If we don't have VSX on the subtarget, don't do anything. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; TII = TM->getInstrInfo(); bool Changed = false; @@ -2040,6 +2052,9 @@ namespace { public: bool runOnMachineFunction(MachineFunction &MF) override { TM = static_cast(&MF.getTarget()); + // If we don't have VSX don't bother doing anything here. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; TII = TM->getInstrInfo(); bool Changed = false; diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index d9db3e1b1dc5..83f14c6cf214 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -65,7 +65,7 @@ enum PPC970_Unit { class PPCInstrInfo : public PPCGenInstrInfo { - PPCTargetMachine &TM; + PPCSubtarget &Subtarget; const PPCRegisterInfo RI; bool StoreRegToStackSlot(MachineFunction &MF, @@ -80,7 +80,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { bool &NonRI, bool &SpillsVRS) const; virtual void anchor(); public: - explicit PPCInstrInfo(PPCTargetMachine &TM); + explicit PPCInstrInfo(PPCSubtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -89,7 +89,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { const PPCRegisterInfo &getRegisterInfo() const { return RI; } ScheduleHazardRecognizer * - CreateTargetHazardRecognizer(const TargetMachine *TM, + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 1d984aba1faf..c2e3382b3e79 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -141,9 +141,6 @@ def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>, def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>, [SDNPHasChain, SDNPSideEffect, SDNPInGlue, SDNPOutGlue]>; -def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>, - [SDNPHasChain, SDNPSideEffect, - SDNPInGlue, SDNPOutGlue]>; def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone, @@ -610,10 +607,10 @@ def iaddroff : ComplexPattern; //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. 
-def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">; -def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">; -def IsBookE : Predicate<"PPCSubTarget.isBookE()">; -def IsNotBookE : Predicate<"!PPCSubTarget.isBookE()">; +def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">; +def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">; +def IsBookE : Predicate<"PPCSubTarget->isBookE()">; +def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">; //===----------------------------------------------------------------------===// // PowerPC Multiclass Definitions. diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 9cc919ebe383..49bcc4876d33 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -39,7 +39,7 @@ multiclass XX3Form_Rcr opcode, bits<7> xo, dag OOL, dag IOL, } } -def HasVSX : Predicate<"PPCSubTarget.hasVSX()">; +def HasVSX : Predicate<"PPCSubTarget->hasVSX()">; let Predicates = [HasVSX] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. let neverHasSideEffects = 1 in { // VSX instructions don't have side effects. diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index 7bbc71bd1fa9..e5f113a0c030 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -13,7 +13,7 @@ #include "PPCJITInfo.h" #include "PPCRelocations.h" -#include "PPCTargetMachine.h" +#include "PPCSubtarget.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -25,6 +25,11 @@ using namespace llvm; static TargetJITInfo::JITCompilerFn JITCompilerFunction; +PPCJITInfo::PPCJITInfo(PPCSubtarget &STI) + : Subtarget(STI), is64Bit(STI.isPPC64()) { + useGOT = 0; +} + #define BUILD_ADDIS(RD,RS,IMM16) \ ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535)) #define BUILD_ORI(RD,RS,UIMM16) \ @@ -393,7 +398,7 @@ void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn, JCE.emitWordBE(0xf821ffb1); // stdu r1,-80(r1) JCE.emitWordBE(0x7d6802a6); // mflr r11 JCE.emitWordBE(0xf9610060); // std r11, 96(r1) - } else if (TM.getSubtargetImpl()->isDarwinABI()){ + } else if (Subtarget.isDarwinABI()){ JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1) JCE.emitWordBE(0x7d6802a6); // mflr r11 JCE.emitWordBE(0x91610028); // stw r11, 40(r1) diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h index 0693e3e86cd5..b6b37ffb852b 100644 --- a/lib/Target/PowerPC/PPCJITInfo.h +++ b/lib/Target/PowerPC/PPCJITInfo.h @@ -18,32 +18,29 @@ #include "llvm/Target/TargetJITInfo.h" namespace llvm { - class PPCTargetMachine; +class PPCSubtarget; +class PPCJITInfo : public TargetJITInfo { +protected: + PPCSubtarget &Subtarget; + bool is64Bit; - class PPCJITInfo : public TargetJITInfo { - protected: - PPCTargetMachine &TM; - bool is64Bit; - public: - PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) { - useGOT = 0; - is64Bit = tmIs64Bit; - } +public: + PPCJITInfo(PPCSubtarget &STI); - StubLayout getStubLayout() override; - void *emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE) override; - LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; - void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) override; + StubLayout getStubLayout() override; + void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) override; + LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; + void relocate(void *Function, 
MachineRelocation *MR, unsigned NumRelocs, + unsigned char *GOTBase) override; - /// replaceMachineCodeForFunction - Make it so that calling the function - /// whose machine code is at OLD turns into a call to NEW, perhaps by - /// overwriting OLD with a branch to NEW. This is used for self-modifying - /// code. - /// - void replaceMachineCodeForFunction(void *Old, void *New) override; - }; + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + void replaceMachineCodeForFunction(void *Old, void *New) override; +}; } #endif diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index f742f726186b..dc1674214769 100644 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -16,9 +16,7 @@ using namespace llvm; #define DEBUG_TYPE "powerpc-selectiondag-info" -PPCSelectionDAGInfo::PPCSelectionDAGInfo(const PPCTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} +PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL) + : TargetSelectionDAGInfo(DL) {} -PPCSelectionDAGInfo::~PPCSelectionDAGInfo() { -} +PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {} diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h index 341b69cdfb5f..b2e7f3b5f2ac 100644 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -22,7 +22,7 @@ class PPCTargetMachine; class PPCSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit PPCSelectionDAGInfo(const PPCTargetMachine &TM); + explicit PPCSelectionDAGInfo(const DataLayout *DL); ~PPCSelectionDAGInfo(); }; diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index ea9daee4f8e8..0759200ce353 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -32,15 +32,57 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "PPCGenSubtargetInfo.inc" -PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit, - CodeGenOpt::Level OptLevel) - : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT), - OptLevel(OptLevel) { +/// Return the datalayout string of a subtarget. +static std::string getDataLayoutString(const PPCSubtarget &ST) { + const Triple &T = ST.getTargetTriple(); + + std::string Ret; + + // Most PPC* platforms are big endian, PPC64LE is little endian. + if (ST.isLittleEndian()) + Ret = "e"; + else + Ret = "E"; + + Ret += DataLayout::getManglingComponent(T); + + // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit + // pointers. + if (!ST.isPPC64() || T.getOS() == Triple::Lv2) + Ret += "-p:32:32"; + + // Note, the alignment values for f64 and i64 on ppc64 in Darwin + // documentation are wrong; these are correct (i.e. "what gcc does"). + if (ST.isPPC64() || ST.isSVR4ABI()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. 
+ if (ST.isPPC64()) + Ret += "-n32:64"; + else + Ret += "-n32"; + + return Ret; +} + +PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); + return *this; } +PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, PPCTargetMachine &TM, + bool is64Bit, CodeGenOpt::Level OptLevel) + : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT), + OptLevel(OptLevel), + FrameLowering(initializeSubtargetDependencies(CPU, FS)), + DL(getDataLayoutString(*this)), InstrInfo(*this), JITInfo(*this), + TLInfo(TM), TSInfo(&DL) {} + /// SetJITMode - This is called to inform the subtarget info that we are /// producing code for the JIT. void PPCSubtarget::SetJITMode() { @@ -156,6 +198,11 @@ void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // Determine endianness. IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le); + + // FIXME: For now, we disable VSX in little-endian mode until endian + // issues in those instructions can be addressed. + if (IsLittleEndian) + HasVSX = false; } /// hasLazyResolverStub - Return true if accesses to the specified global have diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 76f4a318e82d..2d8399e78110 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -14,7 +14,13 @@ #ifndef POWERPCSUBTARGET_H #define POWERPCSUBTARGET_H +#include "PPCFrameLowering.h" +#include "PPCInstrInfo.h" +#include "PPCISelLowering.h" +#include "PPCJITInfo.h" +#include "PPCSelectionDAGInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" #include @@ -102,12 +108,19 @@ class PPCSubtarget : public PPCGenSubtargetInfo { /// OptLevel - What default optimization level we're emitting code for. CodeGenOpt::Level OptLevel; + PPCFrameLowering FrameLowering; + const DataLayout DL; + PPCInstrInfo InstrInfo; + PPCJITInfo JITInfo; + PPCTargetLowering TLInfo; + PPCSelectionDAGInfo TSInfo; + public: /// This constructor initializes the data members to match that /// of the specified triple. /// PPCSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit, + const std::string &FS, PPCTargetMachine &TM, bool is64Bit, CodeGenOpt::Level OptLevel); /// ParseSubtargetFeatures - Parses features string setting specified @@ -127,10 +140,21 @@ class PPCSubtarget : public PPCGenSubtargetInfo { /// unsigned getDarwinDirective() const { return DarwinDirective; } - /// getInstrItins - Return the instruction itineraies based on subtarget + /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + const PPCFrameLowering *getFrameLowering() const { return &FrameLowering; } + const DataLayout *getDataLayout() const { return &DL; } + const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; } + PPCJITInfo *getJITInfo() { return &JITInfo; } + const PPCTargetLowering *getTargetLowering() const { return &TLInfo; } + const PPCSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization. 
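// initializeSubtargetDependencies() exists so the feature string is parsed
// before the members that depend on it (FrameLowering, the DataLayout string,
// and so on) are constructed in the initializer list above.  A minimal
// standalone illustration of the idiom with toy types (not the PPC classes):

#include <string>

struct ToyFrameLowering {
  bool UsesFeatX;
  explicit ToyFrameLowering(bool FeatX) : UsesFeatX(FeatX) {}
};

class ToySubtarget {
  bool FeatX = false;              // "feature bits", declared first
  ToyFrameLowering FrameLowering;  // needs the parsed features

  ToySubtarget &initializeSubtargetDependencies(const std::string &FS) {
    FeatX = (FS.find("+featx") != std::string::npos);
    return *this;
  }

public:
  explicit ToySubtarget(const std::string &FS)
      // The call runs before FrameLowering is built, because members are
      // initialized in declaration order and FeatX precedes FrameLowering.
      : FrameLowering(initializeSubtargetDependencies(FS).FeatX) {}
};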
+ PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + /// \brief Reset the features for the PowerPC target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: @@ -205,6 +229,8 @@ class PPCSubtarget : public PPCGenSubtargetInfo { TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const override; + bool enableEarlyIfConversion() const override { return hasISEL(); } + // Scheduling customization. bool enableMachineScheduler() const override; void overrideSchedPolicy(MachineSchedPolicy &Policy, diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index fdfb8c9bfc9d..9563b9045c39 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -37,53 +37,12 @@ extern "C" void LLVMInitializePowerPCTarget() { RegisterTargetMachine C(ThePPC64LETarget); } -/// Return the datalayout string of a subtarget. -static std::string getDataLayoutString(const PPCSubtarget &ST) { - const Triple &T = ST.getTargetTriple(); - - std::string Ret; - - // Most PPC* platforms are big endian, PPC64LE is little endian. - if (ST.isLittleEndian()) - Ret = "e"; - else - Ret = "E"; - - Ret += DataLayout::getManglingComponent(T); - - // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit - // pointers. - if (!ST.isPPC64() || T.getOS() == Triple::Lv2) - Ret += "-p:32:32"; - - // Note, the alignment values for f64 and i64 on ppc64 in Darwin - // documentation are wrong; these are correct (i.e. "what gcc does"). - if (ST.isPPC64() || ST.isSVR4ABI()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. - if (ST.isPPC64()) - Ret += "-n32:64"; - else - Ret += "-n32"; - - return Ret; -} - -PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64Bit, OL), - DL(getDataLayoutString(Subtarget)), InstrInfo(*this), - FrameLowering(Subtarget), JITInfo(*this, is64Bit), - TLInfo(*this), TSInfo(*this), - InstrItins(Subtarget.getInstrItineraryData()) { + CodeGenOpt::Level OL, bool is64Bit) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, is64Bit, OL) { initAsmInfo(); } @@ -148,12 +107,8 @@ bool PPCPassConfig::addPreISel() { } bool PPCPassConfig::addILPOpts() { - if (getPPCSubtarget().hasISEL()) { - addPass(&EarlyIfConverterID); - return true; - } - - return false; + addPass(&EarlyIfConverterID); + return true; } bool PPCPassConfig::addInstSelector() { @@ -165,25 +120,19 @@ bool PPCPassConfig::addInstSelector() { addPass(createPPCCTRLoopsVerify()); #endif - if (getPPCSubtarget().hasVSX()) - addPass(createPPCVSXCopyPass()); - + addPass(createPPCVSXCopyPass()); return false; } bool PPCPassConfig::addPreRegAlloc() { - if (getPPCSubtarget().hasVSX()) { - initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); - insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID, - &PPCVSXFMAMutateID); - } - + initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); + insertPass(VSXFMAMutateEarly ? 
&RegisterCoalescerID : &MachineSchedulerID, + &PPCVSXFMAMutateID); return false; } bool PPCPassConfig::addPreSched2() { - if (getPPCSubtarget().hasVSX()) - addPass(createPPCVSXCopyCleanupPass()); + addPass(createPPCVSXCopyCleanupPass()); if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 9e924945d8fa..3cf8063b70ef 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -29,13 +29,6 @@ namespace llvm { /// class PPCTargetMachine : public LLVMTargetMachine { PPCSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - PPCInstrInfo InstrInfo; - PPCFrameLowering FrameLowering; - PPCJITInfo JITInfo; - PPCTargetLowering TLInfo; - PPCSelectionDAGInfo TSInfo; - InstrItineraryData InstrItins; public: PPCTargetMachine(const Target &T, StringRef TT, @@ -43,25 +36,29 @@ class PPCTargetMachine : public LLVMTargetMachine { Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit); - const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const PPCFrameLowering *getFrameLowering() const override { - return &FrameLowering; + const PPCInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); } - PPCJITInfo *getJITInfo() override { return &JITInfo; } + const PPCFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); + } + PPCJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } const PPCTargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const PPCSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - const PPCRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + const PPCRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); } - const DataLayout *getDataLayout() const override { return &DL; } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } // Pass Pipeline Configuration diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2cc1dfc02cc6..007901b23e0c 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -29,7 +29,7 @@ static cl::opt DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. 
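// The pass-config hunks above drop the hasISEL()/hasVSX() checks because the
// gating moved: EarlyIfConverter consults the enableEarlyIfConversion()
// override added in PPCSubtarget.h, and the VSX passes bail out of
// runOnMachineFunction() themselves when hasVSX() is false.  The hook pattern
// in miniature (toy classes, not the real TargetSubtargetInfo interface):

struct ToySubtargetInfo {
  virtual ~ToySubtargetInfo() = default;
  virtual bool enableEarlyIfConversion() const { return false; }
};

struct ToyPPCSubtarget : ToySubtargetInfo {
  bool HasISEL = true;
  bool enableEarlyIfConversion() const override { return HasISEL; }
};

// A pass added unconditionally can still be a per-function no-op:
bool runEarlyIfConversion(const ToySubtargetInfo &ST) {
  if (!ST.enableEarlyIfConversion())
    return false;   // nothing to do on subtargets without isel
  // ...perform if-conversion...
  return true;
}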
namespace llvm { void initializePPCTTIPass(PassRegistry &); diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 5d0cf81c4025..f92bde853770 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -17,6 +17,7 @@ namespace llvm { class AMDGPUInstrPrinter; +class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; class MCAsmInfo; @@ -47,8 +48,8 @@ void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; // Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); /// \brief Creates an AMDGPU-specific Target Transformation Info pass. @@ -80,8 +81,8 @@ enum AddressSpaces { GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. - REGION_ADDRESS = 4, ///< Address space for region memory. - ADDRESS_NONE = 5, ///< Address space for unknown memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) @@ -106,7 +107,8 @@ enum AddressSpaces { CONSTANT_BUFFER_13 = 21, CONSTANT_BUFFER_14 = 22, CONSTANT_BUFFER_15 = 23, - LAST_ADDRESS = 24 + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 2edc1150203e..d3dff531a7f6 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -7,8 +7,7 @@ // //==-----------------------------------------------------------------------===// -// Include AMDIL TD files -include "AMDILBase.td" +include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // Subtarget Features @@ -87,28 +86,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; +class SubtargetFeatureLocalMemorySize : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast(Value), + "The size of local memory in bytes">; + class SubtargetFeatureGeneration Implies> : SubtargetFeature ; +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8]>; + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64]>; + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - 
[Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -120,6 +131,10 @@ def AMDGPU : Target { let InstructionSet = AMDGPUInstrInfo; } +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + //===----------------------------------------------------------------------===// // Predicate helper class //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 170f47905744..2da7792a88a6 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -19,6 +19,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp deleted file mode 100644 index 91aeee2fc57b..000000000000 --- a/lib/Target/R600/AMDGPUConvertToISA.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass lowers AMDIL machine instructions to the appropriate -/// hardware instructions. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" - -using namespace llvm; - -namespace { - -class AMDGPUConvertToISAPass : public MachineFunctionPass { - -private: - static char ID; - TargetMachine &TM; - -public: - AMDGPUConvertToISAPass(TargetMachine &tm) : - MachineFunctionPass(ID), TM(tm) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override {return "AMDGPU Convert to ISA";} - -}; - -} // End anonymous namespace - -char AMDGPUConvertToISAPass::ID = 0; - -FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { - return new AMDGPUConvertToISAPass(tm); -} - -bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { - const AMDGPUInstrInfo * TII = - static_cast(TM.getInstrInfo()); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); - } - } - return false; -} diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp index e7e90d3184de..9e8302ec0a1b 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -83,7 +83,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); OffsetBytes += MFI->getObjectSize(i); - // Each regiter holds 4 bytes, so we must always align the offset to at + // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); } diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index f1f0bfa89ac0..b4e86ce3a1a3 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -14,6 +14,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "SIISelLowering.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -256,6 +257,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { }; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPURegisterInfo *TRI = @@ -264,7 +267,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { static_cast(TM.getRegisterInfo()); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - assert(VT.getVectorElementType().bitsEq(MVT::i32)); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { bool UseVReg = true; for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); @@ -305,7 +309,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // can't be bundled by our scheduler. 
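// The FeatureLocalMemorySize* subtarget features introduced above give passes
// such as the new AMDGPUPromoteAlloca a concrete LDS budget: 0 bytes on
// R600/R700, 32 KiB from Evergreen through Southern Islands, and 64 KiB on
// Sea Islands.  A sketch of the kind of check such a pass needs (the helper
// name and policy are illustrative, not the actual pass):

#include <cstdint>

bool fitsInLocalMemory(uint64_t AllocaBytes, uint64_t AlreadyUsedBytes,
                       uint64_t LocalMemorySize) {
  if (LocalMemorySize == 0)
    return false;   // no usable LDS on this generation
  return AlreadyUsedBytes + AllocaBytes <= LocalMemorySize;
}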
switch(NumVectorElts) { case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } } @@ -313,8 +322,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32); if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, - VT.getVectorElementType(), + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), RegClass); } @@ -323,11 +331,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // 16 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SmallVector RegSeqArgs(N->getNumOperands() * 2 + 1); + SmallVector RegSeqArgs(NumVectorElts * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32); bool IsRegSeq = true; - for (unsigned i = 0; i < N->getNumOperands(); i++) { + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { // XXX: Why is this here? if (dyn_cast(N->getOperand(i))) { IsRegSeq = false; @@ -337,6 +346,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32); } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. + assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + SDLoc(N), EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32); + } + } + if (!IsRegSeq) break; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 04924cf130a3..dc39bee6511e 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -84,6 +84,25 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, #include "AMDGPUGenCallingConv.inc" +// Find a larger type to do a load / store of a vector with. +EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. 
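// Illustrative sketch (not part of the patch): the type mapping performed by the
// getEquivalentMemType / getEquivalentLoadRegType helpers added around here,
// expressed on raw store sizes. Anything that fits in 32 bits becomes a single
// integer (a full i32 for the load-register case); larger types become <N x i32>.
#include <cassert>
#include <cstdio>

struct EquivType {
  bool IsVector;
  unsigned EltBits;
  unsigned NumElts;
};

static EquivType equivalentMemType(unsigned StoreSizeInBits) {
  if (StoreSizeInBits <= 32)
    return {false, StoreSizeInBits, 1};    // e.g. v2i8 (16 bits) -> i16
  assert(StoreSizeInBits % 32 == 0 && "store size not a multiple of 32");
  return {true, 32, StoreSizeInBits / 32}; // e.g. v4f32 (128 bits) -> v4i32
}

static EquivType equivalentLoadRegType(unsigned StoreSizeInBits) {
  if (StoreSizeInBits <= 32)
    return {false, 32, 1};                 // always load into a full i32
  return {true, 32, StoreSizeInBits / 32};
}

int main() {
  EquivType M = equivalentMemType(128);
  EquivType L = equivalentLoadRegType(16);
  std::printf("mem: %u x i%u, load-reg: i%u\n", M.NumElts, M.EltBits, L.EltBits);
  return 0;
}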
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { @@ -107,9 +126,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - // The hardware supports ROTR, but not ROTL - setOperationAction(ISD::ROTL, MVT::i32, Expand); - // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -202,29 +218,64 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } - setOperationAction(ISD::FNEG, MVT::v2f32, Expand); - setOperationAction(ISD::FNEG, MVT::v4f32, Expand); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. 
+ setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); setOperationAction(ISD::SUB, MVT::i64, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Custom); - setOperationAction(ISD::UDIVREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - static const MVT::SimpleValueType IntTypes[] = { + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; - for (MVT VT : IntTypes) { - //Expand the following operations for the current type by default + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. setOperationAction(ISD::ADD, VT, Expand); setOperationAction(ISD::AND, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); @@ -232,25 +283,41 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::MUL, VT, Expand); setOperationAction(ISD::OR, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); + // TODO: Implement custom UREM / SREM routines. 
+ setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Custom); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); } - static const MVT::SimpleValueType FloatTypes[] = { + static const MVT::SimpleValueType FloatVectorTypes[] = { MVT::v2f32, MVT::v4f32 }; - for (MVT VT : FloatTypes) { + for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); @@ -261,11 +328,29 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); } setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT_CC); + + setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + // There are no integer divide instructions, and these expand to a pretty + // large sequence of instructions. + setIntDivIsCheap(false); + + // TODO: Investigate this when 64-bit divides are implemented. + addBypassSlowDiv(64, 32); + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; } //===----------------------------------------------------------------------===// @@ -276,6 +361,19 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const { return MVT::i32; } +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. 
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) @@ -383,26 +481,31 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } -SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) - const { +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: Op.getNode()->dump(); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; - // AMDIL DAG lowering - case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::SREM: return LowerSREM(Op, DAG); + // AMDGPU DAG lowering. case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - // AMDGPU DAG lowering case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::SREM: return LowerSREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + + // AMDIL DAG lowering. + case ISD::BRCOND: return LowerBRCOND(Op, DAG); } return Op; } @@ -514,6 +617,16 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, } } +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. 
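// Illustrative sketch (assumed, not taken from the patch): the recursion that
// LowerConstantInitializer below performs, modeled with plain byte offsets
// instead of SDValue stores. Scalars emit one store; arrays and vectors recurse
// at i * EltSize; struct fields would instead use the per-field offsets from the
// DataLayout's StructLayout. The toy InitNode type here is hypothetical.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct InitNode {
  bool IsScalar;
  uint64_t ScalarValue;           // used when IsScalar
  unsigned EltSize;               // byte size of each element when !IsScalar
  std::vector<InitNode> Elements; // used when !IsScalar
};

// Collect (byte offset, value) pairs; the real code emits a chain of stores
// into the private-address-space copy of the global.
static void emitInitStores(const InitNode &Init, uint64_t Offset,
                           std::vector<std::pair<uint64_t, uint64_t>> &Out) {
  if (Init.IsScalar) {
    Out.push_back({Offset, Init.ScalarValue});
    return;
  }
  for (size_t I = 0, E = Init.Elements.size(); I != E; ++I)
    emitInitStores(Init.Elements[I], Offset + I * Init.EltSize, Out);
}

int main() {
  // Hypothetical initializer: [2 x i32] [i32 7, i32 9]
  InitNode Init{false, 0, 4, {{true, 7, 0, {}}, {true, 9, 0, {}}}};
  std::vector<std::pair<uint64_t, uint64_t>> Stores;
  emitInitStores(Init, 0, Stores);
  for (const auto &S : Stores)
    std::printf("store %llu at offset %llu\n",
                (unsigned long long)S.second, (unsigned long long)S.first);
  return 0;
}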
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, @@ -521,12 +634,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, SelectionDAG &DAG) const { const DataLayout *TD = getTargetMachine().getDataLayout(); SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + if (const ConstantInt *CI = dyn_cast(Init)) { - EVT VT = EVT::getEVT(CI->getType()); - PointerType *PtrTy = PointerType::get(CI->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(CI->getType())); + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); } if (const ConstantFP *CFP = dyn_cast(Init)) { @@ -537,21 +652,55 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, TD->getPrefTypeAlignment(CFP->getType())); } - if (Init->getType()->isAggregateType()) { + if (StructType *ST = dyn_cast(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + + EVT PtrVT = InitPtr.getValueType(); + SmallVector Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast(InitTy)) { EVT PtrVT = InitPtr.getValueType(); - unsigned NumElements = Init->getType()->getArrayNumElements(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); SmallVector Chains; for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize( - Init->getType()->getArrayElementType()), PtrVT); + SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i), - GV, Ptr, Chain, DAG)); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } + if (isa(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + Init->dump(); llvm_unreachable("Unhandled constant initializer"); } @@ -591,11 +740,19 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Size = TD->getTypeAllocSize(EltType); unsigned Alignment = TD->getPrefTypeAlignment(EltType); - const GlobalVariable *Var = dyn_cast(GV); - const Constant *Init = Var->getInitializer(); + MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); 
+ int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + + const GlobalVariable *Var = cast(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + + const Constant *Init = Var->getInitializer(); SmallVector WorkList; for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), @@ -614,8 +771,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } DAG.UpdateNodeOperands(*I, Ops); } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), - getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); } } } @@ -651,8 +807,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, const AMDGPUFrameLowering *TFL = static_cast(getTargetMachine().getFrameLowering()); - FrameIndexSDNode *FIN = dyn_cast(Op); - assert(FIN); + FrameIndexSDNode *FIN = cast(Op); unsigned FrameIndex = FIN->getIndex(); unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); @@ -668,26 +823,26 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { default: return Op; - case AMDGPUIntrinsic::AMDIL_abs: + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDIL_exp: - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_lrp: return LowerIntrinsicLRP(Op, DAG); - case AMDGPUIntrinsic::AMDIL_fraction: + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDIL_max: - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. 
+ return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_umax: return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDIL_min: - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_imin: return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -703,6 +858,26 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -726,7 +901,13 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDIL_round_nearest: + case AMDGPUIntrinsic::AMDGPU_brev: + return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. 
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); } } @@ -838,7 +1019,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { - StoreSDNode *Store = dyn_cast(Op); + StoreSDNode *Store = cast(Op); EVT MemVT = Store->getMemoryVT(); unsigned MemBits = MemVT.getSizeInBits(); @@ -1052,6 +1233,250 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib, (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL. 
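// Illustrative sketch (not part of the patch): the float-reciprocal signed
// division sequence that LowerSDIV24 above emits, restated as plain C++ for
// narrow operands (the i8/i16 cases it handles; the float path is exact because
// a float mantissa carries 24 bits). truncf and fmaf stand in for the target's
// trunc and mad; the hardware path uses a native divide rather than '/'.
#include <cmath>
#include <cstdio>

static int sdiv24Reference(int A, int B, unsigned BitSize) {
  int JQ = A ^ B;               // sign information of the quotient
  JQ = JQ >> (BitSize - 2);     // negative exactly when the signs differ
  JQ |= 1;                      // force the correction step to +1 or -1
  float FA = (float)A;
  float FB = (float)B;
  float FQ = truncf(FA / FB);   // truncated approximate quotient
  float FR = fmaf(-FQ, FB, FA); // remainder estimate: FA - FQ * FB
  int IQ = (int)FQ;
  // If the remainder reached a full |B|, the truncated quotient was one off.
  bool NeedsFixup = fabsf(FR) >= fabsf(FB);
  return IQ + (NeedsFixup ? JQ : 0);
}

int main() {
  std::printf("%d\n", sdiv24Reference(-100, 7, 16)); // expect -14
  return 0;
}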
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r0, r0, r1 + // ixor r10, r10, r11 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSelectCC(DL, + r0, DAG.getConstant(0, OVT), + DAG.getConstant(-1, OVT), + DAG.getConstant(0, OVT), + ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSelectCC(DL, + r1, DAG.getConstant(0, OVT), + DAG.getConstant(-1, OVT), + DAG.getConstant(0, OVT), + ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r0, r0, r1 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // ixor r10, r10, r11 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType().getScalarType(); + + if (OVT == MVT::i64) + return LowerSDIV64(Op, DAG); + + if (OVT.getScalarType() == MVT::i32) + return LowerSDIV32(Op, DAG); + + if (OVT == MVT::i16 || OVT == MVT::i8) { + // FIXME: We should be checking for the masked bits. This isn't reached + // because i8 and i16 are not legal types. + return LowerSDIV24(Op, DAG); + } + + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSREM32 function generates equivalent to the following IL. 
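// Illustrative sketch (not part of the patch): the sign-fixup sequence from
// LowerSDIV32 above, phrased over uint32_t so the only division performed is
// unsigned, exactly as in the emitted node sequence. The LowerSREM32 code that
// follows uses the same absolute-value trick but corrects the result with the
// dividend's sign only.
#include <cstdint>
#include <cstdio>

static int32_t sdiv32Reference(int32_t A, int32_t B) {
  uint32_t SignA = A < 0 ? ~0u : 0u;              // ilt r10, r0, 0
  uint32_t SignB = B < 0 ? ~0u : 0u;              // ilt r11, r1, 0
  uint32_t AbsA = ((uint32_t)A + SignA) ^ SignA;  // (x + s) ^ s == |x|
  uint32_t AbsB = ((uint32_t)B + SignB) ^ SignB;
  uint32_t Quot = AbsA / AbsB;                    // udiv on the magnitudes
  uint32_t Sign = SignA ^ SignB;                  // negative iff the signs differ
  return (int32_t)((Quot + Sign) ^ Sign);         // conditionally negate
}

int main() {
  std::printf("%d %d\n", sdiv32Reference(-37, 5), sdiv32Reference(37, -5)); // -7 -7
  return 0;
}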
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r20, r0, r1 + // umul r20, r20, r1 + // sub r0, r0, r20 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r20, r0, r1 + SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + + // umul r20, r20, r1 + r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + + // sub r0, r0, r20 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + + if (OVT.getScalarType() == MVT::i64) + return LowerSREM64(Op, DAG); + + if (OVT.getScalarType() == MVT::i32) + return LowerSREM32(Op, DAG); + + return SDValue(Op.getNode(), 0); +} + SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -1156,6 +1581,131 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Ops, DL); } +SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + const SDValue Zero = DAG.getConstant(0, MVT::i32); + const SDValue One = DAG.getConstant(1, MVT::i32); + + SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + // Extract the upper half, since this is where we will find the sign and + // exponent. + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); + + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + + // Extract the exponent. 
+ SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, MVT::i32), + DAG.getConstant(ExpBits, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, MVT::i32)); + + // Extract the sign bit. + const SDValue SignBitMask = DAG.getConstant(1ul << 31, MVT::i32); + SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); + + // Extend back to to 64-bits. + SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + Zero, SignBit); + SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); + + SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); + const SDValue FractMask = DAG.getConstant((1L << FractBits) - 1, MVT::i64); + + SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); + SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + + SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); + + return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64); + SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); + SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); + + SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); + + APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); + + return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src); + // if (src < 0.0 && src != result) + // result += -1.0. 
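// Illustrative sketch (not part of the patch): the bit-level f64 trunc that
// LowerFTRUNC above assembles from integer operations for subtargets without a
// native 64-bit trunc. Clearing the low (52 - exponent) fraction bits drops the
// fractional part; values below 1.0 collapse to a signed zero, and values whose
// exponent exceeds the fraction width are already integral.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double truncF64Reference(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));

  const unsigned FractBits = 52;
  int Exp = (int)((Bits >> FractBits) & 0x7ff) - 1023;   // unbiased exponent
  uint64_t SignBit = Bits & 0x8000000000000000ull;

  if (Exp < 0)
    Bits = SignBit;                                      // |X| < 1.0 -> +/-0.0
  else if (Exp <= 51)
    Bits &= ~(((1ull << FractBits) - 1) >> Exp);         // clear fractional bits
  // Exp > 51: no fractional bits remain (includes inf/NaN), leave X as is.

  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}

int main() {
  std::printf("%g %g %g\n", truncF64Reference(3.75), truncF64Reference(-0.25),
              truncF64Reference(-41.0)); // 3 -0 -41
  return 0;
}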
+ + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue S0 = Op.getOperand(0); @@ -1173,7 +1723,6 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); - } SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, @@ -1247,6 +1796,17 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { DCI.CommitTargetLoweringOpt(TLO); } +template +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width) { + if (Width + Offset < 32) { + IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + return DAG.getConstant(Result, MVT::i32); + } + + return DAG.getConstant(Src0 >> Offset, MVT::i32); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -1293,6 +1853,85 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: { return CombineMinMax(N, DAG); } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, MVT::i32); + + ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. 
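// Illustrative sketch (not part of the patch): the i64 -> f32 path used by
// LowerUINT_TO_FP above. The two 32-bit halves are converted separately and
// recombined as lo + hi * 2^32. This mirrors the emitted node sequence; it is
// not guaranteed to round identically to a single correctly-rounded conversion.
#include <cstdint>
#include <cstdio>

static float uint64ToF32Reference(uint64_t X) {
  float Lo = (float)(uint32_t)(X & 0xffffffffu);
  float Hi = (float)(uint32_t)(X >> 32);
  return Lo + Hi * 4294967296.0f; // 2^32
}

int main() {
  std::printf("%f\n", uint64ToF32Reference(0x100000003ull)); // ~4294967296
  return 0;
}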
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *Val = dyn_cast(N->getOperand(0))) { + if (Signed) { + return constantFoldBFE(DAG, + Val->getSExtValue(), + OffsetVal, + WidthVal); + } + + return constantFoldBFE(DAG, + Val->getZExtValue(), + OffsetVal, + WidthVal); + } + + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); + return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } } return SDValue(); } @@ -1383,6 +2022,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { // AMDGPU DAG nodes NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(FMAX) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) @@ -1393,8 +2033,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) @@ -1407,6 +2050,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SAMPLEB) NODE_NAME_CASE(SAMPLED) NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(CVT_F32_UBYTE0) + NODE_NAME_CASE(CVT_F32_UBYTE1) + NODE_NAME_CASE(CVT_F32_UBYTE2) + NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } @@ -1435,8 +2083,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( unsigned Depth) const { KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; unsigned Opc = Op.getOpcode(); + switch (Opc) { + default: + break; case ISD::INTRINSIC_WO_CHAIN: { // FIXME: The intrinsic should just use the node. switch (cast(Op.getOperand(0))->getZExtValue()) { @@ -1460,7 +2114,59 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), KnownZero, KnownOne, DAG, Depth); break; - default: + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + if (Width == 0) { + KnownZero = APInt::getAllOnesValue(BitWidth); + KnownOne = APInt::getNullValue(BitWidth); + return; + } + + // FIXME: This could do a lot more. If offset is 0, should be the same as + // sign_extend_inreg implementation, but that involves duplicating it. 
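// Illustrative sketch (not part of the patch): the bitfield-extract semantics
// the BFE_I32 / BFE_U32 combines above fold on constants. A field of Width bits
// starting at Offset is moved to bit 0 with sign or zero extension on top; the
// combine treats Width == 0 as producing 0 and masks both operands to 5 bits.
// For BFE_I32 with Offset == 0 the result has at least (32 - Width + 1) sign
// bits, which is what the ComputeNumSignBitsForTargetNode change reports.
#include <cstdint>
#include <cstdio>

static uint32_t bfeU32(uint32_t Src, unsigned Offset, unsigned Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  if (Offset + Width < 32)
    return (Src << (32 - Offset - Width)) >> (32 - Width);
  return Src >> Offset;
}

static int32_t bfeI32(int32_t Src, unsigned Offset, unsigned Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  if (Offset + Width < 32) {
    // Shift the field to the top, then arithmetic-shift it back down.
    uint32_t Shifted = (uint32_t)Src << (32 - Offset - Width);
    return (int32_t)Shifted >> (32 - Width);
  }
  return Src >> Offset;
}

int main() {
  std::printf("%u %d\n", bfeU32(0xf0u, 4, 4), bfeI32(0xf0, 4, 4)); // 15 -1
  return 0;
}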
+ if (Opc == AMDGPUISD::BFE_I32) + KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + else + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + break; } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); + return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; + } + + default: + return 1; + } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index bf6916259aff..5be3070f589b 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -42,10 +42,30 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. + + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, + unsigned BitsDiff, + SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. 
@@ -91,6 +111,10 @@ class AMDGPUTargetLowering : public TargetLowering { bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; MVT getVectorIdxTy() const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool ShouldShrinkFPConstant(EVT VT) const override; + bool isLoadBitCastBeneficial(EVT, EVT) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -101,6 +125,7 @@ class AMDGPUTargetLowering : public TargetLowering { SmallVectorImpl &InVals) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; @@ -124,36 +149,14 @@ class AMDGPUTargetLowering : public TargetLowering { const SelectionDAG &DAG, unsigned Depth = 0) const override; -// Functions defined in AMDILISelLowering.cpp -public: - bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const override; - - /// We want to mark f32/f64 floating point values as legal. - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - - /// We don't want to shrink f64/f32 constants. - bool ShouldShrinkFPConstant(EVT VT) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + virtual unsigned ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; private: + // Functions defined in AMDILISelLowering.cpp void InitAMDILLowering(); - SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; - - SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; }; @@ -170,6 +173,7 @@ enum { // End AMDIL ISD Opcodes DWORDADDR, FRACT, + CLAMP, COS_HW, SIN_HW, FMAX, @@ -184,8 +188,11 @@ enum { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + BREV, // Reverse bits. MUL_U24, MUL_I24, + MAD_U24, + MAD_I24, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, @@ -196,6 +203,21 @@ enum { SAMPLEB, SAMPLED, SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. 
+ /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index 1c3361a26581..fef5b8cac5ba 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -30,8 +30,8 @@ using namespace llvm; // Pin the vtable to this file. void AMDGPUInstrInfo::anchor() {} -AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) - : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { } +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { } const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { return RI; @@ -320,33 +320,11 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1); + Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } - -void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, - DebugLoc DL) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const AMDGPURegisterInfo & RI = getRegisterInfo(); - - for (unsigned i = 0; i < MI.getNumOperands(); i++) { - MachineOperand &MO = MI.getOperand(i); - // Convert dst regclass to one that is supported by the ISA - if (MO.isReg() && MO.isDef()) { - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); - const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); - - assert(newRegClass); - - MRI.setRegClass(MO.getReg(), newRegClass); - } - } - } -} - int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { switch (Channels) { default: return Opcode; diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 74baf6b2a6f7..ee23020988f3 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -33,7 +33,7 @@ namespace llvm { -class AMDGPUTargetMachine; +class AMDGPUSubtarget; class MachineFunction; class MachineInstr; class MachineInstrBuilder; @@ -45,9 +45,9 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { MachineBasicBlock &MBB) const; virtual void anchor(); protected: - TargetMachine &TM; + const AMDGPUSubtarget &ST; public: - explicit AMDGPUInstrInfo(TargetMachine &tm); + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; @@ -185,11 +185,6 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { unsigned ValueReg, unsigned Address, unsigned OffsetReg) const = 0; - - /// \brief Convert the AMDIL MachineInstr to a supported ISA - /// MachineInstr - void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const; - /// \brief Build a MOV instruction. 
virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 258d5a60ef37..942a9e8ff351 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -34,6 +34,8 @@ def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; + // out = max(a, b) a and b are signed ints def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] @@ -59,6 +61,17 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; + +def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", + SDTIntToFPOp, []>; + + // urecip - This operation is a helper for integer division, it returns the // result of 1 / a as a fractional unsigned integer. // out = (2^32 / a) + e @@ -92,6 +105,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; + // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, @@ -100,3 +115,29 @@ def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, [SDNPCommutative] >; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index ba7cac476190..8bfc11cd468c 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -49,6 +49,11 @@ def u8imm : Operand { let PrintMethod = "printU8ImmOperand"; } +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand; + //===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons 
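// Illustrative sketch (not part of the patch): the semantics of the MUL_I24 /
// MUL_U24 nodes defined above and the MAD_I24 / MAD_U24 nodes they combine
// into. Only the low 24 bits of each multiplicand participate (sign-extended
// from bit 23 for the signed form); the product is a 32-bit value, and the mad
// forms simply add a third 32-bit operand.
#include <cstdint>
#include <cstdio>

static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xffffff) * (B & 0xffffff);
}

static int32_t mulI24(int32_t A, int32_t B) {
  int32_t SA = (int32_t)((uint32_t)A << 8) >> 8; // sign-extend from bit 23
  int32_t SB = (int32_t)((uint32_t)B << 8) >> 8;
  return (int32_t)((int64_t)SA * SB);            // low 32 bits of the product
}

static uint32_t madU24(uint32_t A, uint32_t B, uint32_t C) { return mulU24(A, B) + C; }
static int32_t madI24(int32_t A, int32_t B, int32_t C) { return mulI24(A, B) + C; }

int main() {
  std::printf("%u %d\n", madU24(0x1000003u, 5, 1), madI24(-2, 5, 1)); // 16 -9
  return 0;
}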
//===----------------------------------------------------------------------===// @@ -127,6 +132,21 @@ def COND_NULL : PatLeaf < // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ LoadSDNode *L = cast(N); return L->getExtensionType() == ISD::ZEXTLOAD || @@ -232,26 +252,55 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast(N)); }]>; -def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value), - (atomic_load_add node:$ptr, node:$value), [{ - return dyn_cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; -def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value), - (atomic_load_sub node:$ptr, node:$value), [{ - return dyn_cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +class local_binary_atomic_op : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; + +def atomic_swap_local : local_binary_atomic_op; +def atomic_load_add_local : local_binary_atomic_op; +def atomic_load_sub_local : local_binary_atomic_op; +def atomic_load_and_local : local_binary_atomic_op; +def atomic_load_or_local : local_binary_atomic_op; +def atomic_load_xor_local : local_binary_atomic_op; +def atomic_load_nand_local : local_binary_atomic_op; +def atomic_load_min_local : local_binary_atomic_op; +def atomic_load_max_local : local_binary_atomic_op; +def atomic_load_umin_local : local_binary_atomic_op; +def atomic_load_umax_local : local_binary_atomic_op; + def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return dyn_cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; +def atomic_cmp_swap_32_local : + PatFrag<(ops node:$ptr, node:$cmp, node:$swap), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + +def atomic_cmp_swap_64_local : + PatFrag<(ops node:$ptr, node:$cmp, node:$swap), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; } def CONST : Constants; @@ -273,7 +322,7 @@ class CLAMP : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] >; class FABS : AMDGPUShaderInst < @@ -363,7 +412,7 @@ class DwordAddrPat : Pat < // BFI_INT patterns -multiclass BFIPatterns { +multiclass BFIPatterns { // Definition from ISA doc: // (y & x) | (z & ~x) @@ -379,6 +428,19 @@ multiclass BFIPatterns { 
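// Illustrative sketch (not part of the patch): what the BFI_INT-based fcopysign
// patterns added just below in this BFIPatterns hunk compute. BFI(mask, y, z)
// selects bits of y where the mask is set and bits of z elsewhere, so a
// 0x7fffffff mask keeps the magnitude of the first operand and the sign of the
// second; the f64 pattern applies the same rewrite to the high 32-bit word only.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t bfi(uint32_t Mask, uint32_t Y, uint32_t Z) {
  return (Y & Mask) | (Z & ~Mask);
}

static float copysignViaBFI(float Mag, float Sign) {
  uint32_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(float));
  std::memcpy(&SignBits, &Sign, sizeof(float));
  uint32_t ResultBits = bfi(0x7fffffffu, MagBits, SignBits);
  float Result;
  std::memcpy(&Result, &ResultBits, sizeof(float));
  return Result;
}

int main() {
  std::printf("%f %f\n", copysignViaBFI(2.5f, -1.0f),
              copysignViaBFI(-2.5f, 1.0f)); // -2.5 2.5
  return 0;
}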
(BFI_INT $x, $y, $z) >; + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0), + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; } // SHA-256 Ma patterns @@ -423,6 +485,40 @@ class UMUL24Pattern : Pat < >; */ +class IMad24Pat : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index 9f30bd8f1c98..6dc7612d46fb 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -18,10 +18,12 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - + def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; @@ -51,12 +53,29 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; def 
int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; +} + +// Legacy names for compatibility. +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; } let TargetPrefix = "TGSI", isTarget = 1 in { diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index b759495ad8e2..ac82e88c9266 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "R600InstrInfo.h" #include "SIInstrInfo.h" diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp new file mode 100644 index 000000000000..2d3a7fdd4feb --- /dev/null +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,365 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space.
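+//
+// Illustrative sketch (assumed example with hypothetical names, not taken
+// from this patch): a small private array such as
+//
+//   %tmp = alloca [4 x i32]
+//
+// can be rewritten in terms of a <4 x i32> value using insertelement and
+// extractelement, while an alloca that cannot be vectorized is replaced by
+// an addrspace(3) (local) global variable indexed by the workitem id.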
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "AMDGPU Promote Alloca"; + } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + const Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(const Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value* calculateVectorIndex(Value *Ptr, + std::map GEPIdx) { + if (isa(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast(Ptr); + + return GEPIdx[GEP]; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME we only support simple cases + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now. 
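+  // Illustrative examples (assumptions, not from this patch): [4 x float] and
+  // [2 x i32] pass the checks below, while [8 x i32] (more than four elements)
+  // and [2 x <4 x i32>] (vector element type) are rejected.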
+ if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map GEPVectorIdx; + std::vector WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast(AllocaUser); + if (!GEP) { + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. + if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << "\n"); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector "; AllocaTy->dump(); + dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n"); + + for (std::vector::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Do not know how to replace this instruction " + "with vector op"); + } + } + return true; +} + +static void collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (isa(User)) { + WorkList.push_back(User); + continue; + } + if (!User->getType()->isPointerTy()) + continue; + WorkList.push_back(User); + collectUsesWithPtrTypes(User, WorkList); + } +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. 
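+  // Worked example (illustrative): with the assumed 256-workitem group size, a
+  // [64 x i32] alloca needs 256 * 256 bytes = 64 KiB of local memory; if that
+  // exceeds LocalMemAvailable, the alloca is left untouched below.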
+ unsigned WorkGroupSize = 256; + int AllocaSize = WorkGroupSize * + Mod->getDataLayout()->getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + GlobalVariable *GV = new GlobalVariable( + *Mod, ArrayType::get(I.getAllocatedType(), 256), false, + GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ); + Value *TIdX = Builder.CreateCall(ReadTIDIGX); + Value *TIdY = Builder.CreateCall(ReadTIDIGY); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + std::vector WorkList; + + collectUsesWithPtrTypes(Offset, WorkList); + + for (std::vector::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast(Call); + if (!Intr) { + std::vector ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType, + F->getAttributes()); + Function *NewF = cast(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast(Intr); + Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), 
MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 19927faaa488..34332808f865 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -17,9 +17,9 @@ using namespace llvm; -AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm) +AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st) : AMDGPUGenRegisterInfo(0), - TM(tm) + ST(st) { } //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index a7cba0d25048..4731595d4f71 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -25,27 +25,19 @@ namespace llvm { -class AMDGPUTargetMachine; +class AMDGPUSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - TargetMachine &TM; static const MCPhysReg CalleeSavedReg; + const AMDGPUSubtarget &ST; - AMDGPURegisterInfo(TargetMachine &tm); + AMDGPURegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); } - /// \param RC is an AMDIL reg class. - /// - /// \returns The ISA reg class that is equivalent to \p RC. - virtual const TargetRegisterClass * getISARegClass( - const TargetRegisterClass * RC) const { - assert(!"Unimplemented"); return nullptr; - } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { assert(!"Unimplemented"); return nullptr; } diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index f3b993204a54..4fd43905b210 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" using namespace llvm; @@ -39,8 +41,15 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : EnableIfCvt = true; WavefrontSize = 0; CFALUBug = false; + LocalMemorySize = 0; ParseSubtargetFeatures(GPU, FS); DevName = GPU; + + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + } } bool @@ -101,6 +110,10 @@ AMDGPUSubtarget::hasCFAluBug() const { assert(getGeneration() <= NORTHERN_ISLANDS); return CFALUBug; } +int +AMDGPUSubtarget::getLocalMemorySize() const { + return LocalMemorySize; +} bool AMDGPUSubtarget::isTargetELF() const { return false; diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 1b041d6bd2bd..9c78f35df3e2 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -15,6 +15,7 @@ #ifndef AMDGPUSUBTARGET_H #define AMDGPUSUBTARGET_H #include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -27,6 +28,9 @@ namespace llvm { class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + + std::unique_ptr InstrInfo; + public: enum 
Generation { R600 = 0, @@ -52,12 +56,16 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { bool EnableIfCvt; unsigned WavefrontSize; bool CFALUBug; + int LocalMemorySize; InstrItineraryData InstrItins; public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); + const AMDGPUInstrInfo *getInstrInfo() const { + return InstrInfo.get(); + } const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } void ParseSubtargetFeatures(StringRef CPU, StringRef FS); @@ -72,10 +80,22 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { return (getGeneration() >= EVERGREEN); } + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + bool hasBFM() const { return hasBFE(); } + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + assert(Size == 64); + return (getGeneration() >= SOUTHERN_ISLANDS); + } + bool hasMulU24() const { return (getGeneration() >= EVERGREEN); } @@ -90,6 +110,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; bool hasCFAluBug() const; + int getLocalMemorySize() const; bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 6b68c2abe367..be1eceaaa00d 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -54,7 +54,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) { if (ST.is64bit()) { // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64"; + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; } Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" @@ -80,10 +80,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, InstrItins(&Subtarget.getInstrItineraryData()) { // TLInfo uses InstrInfo so it must be initialized after. 
if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(*this)); } else { - InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(*this)); } setRequiresStructuredCFG(true); @@ -111,6 +109,7 @@ class AMDGPUPassConfig : public TargetPassConfig { return nullptr; } + virtual void addCodeGenPrepare(); bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -136,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { PM.add(createAMDGPUTargetTransformInfoPass(this)); } +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget &ST = TM->getSubtarget(); @@ -159,7 +165,6 @@ bool AMDGPUPassConfig::addInstSelector() { } bool AMDGPUPassConfig::addPreRegAlloc() { - addPass(createAMDGPUConvertToISAPass(*TM)); const AMDGPUSubtarget &ST = TM->getSubtarget(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 1287e134b59f..2eb36a3366f2 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -30,7 +30,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const DataLayout Layout; AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - std::unique_ptr InstrInfo; std::unique_ptr TLInfo; const InstrItineraryData *InstrItins; @@ -46,13 +45,13 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { return &IntrinsicInfo; } const AMDGPUInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); + return getSubtargetImpl()->getInstrInfo(); } const AMDGPUSubtarget *getSubtargetImpl() const override { return &Subtarget; } const AMDGPURegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); + return &getInstrInfo()->getRegisterInfo(); } AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td deleted file mode 100644 index 5dcd478fe5bc..000000000000 --- a/lib/Target/R600/AMDILBase.td +++ /dev/null @@ -1,25 +0,0 @@ -//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -// Dummy Instruction itineraries for pseudo instructions -def ALU_NULL : FuncUnit; -def NullALU : InstrItinClass; - -//===----------------------------------------------------------------------===// -// Register File, Calling Conv, Instruction Descriptions -//===----------------------------------------------------------------------===// - - -include "AMDILRegisterInfo.td" -include "AMDILInstrInfo.td" - diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 7cea803f8987..fa48e65be4e6 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -13,56 +13,24 @@ //===----------------------------------------------------------------------===// #include "AMDGPUISelLowering.h" -#include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetOptions.h" using namespace llvm; -//===----------------------------------------------------------------------===// -// TargetLowering Implementation Help Functions End -//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // TargetLowering Class Implementation Begins //===----------------------------------------------------------------------===// void AMDGPUTargetLowering::InitAMDILLowering() { static const MVT::SimpleValueType types[] = { - MVT::i8, - MVT::i16, MVT::i32, MVT::f32, MVT::f64, MVT::i64, - MVT::v2i8, - MVT::v4i8, - MVT::v2i16, - MVT::v4i16, MVT::v4f32, MVT::v4i32, MVT::v2f32, - MVT::v2i32, - MVT::v2f64, - MVT::v2i64 - }; - - static const MVT::SimpleValueType IntTypes[] = { - MVT::i8, - MVT::i16, - MVT::i32, - MVT::i64 + MVT::v2i32 }; static const MVT::SimpleValueType FloatTypes[] = { @@ -71,21 +39,13 @@ void AMDGPUTargetLowering::InitAMDILLowering() { }; static const MVT::SimpleValueType VectorTypes[] = { - MVT::v2i8, - MVT::v4i8, - MVT::v2i16, - MVT::v4i16, MVT::v4f32, MVT::v4i32, MVT::v2f32, - MVT::v2i32, - MVT::v2f64, - MVT::v2i64 + MVT::v2i32 }; const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget(); - // These are the current register classes that are - // supported for (MVT VT : types) { setOperationAction(ISD::SUBE, VT, Expand); @@ -95,87 +55,22 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::BRCOND, VT, Custom); setOperationAction(ISD::BR_JT, VT, Expand); setOperationAction(ISD::BRIND, VT, Expand); - // TODO: Implement custom UREM/SREM routines - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - if (VT != MVT::i64 && VT != MVT::v2i64) { - setOperationAction(ISD::SDIV, VT, Custom); - } } + for (MVT VT : FloatTypes) { - // IL 
does not have these operations for floating point types setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); - setOperationAction(ISD::SETOLT, VT, Expand); - setOperationAction(ISD::SETOGE, VT, Expand); - setOperationAction(ISD::SETOGT, VT, Expand); - setOperationAction(ISD::SETOLE, VT, Expand); - setOperationAction(ISD::SETULT, VT, Expand); - setOperationAction(ISD::SETUGE, VT, Expand); - setOperationAction(ISD::SETUGT, VT, Expand); - setOperationAction(ISD::SETULE, VT, Expand); - } - - for (MVT VT : IntTypes) { - // GPU also does not have divrem function for signed or unsigned - setOperationAction(ISD::SDIVREM, VT, Expand); - - // GPU does not have [S|U]MUL_LOHI functions as a single instruction - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - setOperationAction(ISD::BSWAP, VT, Expand); - - // GPU doesn't have any counting operators - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); } for (MVT VT : VectorTypes) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - // setOperationAction(ISD::VSETCC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - } - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::v2i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::v2i64, Expand); - setOperationAction(ISD::ADD, MVT::v2i64, Expand); - setOperationAction(ISD::SREM, MVT::v2i64, Expand); - setOperationAction(ISD::Constant , MVT::i64 , Legal); - setOperationAction(ISD::SDIV, MVT::v2i64, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); + if (STM.hasHWFP64()) { - // we support loading/storing v2f64 but not operations on the type - setOperationAction(ISD::FADD, MVT::v2f64, Expand); - setOperationAction(ISD::FSUB, MVT::v2f64, Expand); - setOperationAction(ISD::FMUL, MVT::v2f64, Expand); - setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); - // We want to expand vector conversions into their scalar - // counterparts. - setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); setOperationAction(ISD::FABS, MVT::f64, Expand); - setOperationAction(ISD::FABS, MVT::v2f64, Expand); } - // TODO: Fix the UDIV24 algorithm so it works for these - // types correctly. This needs vector comparisons - // for this to work correctly. 
- setOperationAction(ISD::UDIV, MVT::v2i8, Expand); - setOperationAction(ISD::UDIV, MVT::v4i8, Expand); - setOperationAction(ISD::UDIV, MVT::v2i16, Expand); - setOperationAction(ISD::UDIV, MVT::v4i16, Expand); + setOperationAction(ISD::SUBC, MVT::Other, Expand); setOperationAction(ISD::ADDE, MVT::Other, Expand); setOperationAction(ISD::ADDC, MVT::Other, Expand); @@ -183,378 +78,19 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - // Use the default implementation. - setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); - setOperationAction(ISD::Constant , MVT::i32 , Legal); - - setSchedulingPreference(Sched::RegPressure); setPow2DivIsCheap(false); - setSelectIsExpensive(true); - setJumpIsExpensive(true); - - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; - -} - -bool -AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const { - return false; -} - -// The backend supports 32 and 64 bit floating point immediates -bool -AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 - || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { - return true; - } else { - return false; - } -} - -bool -AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { - if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 - || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { - return false; - } else { - return true; - } + setSelectIsExpensive(true); // FIXME: This makes no sense at all } - -// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to -// be zero. Op is expected to be a target specific node. Used by DAG -// combiner. - -//===----------------------------------------------------------------------===// -// Other Lowering Hooks -//===----------------------------------------------------------------------===// - -SDValue -AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); - SDValue DST; - if (OVT.getScalarType() == MVT::i64) { - DST = LowerSDIV64(Op, DAG); - } else if (OVT.getScalarType() == MVT::i32) { - DST = LowerSDIV32(Op, DAG); - } else if (OVT.getScalarType() == MVT::i16 - || OVT.getScalarType() == MVT::i8) { - DST = LowerSDIV24(Op, DAG); - } else { - DST = SDValue(Op.getNode(), 0); - } - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); - SDValue DST; - if (OVT.getScalarType() == MVT::i64) { - DST = LowerSREM64(Op, DAG); - } else if (OVT.getScalarType() == MVT::i32) { - DST = LowerSREM32(Op, DAG); - } else if (OVT.getScalarType() == MVT::i16) { - DST = LowerSREM16(Op, DAG); - } else if (OVT.getScalarType() == MVT::i8) { - DST = LowerSREM8(Op, DAG); - } else { - DST = SDValue(Op.getNode(), 0); - } - return DST; -} - -EVT -AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const { - int iSize = (size * numEle); - int vEle = (iSize >> ((size == 64) ? 
6 : 5)); - if (!vEle) { - vEle = 1; - } - if (size == 64) { - if (vEle == 1) { - return EVT(MVT::i64); - } else { - return EVT(MVT::getVectorVT(MVT::i64, vEle)); - } - } else { - if (vEle == 1) { - return EVT(MVT::i32); - } else { - return EVT(MVT::getVectorVT(MVT::i32, vEle)); - } - } -} - -SDValue -AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Jump = Op.getOperand(2); - SDValue Result; - Result = DAG.getNode( - AMDGPUISD::BRANCH_COND, - SDLoc(Op), - Op.getValueType(), - Chain, Jump, Cond); - return Result; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - MVT INTTY; - MVT FLTTY; - if (!OVT.isVector()) { - INTTY = MVT::i32; - FLTTY = MVT::f32; - } else if (OVT.getVectorNumElements() == 2) { - INTTY = MVT::v2i32; - FLTTY = MVT::v2f32; - } else if (OVT.getVectorNumElements() == 4) { - INTTY = MVT::v4i32; - FLTTY = MVT::v4f32; - } - unsigned bitsize = OVT.getScalarType().getSizeInBits(); - // char|short jq = ia ^ ib; - SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, INTTY); - - // int ia = (int)LHS; - SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); - - // int ib, (int)RHS; - SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); - - // float fa = (float)ia; - SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); - - // float fb = (float)ib; - SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); - - // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); - - // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); - - // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); - - // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, - DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); - - // int iq = (int)fq; - SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); - - // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); - - // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); - - // int cv = fr >= fb; - SDValue cv; - if (INTTY == MVT::i32) { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } else { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } - // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, - DAG.getConstant(0, OVT)); - // dst = iq + jq; - iq = DAG.getSExtOrTrunc(iq, DL, OVT); - iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); - return iq; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSDIV32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r0, r0, r1 - // ixor r10, r10, r11 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSelectCC(DL, - r0, DAG.getConstant(0, OVT), - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSelectCC(DL, - r1, DAG.getConstant(0, OVT), - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); - - // udiv r0, r0, r1 - r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); - - // ixor r10, r10, r11 - r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} - -SDValue -AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - MVT INTTY = MVT::i32; - if (OVT == MVT::v2i8) { - INTTY = MVT::v2i32; - } else if (OVT == MVT::v4i8) { - INTTY = MVT::v4i32; - } - SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); - SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); - LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); - LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); - return LHS; -} - -SDValue -AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - MVT INTTY = MVT::i32; - if (OVT == MVT::v2i16) { - INTTY = MVT::v2i32; - } else if (OVT == MVT::v4i16) { - INTTY = MVT::v4i32; - } - SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); - SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); - LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); - LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); - return LHS; -} - -SDValue -AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSREM32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r20, r0, r1 - // umul r20, r20, r1 - // sub r0, r0, r20 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); - - // udiv r20, r0, r1 - SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); - - // umul r20, r20, r1 - r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); - - // sub r0, r0, r20 - r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} -SDValue -AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); } diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td deleted file mode 100644 index 0f0c88db935a..000000000000 --- a/lib/Target/R600/AMDILInstrInfo.td +++ /dev/null @@ -1,150 +0,0 @@ -//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -// This file describes the AMDIL instructions in TableGen format. 
-// -//===----------------------------------------------------------------------===// -//===--------------------------------------------------------------------===// -// Custom Operands -//===--------------------------------------------------------------------===// -def brtarget : Operand; - -//===--------------------------------------------------------------------===// -// Custom Selection DAG Type Profiles -//===--------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Generic Profile Types -//===----------------------------------------------------------------------===// - -def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> - ]>; -def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3> - ]>; -def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [ - SDTCisEltOfVec<1, 0> - ]>; - -//===----------------------------------------------------------------------===// -// Flow Control Profile Types -//===----------------------------------------------------------------------===// -// Branch instruction where second and third are basic blocks -def SDTIL_BRCond : SDTypeProfile<0, 2, [ - SDTCisVT<0, OtherVT> - ]>; - -//===--------------------------------------------------------------------===// -// Custom Selection DAG Nodes -//===--------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Flow Control DAG Nodes -//===----------------------------------------------------------------------===// -def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; - -//===----------------------------------------------------------------------===// -// Call/Return DAG Nodes -//===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; - -//===--------------------------------------------------------------------===// -// Instructions -//===--------------------------------------------------------------------===// -// Floating point math functions -def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; - -//===----------------------------------------------------------------------===// -// Integer functions -//===----------------------------------------------------------------------===// -def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, - [SDNPCommutative, SDNPAssociative]>; - -//===--------------------------------------------------------------------===// -// Custom Pattern DAG Nodes -//===--------------------------------------------------------------------===// -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -//===----------------------------------------------------------------------===// -// Load pattern fragments -//===----------------------------------------------------------------------===// -// Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; -// Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -//===----------------------------------------------------------------------===// -// Complex addressing mode patterns 
-//===----------------------------------------------------------------------===// -def ADDR : ComplexPattern; -def ADDRF : ComplexPattern; -def ADDR64 : ComplexPattern; -def ADDR64F : ComplexPattern; - -//===----------------------------------------------------------------------===// -// Instruction format classes -//===----------------------------------------------------------------------===// -class ILFormat pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} - -//===--------------------------------------------------------------------===// -// Multiclass Instruction formats -//===--------------------------------------------------------------------===// -// Multiclass that handles branch instructions -multiclass BranchConditional { - def _i32 : ILFormat<(outs), - (ins brtarget:$target, rci:$src0), - "; i32 Pseudo branch instruction", - [(Op bb:$target, (i32 rci:$src0))]>; - def _f32 : ILFormat<(outs), - (ins brtarget:$target, rcf:$src0), - "; f32 Pseudo branch instruction", - [(Op bb:$target, (f32 rcf:$src0))]>; -} - -// Only scalar types should generate flow control -multiclass BranchInstr { - def _i32 : ILFormat<(outs), (ins GPRI32:$src), - !strconcat(name, " $src"), []>; - def _f32 : ILFormat<(outs), (ins GPRF32:$src), - !strconcat(name, " $src"), []>; -} -// Only scalar types should generate flow control -multiclass BranchInstr2 { - def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1), - !strconcat(name, " $src0, $src1"), []>; - def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1), - !strconcat(name, " $src0, $src1"), []>; -} - -//===--------------------------------------------------------------------===// -// Intrinsics support -//===--------------------------------------------------------------------===// -include "AMDILIntrinsics.td" diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td deleted file mode 100644 index 4a3e02e202bb..000000000000 --- a/lib/Target/R600/AMDILIntrinsics.td +++ /dev/null @@ -1,224 +0,0 @@ -//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -// This file defines all of the amdil-specific intrinsics -// -//===---------------------------------------------------------------===// -//===--------------------------------------------------------------------===// -// Intrinsic classes -// Generic versions of the above classes but for Target specific intrinsics -// instead of SDNode patterns. 
-//===--------------------------------------------------------------------===// -let TargetPrefix = "AMDIL", isTarget = 1 in { - class VoidIntLong : - Intrinsic<[llvm_i64_ty], [], []>; - class VoidIntInt : - Intrinsic<[llvm_i32_ty], [], []>; - class VoidIntBool : - Intrinsic<[llvm_i32_ty], [], []>; - class UnaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class UnaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class ConvertIntFTOI : - Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - class ConvertIntITOF : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; - class UnaryIntNoRetInt : - Intrinsic<[], [llvm_anyint_ty], []>; - class UnaryIntNoRetFloat : - Intrinsic<[], [llvm_anyfloat_ty], []>; - class BinaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class BinaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class BinaryIntNoRetInt : - Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; - class BinaryIntNoRetFloat : - Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; - class TernaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class TernaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class QuaternaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class UnaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class BinaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class TernaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - class UnaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class BinaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class TernaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; -} - -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; - - def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, - UnaryIntInt; - def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, - UnaryIntInt; - def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, - UnaryIntInt; - def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, - UnaryIntInt; - def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, - UnaryIntInt; - def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, - TernaryIntInt; - def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, - TernaryIntInt; - def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, - QuaternaryIntInt; - def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, - TernaryIntInt; - def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, - BinaryIntInt; - def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, - BinaryIntInt; - def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, - BinaryIntInt; - def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, - BinaryIntInt; - def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, - BinaryIntInt; - def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, - BinaryIntInt; - 
def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, - BinaryIntInt; - def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, - BinaryIntInt; - def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, - BinaryIntInt; - def int_AMDIL_min : GCCBuiltin<"__amdil_min">, - BinaryIntFloat; - def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, - BinaryIntInt; - def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, - BinaryIntInt; - def int_AMDIL_max : GCCBuiltin<"__amdil_max">, - BinaryIntFloat; - def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, - TernaryIntInt; - def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, - TernaryIntInt; - def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, - TernaryIntInt; - def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, - UnaryIntFloat; - def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, - TernaryIntFloat; - def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, - UnaryIntFloat; - def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, - UnaryIntFloat; - def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, - UnaryIntFloat; - def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, - UnaryIntFloat; - def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, - UnaryIntFloat; - def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, - UnaryIntFloat; - def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, - UnaryIntFloat; - def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, - UnaryIntFloat; - def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, - UnaryIntFloat; - def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, - UnaryIntFloat; - def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, - UnaryIntFloat; - def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, - UnaryIntFloat; - def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; - def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; - def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; - def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, - UnaryIntFloat; - def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, - UnaryIntFloat; - def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, - UnaryIntFloat; - def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, - UnaryIntFloat; - def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, - UnaryIntFloat; - def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, - UnaryIntFloat; - def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, - UnaryIntFloat; - def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, - UnaryIntFloat; - def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, - TernaryIntFloat; - def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, - UnaryIntFloat; - def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, - UnaryIntFloat; - def int_AMDIL_length : GCCBuiltin<"__amdil_length">, - UnaryIntFloat; - def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, - TernaryIntFloat; - def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">, - Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, - llvm_v4i32_ty, llvm_i32_ty], []>; - - def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, - Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; - def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, - Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; - def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, - Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; - def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, - ConvertIntITOF; - def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_i32_rpi : 
GCCBuiltin<"__amdil_float_to_int_rpi">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, - ConvertIntFTOI; - def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, - Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; - def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, - ConvertIntITOF; - def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, - Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, - llvm_v2f32_ty, llvm_float_ty], []>; - def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, - Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, - llvm_v2f32_ty], []>; - def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, - llvm_v4f32_ty], []>; - def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, - llvm_v4f32_ty], []>; -} diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td deleted file mode 100644 index b9d033432e8c..000000000000 --- a/lib/Target/R600/AMDILRegisterInfo.td +++ /dev/null @@ -1,107 +0,0 @@ -//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -// Declarations that describe the AMDIL register file -// -//===----------------------------------------------------------------------===// - -class AMDILReg num, string n> : Register { - field bits<16> Value; - let Value = num; - let Namespace = "AMDGPU"; -} - -// We will start with 8 registers for each class before expanding to more -// Since the swizzle is added based on the register class, we can leave it -// off here and just specify different registers for different register classes -def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>; -def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>; -def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>; -def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>; -def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>; -def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>; -def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>; -def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>; -def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>; -def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>; -def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>; -def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>; -def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>; -def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>; -def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>; -def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>; -def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>; -def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>; -def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>; -def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>; - -// All registers between 1000 and 1024 are reserved and cannot be used -// unless commented in this section -// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's -// r1020 is used to hold the frame index for local arrays -// r1019 is used to hold the dynamic stack allocation pointer -// r1018 is used as a temporary register for handwritten code -// r1017 is used as a temporary register for handwritten code -// r1016 is used as a temporary register for load/store code -// r1015 is used as a temporary register for data segment offset -// r1014 is used as a temporary register for store code -// r1013 is used as the section data pointer register -// r1012-r1010 and r1001-r1008 are used for temporary I/O registers -// r1009 is used as the frame pointer register -// r999 is used as the mem register. -// r998 is used as the return address register. 
-//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; -//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; -//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; -//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; -//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; -//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; -def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; -def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; -def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; -def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; -def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; -def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; -def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; -def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; -def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; -def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; -def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; -def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; -def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; -def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; -def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; -def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; -def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; -def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; -def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>; -def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; -def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; -def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; -def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } -def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } -def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 3c6fa5a7ae46..be2ca06ef34d 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -24,8 +24,8 @@ add_llvm_target(R600CodeGen AMDGPUTargetMachine.cpp AMDGPUTargetTransformInfo.cpp AMDGPUISelLowering.cpp - AMDGPUConvertToISA.cpp AMDGPUInstrInfo.cpp + AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index 837d6025339f..26303452c101 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -21,12 +21,14 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">; let Predicates = [isCayman] in { def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; def MUL_INT24_cm : R600_2OP <0x5B, 
"MUL_INT24", [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU >; +def : IMad24Pat; + let isVector = 1 in { def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; @@ -47,6 +49,7 @@ def COS_cm : COS_Common<0x8E>; def : POW_Common ; defm DIV_cm : DIV_Common; +defm : Expand24UBitOps; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index d9931c81d625..dcb7e982c7fc 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -75,6 +75,8 @@ def COS_eg : COS_Common<0x8E>; def : POW_Common ; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; +defm : Expand24IBitOps; + //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -293,7 +295,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)), def : Pat<(i32 (sext_inreg i32:$src, i16)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; -defm : BFIPatterns ; +defm : BFIPatterns ; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], @@ -301,8 +303,11 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", >; def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; + +def : UMad24Pat; + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern ; def MULADD_eg : MULADD_Common<0x14>; @@ -321,6 +326,8 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } @@ -341,7 +348,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>, + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, R600ALU_Word0, R600ALU_Word1_OP2 <0x54> { @@ -370,6 +377,11 @@ def GROUP_BARRIER : InstR600 < let ALUInst = 1; } +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + //===----------------------------------------------------------------------===// // LDS Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index d255e9690524..d98a6dbb37bd 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/Debug.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 489565e65795..f0e13e56d8ff 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -13,6 +13,9 @@ //===----------------------------------------------------------------------===// #include "R600ISelLowering.h" +#include "AMDILIntrinsicInfo.h" +#include "AMDGPUFrameLowering.h" +#include 
"AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" @@ -133,6 +136,16 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -144,6 +157,12 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::UDIV, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setBooleanContents(ZeroOrNegativeOneBooleanContent); @@ -537,6 +556,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const R600MachineFunctionInfo *MFI = MF.getInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -809,6 +833,56 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, } } +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, + Vector, DAG.getConstant(i, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = 
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); +} + SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // On hw >= R700, COS/SIN input must be between -1. and 1. // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) @@ -840,6 +914,80 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { DAG.getConstantFP(3.14159265359, MVT::f32)); } +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, VT); + SDValue One = DAG.getConstant(1, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, VT); + SDValue One = DAG.getConstant(1, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? 
DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode( ISD::SETCC, @@ -1762,7 +1910,8 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, NewArgs); } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } static bool diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index a8a464f338f7..381642aa600c 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -51,15 +51,17 @@ class R600TargetLowering : public AMDGPUTargetLowering { void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; - /// \brief Lower ROTL opcode to BITALIGN - SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b0d9ae3e70e3..3972e2f03730 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -28,10 +28,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) - : AMDGPUInstrInfo(tm), - RI(tm), - ST(tm.getSubtarget()) +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), + RI(st) { } const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { @@ -52,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if (AMDGPU::R600_Reg128RegClass.contains(DestReg) && - AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) && - AMDGPU::R600_Reg64RegClass.contains(SrcReg)) { + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } @@ -768,16 +771,6 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } -int R600InstrInfo::getBranchInstr(const MachineOperand &op) const { - const MachineInstr *MI = 
op.getParent(); - - switch (MI->getDesc().OpInfo->RegClass) { - default: // FIXME: fallthrough?? - case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32; - case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32; - }; -} - static MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); @@ -1064,10 +1057,34 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return 2; } +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { const AMDGPUFrameLowering *TFL = - static_cast(TM.getFrameLowering()); + static_cast( + MF.getTarget().getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); @@ -1100,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); setImmOperand(MOVA, AMDGPU::OpName::write, 0); @@ -1117,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = 
AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); @@ -1220,7 +1267,6 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( const { assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); unsigned Opcode; - const AMDGPUSubtarget &ST = TM.getSubtarget(); if (ST.getGeneration() <= AMDGPUSubtarget::R700) Opcode = AMDGPU::DOT4_r600; else diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index b5304a0edfd5..45a57d367b8b 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -32,12 +32,22 @@ namespace llvm { class R600InstrInfo : public AMDGPUInstrInfo { private: const R600RegisterInfo RI; - const AMDGPUSubtarget &ST; - int getBranchInstr(const MachineOperand &op) const; std::vector > ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; public: enum BankSwizzle { ALU_VEC_012_SCL_210 = 0, @@ -48,7 +58,7 @@ namespace llvm { ALU_VEC_210 }; - explicit R600InstrInfo(AMDGPUTargetMachine &tm); + explicit R600InstrInfo(const AMDGPUSubtarget &st); const R600RegisterInfo &getRegisterInfo() const override; void copyPhysReg(MachineBasicBlock &MBB, @@ -197,6 +207,8 @@ namespace llvm { int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override { return 1;} + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + /// \brief Reserve the registers that may be accesed using indirect addressing. 
void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d2075c0577b0..58c704d8ec85 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -125,7 +125,7 @@ class R600_1OP inst, string opName, list pattern, class R600_1OP_Helper inst, string opName, SDPatternOperator node, InstrItinClass itin = AnyALU> : R600_1OP ; // If you add or change the operands for R600_2OP instructions, you must @@ -161,10 +161,10 @@ class R600_2OP inst, string opName, list pattern, } class R600_2OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itim = AnyALU> : + InstrItinClass itin = AnyALU> : R600_2OP ; // If you add our change the operands for R600_3OP instructions, you must @@ -1265,13 +1265,6 @@ let Predicates = [isR600] in { defm R600_ : RegisterLoadStore ; -//===----------------------------------------------------------------------===// -// Branch Instructions -//===----------------------------------------------------------------------===// - -def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src), - "IF_PREDICATE_SET $src", []>; - //===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -1345,15 +1338,6 @@ def TXD_SHADOW: InstR600 < } // End isPseudo = 1 } // End usesCustomInserter = 1 -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; -} - //===----------------------------------------------------------------------===// // Constant Buffer Addressing Support @@ -1480,11 +1464,52 @@ let Inst{63-32} = Word1; let VTXInst = 1; } +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +multiclass BranchConditional { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2 { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} - 
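The SHL_PARTS/SRL_PARTS/SRA_PARTS lowering added to R600ISelLowering.cpp earlier in this patch builds 64-bit shifts from 32-bit operations and notes the "dance around Width1" needed for the zero-shift case. The standalone C++ sketch below (not the patch's code) models only the SHL case to show why the spill-over bits are shifted in two steps; where the DAG computes both the small-shift and big-shift results and selects between them with SETULT, the sketch branches so that every C++ shift amount stays in range.

    #include <cassert>
    #include <cstdint>

    // Models the SHL_PARTS expansion: (Hi:Lo) << Shift using 32-bit pieces.
    static void shlParts(uint32_t Lo, uint32_t Hi, uint32_t Shift,
                         uint32_t &OutLo, uint32_t &OutHi) {
      const uint32_t Width = 32;
      const uint32_t Width1 = Width - 1;
      if (Shift < Width) {
        // CompShift = 31 - Shift never reaches 32, so this stays defined even
        // for Shift == 0; the extra ">> 1" completes the (32 - Shift) shift.
        uint32_t CompShift = Width1 - Shift;
        uint32_t Overflow = (Lo >> CompShift) >> 1; // Lo bits spilling into Hi
        OutHi = (Hi << Shift) | Overflow;
        OutLo = Lo << Shift;
      } else {
        // "Big" shift: the whole result comes from Lo shifted by (Shift - 32).
        uint32_t BigShift = Shift - Width;
        OutHi = Lo << BigShift;
        OutLo = 0;
      }
    }

    int main() {
      const uint64_t Val = 0x9ABCDEF012345678ull;
      const uint32_t Shifts[] = {0, 1, 31, 32, 33, 63};
      for (uint32_t S : Shifts) {
        uint32_t Lo, Hi;
        shlParts(static_cast<uint32_t>(Val), static_cast<uint32_t>(Val >> 32),
                 S, Lo, Hi);
        const uint64_t Ref = Val << S;
        assert(Lo == static_cast<uint32_t>(Ref));
        assert(Hi == static_cast<uint32_t>(Ref >> 32));
      }
      return 0;
    }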
-//===--------------------------------------------------------------------===// -// Instructions support -//===--------------------------------------------------------------------===// //===---------------------------------------------------------------------===// // Custom Inserter for Branches and returns, this eventually will be a // separate pass @@ -1497,13 +1522,22 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { } //===---------------------------------------------------------------------===// -// Flow and Program control Instructions +// Return instruction //===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + let isTerminator=1 in { - def SWITCH : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("SWITCH", " $src"), []>; - def CASE : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("CASE", " $src"), []>; def BREAK : ILFormat< (outs), (ins), "BREAK", []>; def CONTINUE : ILFormat< (outs), (ins), @@ -1547,6 +1581,60 @@ let isTerminator=1 in { defm CONTINUEC : BranchInstr2<"CONTINUEC">; } +//===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical ; +def R600_EXTRACT_ELT_V4 : ExtractVertical ; + +def R600_INSERT_ELT_V2 : InsertVertical ; +def R600_INSERT_ELT_V4 : InsertVertical ; + +class ExtractVerticalPat : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; + +class InsertVerticalPat : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat ; +def : InsertVerticalPat ; +def : InsertVerticalPat ; +def : InsertVerticalPat ; + //===----------------------------------------------------------------------===// // ISel Patterns //===----------------------------------------------------------------------===// @@ -1625,6 +1713,12 @@ def : DwordAddrPat ; } // End isR600toCayman Predicate +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps; +defm : Expand24UBitOps; +} // End isR600 + def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d1655d1ddc3c..7ea654cb14cd 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -13,6 +13,7 @@ 
//===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index c2f6c03320d1..74cf30974d58 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/Debug.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index f3bb88b3eefc..dc9567505082 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -20,15 +20,14 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm) -: AMDGPURegisterInfo(tm), - TM(tm) +R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st) +: AMDGPURegisterInfo(st) { RCW.RegWeight = 0; RCW.WeightLimit = 0;} BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = static_cast(TM.getInstrInfo()); + const R600InstrInfo *TII = static_cast(ST.getInstrInfo()); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); @@ -55,16 +54,6 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -const TargetRegisterClass * -R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { - switch (rc->getID()) { - case AMDGPU::GPRF32RegClassID: - case AMDGPU::GPRI32RegClassID: - return &AMDGPU::R600_Reg32RegClass; - default: return rc; - } -} - unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index 52e1a4bed948..247808b6e7b6 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -16,26 +16,18 @@ #define R600REGISTERINFO_H_ #include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" namespace llvm { -class R600TargetMachine; +class AMDGPUSubtarget; struct R600RegisterInfo : public AMDGPURegisterInfo { - AMDGPUTargetMachine &TM; RegClassWeight RCW; - R600RegisterInfo(AMDGPUTargetMachine &tm); + R600RegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override; - /// \param RC is an AMDIL reg class. - /// - /// \returns the R600 reg class that is equivalent to \p RC. - const TargetRegisterClass *getISARegClass( - const TargetRegisterClass *RC) const override; - /// \brief get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 68bcd207b42c..cc667d985a82 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> : class R600Reg_128 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1, sub2, sub3]; - let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } class R600Reg_64 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)], + lo +>; foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -54,6 +64,24 @@ foreach Index = 0-127 in { Index>; } +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + !if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast<Register>("T0_"#Chan), + !cast<Register>("T1_"#Chan), + !cast<Register>("T2_"#Chan), + !cast<Register>("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } +} + + // KCACHE_BANK0 foreach Index = 159-128 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>; let isAllocatable = 0 in { -// XXX: Only use the X channel, until we support wider stack widths -def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>; +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME if we add more vertical vector registers we will need to add more +// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; @@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, let CopyCost = -1; } +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index b51c46c59e6e..4e61d5b03aa6 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -105,18 +106,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -215,9 +212,15 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FRINT, MVT::f64, Legal); } + // FIXME: This should be removed and handled the same way as f32 fneg. Source + // modifiers also work for the double instructions.
+ setOperationAction(ISD::FNEG, MVT::f64, Expand); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::UINT_TO_FP); + setSchedulingPreference(Sched::RegPressure); } @@ -482,19 +485,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::V_SUB_F64: - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addReg(MI->getOperand(2).getReg()) - .addImm(0) /* src2 */ - .addImm(0) /* ABS */ - .addImm(0) /* CLAMP */ - .addImm(0) /* OMOD */ - .addImm(2); /* NEG */ + case AMDGPU::V_SUB_F64: { + unsigned DestReg = MI->getOperand(0).getReg(); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) + .addImm(0) // SRC0 modifiers + .addReg(MI->getOperand(1).getReg()) + .addImm(1) // SRC1 modifiers + .addReg(MI->getOperand(2).getReg()) + .addImm(0) // SRC2 modifiers + .addImm(0) // src2 + .addImm(0) // CLAMP + .addImm(0); // OMOD MI->eraseFromParent(); break; - + } case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -611,11 +615,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::ANY_EXTEND: // Fall-through - case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = @@ -903,34 +903,6 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } -SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); - return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); -} - -SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - if (VT != MVT::i64) { - return SDValue(); - } - - SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0), - DAG.getConstant(31, MVT::i32)); - - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); -} - SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast(Op); @@ -1011,27 +983,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Chain; } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); -SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); - if (VT != 
MVT::i64) { + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. + if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + DCI.AddToWorklist(Cvt.getNode()); + return Cvt; + } + } + + // We are primarily trying to catch operations on illegal vector types + // before they are expanded. + // For scalars, we can use the more flexible method of checking masked bits + // after legalization. + if (!DCI.isBeforeLegalize() || + !SrcVT.isVector() || + SrcVT.getVectorElementType() != MVT::i8) { return SDValue(); } - SDValue Src = Op.getOperand(0); - if (Src.getValueType() != MVT::i32) - Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - SDValue Zero = DAG.getConstant(0, MVT::i32); - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); -} + // Weird sized vectors are a pain to handle, but we know 3 is really the same + // size as 4. + unsigned NElts = SrcVT.getVectorNumElements(); + if (!SrcVT.isSimple() && NElts != 3) + return SDValue(); -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// + // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to + // prevent a mess from expanding to v4i32 and repacking. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); + EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + + LoadSDNode *Load = cast(Src); + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, + Load->getChain(), + Load->getBasePtr(), + LoadVT, + Load->getMemOperand()); + + // Make sure successors of the original load stay after it by updating + // them to use the new Chain. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + + SmallVector Elts; + if (RegVT.isVector()) + DAG.ExtractVectorElements(NewLoad, Elts); + else + Elts.push_back(NewLoad); + + SmallVector Ops; + + unsigned EltIdx = 0; + for (SDValue Elt : Elts) { + unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); + for (unsigned I = 0; I < ComponentsInElt; ++I) { + unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; + SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); + DCI.AddToWorklist(Cvt.getNode()); + Ops.push_back(Cvt); + } + + ++EltIdx; + } + + assert(Ops.size() == NElts); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); + } + + return SDValue(); +} SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -1074,8 +1118,34 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } break; } + + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: { + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; } - return SDValue(); + + case ISD::UINT_TO_FP: { + return performUCharToFloatCombine(N, DCI); + } + } + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes @@ -1296,7 +1366,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; bool HaveVSrc = false, HaveSSrc = false; - // First figure out what we alread have in this instruction + // First figure out what we already have in this instruction. for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; i != e && Op < NumOps; ++i, ++Op) { @@ -1315,7 +1385,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, } } - // If we neither have VSrc nor SSrc it makes no sense to continue + // If we neither have VSrc nor SSrc, it makes no sense to continue. if (!HaveVSrc && !HaveSSrc) return Node; @@ -1331,17 +1401,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, const SDValue &Operand = Node->getOperand(i); Ops.push_back(Operand); - // Already folded immediate ? + // Already folded immediate? if (isa(Operand.getNode()) || isa(Operand.getNode())) continue; - // Is this a VSrc or SSrc operand ? + // Is this a VSrc or SSrc operand? unsigned RegClass = Desc->OpInfo[Op].RegClass; if (isVSrc(RegClass) || isSSrc(RegClass)) { // Try to fold the immediates if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { - // Folding didn't worked, make sure we don't hit the SReg limit + // Folding didn't work, make sure we don't hit the SReg limit. 
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); } continue; @@ -1370,7 +1440,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, continue; if (DescE64) { - // Test if it makes sense to switch to e64 encoding unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) @@ -1401,7 +1470,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, if (!DescE64) continue; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { Ops.pop_back(); @@ -1411,7 +1480,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, if (!DescE64) continue; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } @@ -1534,7 +1603,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } -/// \brief Fold the instructions after slecting them +/// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const SIInstrInfo *TII = diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index c6eaa812e4d6..2f97a9ada8f0 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -27,10 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, @@ -46,6 +43,9 @@ class SITargetLowering : public AMDGPUTargetLowering { void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; + static SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI); + public: SITargetLowering(TargetMachine &tm); bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 168eff25bb22..7cae9fc0d0eb 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -51,6 +51,16 @@ class Enc64 pattern> : let Size = 8; } +class VOP3Common pattern> : + Enc64 { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP3 = 1; +} + //===----------------------------------------------------------------------===// // Scalar operations //===----------------------------------------------------------------------===// @@ -207,7 +217,7 @@ class VOP2 op, dag outs, dag ins, string asm, list pattern> : } class VOP3 op, dag outs, dag ins, string asm, list pattern> : - Enc64 { + VOP3Common { bits<8> dst; bits<2> src0_modifiers; @@ -233,16 +243,11 @@ class VOP3 op, dag outs, dag ins, string asm, list pattern> : let Inst{61} = src0_modifiers{0}; let Inst{62} = src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP3 = 1; + } class VOP3b op, dag outs, dag ins, string asm, list pattern> : - Enc64 { + VOP3Common { bits<8> dst; bits<2> src0_modifiers; @@ -266,11 +271,6 @@ class VOP3b op, dag outs, dag ins, string asm, list pattern> : let Inst{62} = 
src1_modifiers{0}; let Inst{63} = src2_modifiers{0}; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP3 = 1; } class VOPC op, dag ins, string asm, list pattern> : diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 4a9e3467c5a6..f5b82d53ba7a 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -19,13 +19,14 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCInstrDesc.h" using namespace llvm; -SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm) - : AMDGPUInstrInfo(tm), - RI(tm) { } +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), + RI(st) { } //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -187,18 +188,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo(); + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned KillFlag = isKill ? RegState::Kill : 0; - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent()); + if (RI.hasVGPRs(RC)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) + .addReg(SrcReg); + } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { + unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF); + unsigned TgtReg = MFI->SpillTracker.LaneVGPR; - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR) + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg) .addReg(SrcReg, KillFlag) .addImm(Lane); - MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane); + MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane); } else if (RI.isSGPRClass(RC)) { // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for vector @@ -207,8 +215,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // Reserve a spot in the spill tracker for each sub-register of // the vector register. 
unsigned NumSubRegs = RC->getSize() / 4; - unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(), - NumSubRegs); + unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, FirstLane); @@ -234,19 +241,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo(); + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) { - SIMachineFunctionInfo::SpilledReg Spill = - MFI->SpillTracker.getSpilledReg(FrameIndex); - assert(Spill.VGPR); - BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane); - insertNOPs(MI, 3); + + if (RI.hasVGPRs(RC)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(0); } else if (RI.isSGPRClass(RC)){ unsigned Opcode; switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; @@ -260,7 +267,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addReg(Spill.VGPR) .addImm(FrameIndex); - insertNOPs(MI, 3); } else { llvm_unreachable("VGPR spilling not supported"); } @@ -281,6 +287,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: return 2; + case AMDGPU::SI_SPILL_S32_RESTORE: + return 1; default: llvm_unreachable("Invalid spill opcode"); } } @@ -334,7 +342,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_S64_RESTORE: { + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { @@ -348,6 +357,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(MI->getOperand(1).getReg()) .addImm(Spill.Lane + i); } + insertNOPs(MI, 3); MI->eraseFromParent(); break; } @@ -654,7 +664,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; @@ -667,6 +679,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return 
AMDGPU::V_BCNT_U32_B32_e32; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; } } @@ -731,8 +746,8 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubReg = MRI.createVirtualRegister(SubRC); // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to wory about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), NewSuperReg) @@ -1157,22 +1172,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; } case AMDGPU::S_AND_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); Inst->eraseFromParent(); continue; @@ -1217,6 +1237,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // 3 to not hit an assertion later in MCInstLower. Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(0)); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + Inst->addOperand(MachineOperand::CreateImm(0)); } addDescImplicitUseDef(NewDesc, Inst); @@ -1297,9 +1321,62 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VReg_32RegClass; } -void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? 
+ MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { MachineBasicBlock &MBB = *Inst->getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -1360,6 +1437,46 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl &Worklist, Worklist.push_back(HiHalf); } +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32); + const TargetRegisterClass *SrcRC = Src.isReg() ? + MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + Worklist.push_back(First); + Worklist.push_back(Second); +} + void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, MachineInstr *Inst) const { // Add the implict and explicit register definitions. 
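splitScalar64BitBCNT above rewrites the scalar S_BCNT1_I32_B64 as two 32-bit V_BCNT_U32_B32 operations, relying on the property called out in moveToVALU that the VALU count adds its second operand to the result (hence the extra 0 operand). A minimal standalone C++ model of that decomposition, not taken from the patch and assuming only that property:

    #include <bitset>
    #include <cassert>
    #include <cstdint>

    // Models V_BCNT_U32_B32: popcount of Src plus a second, accumulator operand.
    static uint32_t vBcntU32B32(uint32_t Src, uint32_t Acc) {
      return static_cast<uint32_t>(std::bitset<32>(Src).count()) + Acc;
    }

    // Models the split of S_BCNT1_I32_B64: count the low half with a zero
    // accumulator, then feed that count into the count of the high half.
    static uint32_t bcnt1I32B64(uint64_t Src) {
      uint32_t Lo = static_cast<uint32_t>(Src);        // sub0
      uint32_t Hi = static_cast<uint32_t>(Src >> 32);  // sub1
      uint32_t Mid = vBcntU32B32(Lo, 0);               // MidReg in the pass
      return vBcntU32B32(Hi, Mid);                     // ResultReg in the pass
    }

    int main() {
      assert(bcnt1I32B64(0x0000000000000000ull) == 0);
      assert(bcnt1I32B64(0xFFFFFFFF00000001ull) == 33);
      assert(bcnt1I32B64(0x8000000000000001ull) == 2);
      return 0;
    }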
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 7b31a81f0a87..4c204d877809 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -44,13 +44,19 @@ class SIInstrInfo : public AMDGPUInstrInfo { const TargetRegisterClass *RC, const MachineOperand &Op) const; - void splitScalar64BitOp(SmallVectorImpl & Worklist, - MachineInstr *Inst, unsigned Opcode) const; + void splitScalar64BitUnaryOp(SmallVectorImpl &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const; void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; public: - explicit SIInstrInfo(AMDGPUTargetMachine &tm); + explicit SIInstrInfo(const AMDGPUSubtarget &st); const SIRegisterInfo &getRegisterInfo() const override { return RI; diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 2242e6d8648b..eb9746779374 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -187,6 +187,12 @@ class SOP1_64 op, string opName, list pattern> : SOP1 < opName#" $dst, $src0", pattern >; +// 64-bit input, 32-bit output. +class SOP1_32_64 op, string opName, list pattern> : SOP1 < + op, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + class SOP2_32 op, string opName, list pattern> : SOP2 < op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern @@ -260,7 +266,7 @@ class SIMCInstr { multiclass VOP3_m op, dag outs, dag ins, string asm, list pattern, string opName> { - def "" : InstSI , VOP , + def "" : VOP3Common , VOP , SIMCInstr { let isPseudo = 1; } @@ -357,12 +363,13 @@ multiclass VOP2b_32 op, string opName, list pattern, } multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, - string opName, ValueType vt, PatLeaf cond> { - + string opName, ValueType vt, PatLeaf cond, bit defExec = 0> { def _e32 : VOPC < op, (ins arc:$src0, vrc:$src1), opName#"_e32 $dst, $src0, $src1", [] - >, VOP ; + >, VOP { + let Defs = !if(defExec, [VCC, EXEC], [VCC]); + } def _e64 : VOP3 < {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, @@ -375,6 +382,7 @@ multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] ) >, VOP { + let Defs = !if(defExec, [EXEC], []); let src2 = SIOperand.ZERO; let src2_modifiers = 0; } @@ -388,6 +396,14 @@ multiclass VOPC_64 op, string opName, ValueType vt = untyped, PatLeaf cond = COND_NULL> : VOPC_Helper ; +multiclass VOPCX_32 op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper ; + +multiclass VOPCX_64 op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper ; + multiclass VOP3_32 op, string opName, list pattern> : VOP3_m < op, (outs VReg_32:$dst), (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, @@ -396,7 +412,7 @@ multiclass VOP3_32 op, string opName, list pattern> : VOP3_m < opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName >; -class VOP3_64_Shift op, string opName, list pattern> : VOP3 < +class VOP3_64_32 op, string opName, list pattern> : VOP3 < op, (outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_32:$src1), opName#" $dst, $src0, $src1", pattern @@ -410,9 +426,11 @@ class VOP3_64_Shift op, string opName, list pattern> : VOP3 < class 
VOP3_64 op, string opName, list pattern> : VOP3 < op, (outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern + (ins InputMods:$src0_modifiers, VSrc_64:$src0, + InputMods:$src1_modifiers, VSrc_64:$src1, + InputMods:$src2_modifiers, VSrc_64:$src2, + InstFlag:$clamp, InstFlag:$omod), + opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern >, VOP ; //===----------------------------------------------------------------------===// @@ -475,10 +493,11 @@ class DS_Store2_Helper op, string asm, RegisterClass regClass> : DS_1A let vdst = 0; } +// 1 address, 1 data. class DS_1A1D_RET op, string asm, RegisterClass rc> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), asm#" $vdst, $addr, $data0, $offset, [M0]", []> { @@ -487,6 +506,41 @@ class DS_1A1D_RET op, string asm, RegisterClass rc> : DS_1A < let mayLoad = 1; } +// 1 address, 2 data. +class DS_1A2D_RET op, string asm, RegisterClass rc> : DS_1A < + op, + (outs rc:$vdst), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), + asm#" $vdst, $addr, $data0, $data1, $offset, [M0]", + []> { + let mayStore = 1; + let mayLoad = 1; +} + +// 1 address, 2 data. +class DS_1A2D_NORET op, string asm, RegisterClass rc> : DS_1A < + op, + (outs), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), + asm#" $addr, $data0, $data1, $offset, [M0]", + []> { + let mayStore = 1; + let mayLoad = 1; +} + +// 1 address, 1 data. +class DS_1A1D_NORET op, string asm, RegisterClass rc> : DS_1A < + op, + (outs), + (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), + asm#" $addr, $data0, $offset, [M0]", + []> { + + let data1 = 0; + let mayStore = 1; + let mayLoad = 1; +} + class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < op, (outs), diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index b216762564f4..428e49c6431c 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -96,22 +96,35 @@ def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", [(set i32:$dst, (not i32:$src0))] >; -def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; +def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", + [(set i64:$dst, (not i64:$src0))] +>; def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; -def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>; +def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", + [(set i32:$dst, (AMDGPUbrev i32:$src0))] +>; def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; } // End neverHasSideEffects = 1 ////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; ////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; -////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>; -////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>; -////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>; +def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32", + [(set i32:$dst, (ctpop i32:$src0))] +>; +def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>; + +////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>; ////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; -////def 
S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>; +def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32", + [(set i32:$dst, (cttz_zero_undef i32:$src0))] +>; ////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; -//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>; + +def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", + [(set i32:$dst, (ctlz_zero_undef i32:$src0))] +>; + //def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; @@ -320,7 +333,7 @@ def S_CMPK_EQ_I32 : SOPK < >; */ -let isCompare = 1 in { +let isCompare = 1, Defs = [SCC] in { def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; @@ -332,7 +345,7 @@ def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; -} // End isCompare = 1 +} // End isCompare = 1, Defs = [SCC] let Defs = [SCC], isCommutable = 1 in { def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; @@ -467,26 +480,26 @@ defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>; defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">; defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">; -defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">; -defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">; -defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">; -defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">; -defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">; -defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">; -defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">; -defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">; -defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">; -defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">; -defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">; -defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">; -defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">; -defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">; -defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">; +defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">; +defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">; +defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">; +defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">; +defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">; +defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">; +defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">; +defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">; +defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">; +defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">; +defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">; +defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">; +defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">; +defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">; +defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, 
"V_CMPX_NLT_F32">; +defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">; defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>; @@ -505,26 +518,26 @@ defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>; defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">; -defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">; -defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">; -defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">; -defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">; -defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">; -defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">; -defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">; -defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">; -defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">; -defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">; -defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">; -defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">; -defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">; -defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">; -defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">; +defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">; +defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">; +defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">; +defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">; +defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">; +defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">; +defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">; +defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">; +defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">; +defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">; +defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">; +defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">; +defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">; +defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">; +defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">; +defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; @@ -543,26 +556,26 @@ defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">; -defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">; -defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">; -defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">; -defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">; -defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">; -defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">; -defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">; -defm V_CMPSX_U_F32 : 
VOPC_32 <0x00000058, "V_CMPSX_U_F32">; -defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">; -defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">; -defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">; -defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">; -defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">; -defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">; -defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">; +defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">; +defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">; +defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">; +defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">; +defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">; +defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">; +defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">; +defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">; +defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">; +defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">; +defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">; +defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">; +defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">; +defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">; +defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">; +defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">; defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">; @@ -611,18 +624,18 @@ defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>; defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>; defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">; -defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">; -defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">; -defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">; -defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">; -defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">; -defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">; -defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">; +defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">; +defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">; +defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">; +defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">; +defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">; +defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">; +defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">; +defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">; defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>; @@ -633,18 +646,18 @@ defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>; defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>; defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">; -defm 
V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">; -defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">; -defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">; -defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">; -defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">; -defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">; -defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">; +defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">; +defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">; +defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">; +defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">; +defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">; +defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">; +defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">; +defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">; defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>; @@ -655,18 +668,18 @@ defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>; defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>; defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">; -defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">; -defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">; -defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">; -defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">; -defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">; -defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">; -defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">; +defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">; +defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">; +defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">; +defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">; +defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">; +defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">; +defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">; +defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">; defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>; @@ -677,30 +690,30 @@ defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>; defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>; defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">; -defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">; -defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">; -defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">; -defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">; -defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">; -defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">; -defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">; +defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">; +defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">; +defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">; 
+defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">; +defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">; +defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">; +defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">; +defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +let hasSideEffects = 1 in { +defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">; +} // End hasSideEffects = 1 defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; -let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; -} // End hasSideEffects = 1, Defs = [EXEC] +let hasSideEffects = 1 in { +defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; +} // End hasSideEffects = 1 } // End isCompare = 1 @@ -708,8 +721,97 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; // DS Instructions //===----------------------------------------------------------------------===// -def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; -def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>; + +def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>; +def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>; +def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>; +def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>; +def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>; +def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>; +def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>; +def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>; +def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>; +def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>; +def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>; +def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>; +def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>; +def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>; +def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>; +def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>; +def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>; + +def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>; +def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>; +def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>; +def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>; +def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>; +def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>; +def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>; +def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>; +def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>; +def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>; +def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>; +def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>; +def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>; +def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>; +//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>; +//def 
DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>; +def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>; +def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>; +def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>; +def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>; + +let SubtargetPredicate = isCI in { +def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>; +} // End isCI + + +def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>; +def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>; +def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>; +def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>; +def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>; +def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>; +def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>; +def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>; +def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>; +def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>; +def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>; +def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>; +def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>; +def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>; +def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>; +def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>; +def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>; + +def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>; +def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>; +def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>; +def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>; +def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>; +def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>; +def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>; +def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>; +def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>; +def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>; +def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>; +def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>; +def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>; +def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>; +//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>; +//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>; +def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>; +def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>; +def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>; +def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>; + +//let SubtargetPredicate = isCI in { +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 +//} // End isCI + +// TODO: _SRC2_* forms + def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>; def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>; @@ -973,12 +1075,25 @@ defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64", defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32", [(set f64:$dst, (fextend f32:$src0))] >; -//defm 
V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>; -//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; -//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; -//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; -//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; -//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", + [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))] +>; +defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", + [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))] +>; +defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", + [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))] +>; +defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", + [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))] +>; +defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64", + [(set i32:$dst, (fp_to_uint f64:$src0))] +>; +defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32", + [(set f64:$dst, (uint_to_fp i32:$src0))] +>; + defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", [(set f32:$dst, (AMDGPUfract f32:$src0))] >; @@ -1012,12 +1127,16 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] >; -defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", + [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))] +>; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] >; defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; -defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>; +defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", + [(set f64:$dst, (fdiv FP_ONE, (fsqrt f64:$src0)))] +>; defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", [(set f32:$dst, (fsqrt f32:$src0))] @@ -1206,7 +1325,7 @@ defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; -//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; +defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; @@ -1247,15 +1366,18 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", let neverHasSideEffects = 1 in { defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; -defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; +defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", + [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] +>; defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))] + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))] >; defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))] + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))] >; } // End neverHasSideEffects + defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", 
[]>; @@ -1298,13 +1420,13 @@ defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; -def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64", +def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] >; -def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64", +def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64", [(set i64:$dst, (srl i64:$src0, i32:$src1))] >; -def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64", +def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64", [(set i64:$dst, (sra i64:$src0, i32:$src1))] >; @@ -1521,6 +1643,7 @@ multiclass SI_SPILL_SGPR { } +defm SI_SPILL_S32 : SI_SPILL_SGPR ; defm SI_SPILL_S64 : SI_SPILL_SGPR ; defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; @@ -1544,7 +1667,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; @@ -1616,6 +1739,15 @@ def : Pat < (S_XOR_B64 $src0, $src1) >; +//===----------------------------------------------------------------------===// +// SOPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_AMDGPU_barrier_global), + (S_BARRIER) +>; + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// @@ -1856,7 +1988,10 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; - +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; def : BitConvert ; def : BitConvert ; @@ -1886,7 +2021,7 @@ def FCLAMP_SI : AMDGPUShaderInst < } def : Pat < - (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), + (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), (FCLAMP_SI f32:$src) >; @@ -2070,10 +2205,8 @@ def : Pat < // VOP3 Patterns //===----------------------------------------------------------------------===// -def : Pat < - (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)), - (V_MAD_F32 $src0, $src1, $src2) ->; +def : IMad24Pat; +def : UMad24Pat; def : Pat < (fadd f64:$src0, f64:$src1), @@ -2100,7 +2233,7 @@ def : Pat < (V_MUL_HI_I32 $src0, $src1, (i32 0)) >; -defm : BFIPatterns ; +defm : BFIPatterns ; def : ROTRPattern ; /********** ======================= **********/ @@ -2133,8 +2266,8 @@ multiclass DSWritePat { >; def : Pat < - (frag vt:$src1, i32:$src0), - (inst 0, $src0, $src1, 0) + (frag vt:$val, i32:$ptr), + (inst 0, $ptr, $val, 0) >; } @@ -2143,11 +2276,92 @@ defm : DSWritePat ; defm : DSWritePat ; defm : DSWritePat ; -def : Pat <(atomic_load_add_local i32:$ptr, i32:$val), - (DS_ADD_U32_RTN 0, $ptr, $val, 0)>; +multiclass DSAtomicRetPat { + def : Pat < + (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value), + (inst (i1 0), $ptr, $value, (as_i16imm $offset)) + >; + + def : Pat < + (frag i32:$ptr, vt:$val), + (inst 0, $ptr, $val, 0) + >; +} + +// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec +// +// We need to use something for the data0, so we set a register to +// -1. For the non-rtn variants, the manual says it does +// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max +// will always do the increment so I'm assuming it's the same. 
+// +// We also load this -1 with s_mov_b32 / s_mov_b64 even though this +// needs to be a VGPR. The SGPR copy pass will fix this, and it's +// easier since there is no v_mov_b64. +multiclass DSAtomicIncRetPat { + def : Pat < + (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)), + (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) + >; + + def : Pat < + (frag i32:$ptr, (vt 1)), + (inst 0, $ptr, (LoadImm (vt -1)), 0) + >; +} + +multiclass DSAtomicCmpXChg { + def : Pat < + (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap), + (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) + >; + + def : Pat < + (frag i32:$ptr, vt:$cmp, vt:$swap), + (inst 0, $ptr, $cmp, $swap, 0) + >; +} + + +// 32-bit atomics. +defm : DSAtomicIncRetPat; +defm : DSAtomicIncRetPat; + +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; + +defm : DSAtomicCmpXChg; + +// 64-bit atomics. +defm : DSAtomicIncRetPat; +defm : DSAtomicIncRetPat; + +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; +defm : DSAtomicRetPat; + +defm : DSAtomicCmpXChg; -def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val), - (DS_SUB_U32_RTN 0, $ptr, $val, 0)>; //===----------------------------------------------------------------------===// // MUBUF Patterns @@ -2295,7 +2509,7 @@ def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; -let Predicates = [isCI] in { +let SubtargetPredicate = isCI in { // Sea island new arithmetic instructinos let neverHasSideEffects = 1 in { @@ -2342,7 +2556,7 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; // BUFFER_LOAD_DWORDX3 // BUFFER_STORE_DWORDX3 -} // End Predicates = [isCI] +} // End iSCI /********** ====================== **********/ @@ -2354,13 +2568,13 @@ multiclass SI_INDIRECT_Pattern ; // 2. Extract without offset def : Pat< (vector_extract vt:$vec, i32:$idx), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) + (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) >; // 3. 
Insert with offset @@ -2386,20 +2600,6 @@ defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; -/********** =============== **********/ -/********** Conditions **********/ -/********** =============== **********/ - -def : Pat< - (i1 (setcc f32:$src0, f32:$src1, SETO)), - (V_CMP_O_F32_e64 $src0, $src1) ->; - -def : Pat< - (i1 (setcc f32:$src0, f32:$src1, SETUO)), - (V_CMP_U_F32_e64 $src0, $src1) ->; - //===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// @@ -2433,6 +2633,62 @@ def : Pat < (S_MOV_B32 -1), sub1) >; +class ZExt_i64_i32_Pat : Pat < + (i64 (ext i32:$src)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), + (S_MOV_B32 0), sub1) +>; + +class ZExt_i64_i1_Pat : Pat < + (i64 (ext i1:$src)), + (INSERT_SUBREG + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0), + (S_MOV_B32 0), sub1) +>; + + +def : ZExt_i64_i32_Pat; +def : ZExt_i64_i32_Pat; +def : ZExt_i64_i1_Pat; +def : ZExt_i64_i1_Pat; + +def : Pat < + (i64 (sext i32:$src)), + (INSERT_SUBREG + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), + (S_ASHR_I32 $src, 31), sub1) +>; + +def : Pat < + (i64 (sext i1:$src)), + (INSERT_SUBREG + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), + (V_CNDMASK_B32_e64 0, -1, $src), sub0), + (V_CNDMASK_B32_e64 0, -1, $src), sub1) +>; + +def : Pat < + (f32 (sint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) +>; + +def : Pat < + (f32 (uint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) +>; + +def : Pat < + (f64 (sint_to_fp i1:$src)), + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) +>; + +def : Pat < + (f64 (uint_to_fp i1:$src)), + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) +>; + //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// @@ -2454,6 +2710,18 @@ def : Pat < (S_ADD_I32 $src0, $src1) >; +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e32 $popcnt, $val) +>; + +def : Pat < + (i64 (ctpop i64:$src)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_BCNT1_I32_B64 $src), sub0), + (S_MOV_B32 0), sub1) +>; + //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 6601f2a98065..9f5ff29ad93a 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -86,6 +86,7 @@ class SILowerControlFlowPass : public MachineFunctionPass { void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); + void InitM0ForLDS(MachineBasicBlock::iterator MI); void LoadM0(MachineInstr &MI, MachineInstr *MovRel); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -320,6 +321,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } +/// The m0 register stores the maximum allowable address for LDS reads and +/// writes. Its value must be at least the size in bytes of LDS allocated by +/// the shader. For simplicity, we set it to the maximum possible value. 
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), + AMDGPU::M0).addImm(0xffffffff); +} + void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); @@ -333,52 +342,57 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addReg(Idx); MBB.insert(I, MovRel); - MI.eraseFromParent(); - return; - } + } else { - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VReg_32RegClass.contains(Idx)); + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VReg_32RegClass.contains(Idx)); - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); - // Do the actual move - MBB.insert(I, MovRel); + // Do the actual move + MBB.insert(I, MovRel); - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + } + // FIXME: Are there any values other than the LDS address clamp that need to + // be stored in the m0 register and may be live for more than a few + // instructions? If so, we should save the m0 register at the beginning + // of this function and restore it here. + // FIXME: Add support for LDS direct loads. 
+ InitM0ForLDS(&MI); MI.eraseFromParent(); } @@ -523,8 +537,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock &MBB = MF.front(); // Initialize M0 to a value that won't cause LDS access to be discarded // due to offset clamping - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::M0).addImm(0xffffffff); + InitM0ForLDS(MBB.getFirstNonPHI()); } if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) { diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index c72d549a3dd1..519a7ba63b26 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -14,21 +14,20 @@ #include "SIRegisterInfo.h" -#include "AMDGPUTargetMachine.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" using namespace llvm; -SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm) -: AMDGPURegisterInfo(tm), - TM(tm) +SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) +: AMDGPURegisterInfo(st) { } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - const SIInstrInfo *TII = static_cast(TM.getInstrInfo()); + const SIInstrInfo *TII = static_cast(ST.getInstrInfo()); TII->reserveIndirectRegisters(Reserved, MF); return Reserved; } @@ -38,15 +37,6 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return RC->getNumRegs(); } -const TargetRegisterClass * -SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { - switch (rc->getID()) { - case AMDGPU::GPRF32RegClassID: - return &AMDGPU::VReg_32RegClass; - default: return rc; - } -} - const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 36b4fcd32a89..6bcf2f015f02 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -20,24 +20,15 @@ namespace llvm { -class AMDGPUTargetMachine; - struct SIRegisterInfo : public AMDGPURegisterInfo { - AMDGPUTargetMachine &TM; - SIRegisterInfo(AMDGPUTargetMachine &tm); + SIRegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - /// \param RC is an AMDIL reg class. - /// - /// \returns the SI register class that is equivalent to \p RC. - const TargetRegisterClass * - getISARegClass(const TargetRegisterClass *RC) const override; - /// \brief get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index da88820f52d2..9df005401897 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -47,31 +47,27 @@ class SparcAsmParser : public MCTargetAsmParser { // public interface of the MCTargetAsmParser. 
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands) override; + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; // Custom parse functions for Sparc specific operands. - OperandMatchResultTy - parseMEMOperand(SmallVectorImpl &Operands); + OperandMatchResultTy parseMEMOperand(OperandVector &Operands); - OperandMatchResultTy - parseOperand(SmallVectorImpl &Operands, - StringRef Name); + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name); OperandMatchResultTy - parseSparcAsmOperand(SparcOperand *&Operand, bool isCall = false); + parseSparcAsmOperand(std::unique_ptr &Operand, + bool isCall = false); - OperandMatchResultTy - parseBranchModifiers(SmallVectorImpl &Operands); + OperandMatchResultTy parseBranchModifiers(OperandVector &Operands); // returns true if Tok is matched to a register and returns register in RegNo. bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo, @@ -153,8 +149,6 @@ class SparcOperand : public MCParsedAsmOperand { SMLoc StartLoc, EndLoc; - SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} - struct Token { const char *Data; unsigned Length; @@ -182,6 +176,8 @@ class SparcOperand : public MCParsedAsmOperand { struct MemOp Mem; }; public: + SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + bool isToken() const override { return Kind == k_Token; } bool isReg() const override { return Kind == k_Register; } bool isImm() const override { return Kind == k_Immediate; } @@ -291,8 +287,8 @@ class SparcOperand : public MCParsedAsmOperand { addExpr(Inst, Expr); } - static SparcOperand *CreateToken(StringRef Str, SMLoc S) { - SparcOperand *Op = new SparcOperand(k_Token); + static std::unique_ptr CreateToken(StringRef Str, SMLoc S) { + auto Op = make_unique(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -300,10 +296,9 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static SparcOperand *CreateReg(unsigned RegNum, - unsigned Kind, - SMLoc S, SMLoc E) { - SparcOperand *Op = new SparcOperand(k_Register); + static std::unique_ptr CreateReg(unsigned RegNum, unsigned Kind, + SMLoc S, SMLoc E) { + auto Op = make_unique(k_Register); Op->Reg.RegNum = RegNum; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; Op->StartLoc = S; @@ -311,49 +306,51 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static SparcOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - SparcOperand *Op = new SparcOperand(k_Immediate); + static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + auto Op = make_unique(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static SparcOperand *MorphToDoubleReg(SparcOperand *Op) { - unsigned Reg = Op->getReg(); - assert(Op->Reg.Kind == rk_FloatReg); + static bool MorphToDoubleReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); + assert(Op.Reg.Kind == rk_FloatReg); unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) - return 
nullptr; - Op->Reg.RegNum = DoubleRegs[regIdx / 2]; - Op->Reg.Kind = rk_DoubleReg; - return Op; + return false; + Op.Reg.RegNum = DoubleRegs[regIdx / 2]; + Op.Reg.Kind = rk_DoubleReg; + return true; } - static SparcOperand *MorphToQuadReg(SparcOperand *Op) { - unsigned Reg = Op->getReg(); + static bool MorphToQuadReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); unsigned regIdx = 0; - switch (Op->Reg.Kind) { - default: assert(0 && "Unexpected register kind!"); + switch (Op.Reg.Kind) { + default: llvm_unreachable("Unexpected register kind!"); case rk_FloatReg: regIdx = Reg - Sparc::F0; if (regIdx % 4 || regIdx > 31) - return nullptr; + return false; Reg = QuadFPRegs[regIdx / 4]; break; case rk_DoubleReg: regIdx = Reg - Sparc::D0; if (regIdx % 2 || regIdx > 31) - return nullptr; + return false; Reg = QuadFPRegs[regIdx / 2]; break; } - Op->Reg.RegNum = Reg; - Op->Reg.Kind = rk_QuadReg; - return Op; + Op.Reg.RegNum = Reg; + Op.Reg.Kind = rk_QuadReg; + return true; } - static SparcOperand *MorphToMEMrr(unsigned Base, SparcOperand *Op) { + static std::unique_ptr + MorphToMEMrr(unsigned Base, std::unique_ptr Op) { unsigned offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; @@ -362,10 +359,9 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static SparcOperand *CreateMEMri(unsigned Base, - const MCExpr *Off, - SMLoc S, SMLoc E) { - SparcOperand *Op = new SparcOperand(k_MemoryImm); + static std::unique_ptr + CreateMEMri(unsigned Base, const MCExpr *Off, SMLoc S, SMLoc E) { + auto Op = make_unique(k_MemoryImm); Op->Mem.Base = Base; Op->Mem.OffsetReg = 0; Op->Mem.Off = Off; @@ -374,7 +370,8 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static SparcOperand *MorphToMEMri(unsigned Base, SparcOperand *Op) { + static std::unique_ptr + MorphToMEMri(unsigned Base, std::unique_ptr Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryImm; Op->Mem.Base = Base; @@ -386,11 +383,11 @@ class SparcOperand : public MCParsedAsmOperand { } // end namespace -bool SparcAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; SmallVector Instructions; unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, @@ -415,7 +412,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((SparcOperand*) Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((SparcOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -450,11 +447,9 @@ ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features, unsigned VariantID); -bool SparcAsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands) -{ +bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { // First operand in MCInst is instruction mnemonic. 
Operands.push_back(SparcOperand::CreateToken(Name, NameLoc)); @@ -548,9 +543,8 @@ bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) { return false; } -SparcAsmParser::OperandMatchResultTy SparcAsmParser:: -parseMEMOperand(SmallVectorImpl &Operands) -{ +SparcAsmParser::OperandMatchResultTy +SparcAsmParser::parseMEMOperand(OperandVector &Operands) { SMLoc S, E; unsigned BaseReg = 0; @@ -575,23 +569,20 @@ parseMEMOperand(SmallVectorImpl &Operands) break; } - SparcOperand *Offset = nullptr; + std::unique_ptr Offset; OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset); if (ResTy != MatchOperand_Success || !Offset) return MatchOperand_NoMatch; - Offset = (Offset->isImm() - ? SparcOperand::MorphToMEMri(BaseReg, Offset) - : SparcOperand::MorphToMEMrr(BaseReg, Offset)); + Operands.push_back( + Offset->isImm() ? SparcOperand::MorphToMEMri(BaseReg, std::move(Offset)) + : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset))); - Operands.push_back(Offset); return MatchOperand_Success; } -SparcAsmParser::OperandMatchResultTy SparcAsmParser:: -parseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic) -{ +SparcAsmParser::OperandMatchResultTy +SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -637,21 +628,21 @@ parseOperand(SmallVectorImpl &Operands, return MatchOperand_Success; } - SparcOperand *Op = nullptr; + std::unique_ptr Op; ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call")); if (ResTy != MatchOperand_Success || !Op) return MatchOperand_ParseFail; // Push the parsed operand into the list of operands - Operands.push_back(Op); + Operands.push_back(std::move(Op)); return MatchOperand_Success; } SparcAsmParser::OperandMatchResultTy -SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall) -{ +SparcAsmParser::parseSparcAsmOperand(std::unique_ptr &Op, + bool isCall) { SMLoc S = Parser.getTok().getLoc(); SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); @@ -718,8 +709,8 @@ SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall) return (Op) ? MatchOperand_Success : MatchOperand_ParseFail; } -SparcAsmParser::OperandMatchResultTy SparcAsmParser:: -parseBranchModifiers(SmallVectorImpl &Operands) { +SparcAsmParser::OperandMatchResultTy +SparcAsmParser::parseBranchModifiers(OperandVector &Operands) { // parse (,a|,pn|,pt)+ @@ -928,18 +919,14 @@ extern "C" void LLVMInitializeSparcAsmParser() { #define GET_MATCHER_IMPLEMENTATION #include "SparcGenAsmMatcher.inc" - - -unsigned SparcAsmParser:: -validateTargetOperandClass(MCParsedAsmOperand *GOp, - unsigned Kind) -{ - SparcOperand *Op = (SparcOperand*)GOp; - if (Op->isFloatOrDoubleReg()) { +unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp, + unsigned Kind) { + SparcOperand &Op = (SparcOperand &)GOp; + if (Op.isFloatOrDoubleReg()) { switch (Kind) { default: break; case MCK_DFPRegs: - if (!Op->isFloatReg() || SparcOperand::MorphToDoubleReg(Op)) + if (!Op.isFloatReg() || SparcOperand::MorphToDoubleReg(Op)) return MCTargetAsmParser::Match_Success; break; case MCK_QFPRegs: diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 7d517b699b22..0fbac218cb6b 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -201,7 +201,7 @@ namespace { } void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { // FIXME. 
- assert(0 && "relaxInstruction() unimplemented"); + llvm_unreachable("relaxInstruction() unimplemented"); } bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index ae57fdc4eebd..3ccdd038fb33 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -124,7 +124,7 @@ SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name) Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) { switch (Kind) { - default: assert(0 && "Unhandled SparcMCExpr::VariantKind"); + default: llvm_unreachable("Unhandled SparcMCExpr::VariantKind"); case VK_Sparc_LO: return Sparc::fixup_sparc_lo10; case VK_Sparc_HI: return Sparc::fixup_sparc_hi22; case VK_Sparc_H44: return Sparc::fixup_sparc_h44; diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp index c775e9e83ee6..d0eec98b5e91 100644 --- a/lib/Target/Sparc/SparcJITInfo.cpp +++ b/lib/Target/Sparc/SparcJITInfo.cpp @@ -213,7 +213,8 @@ extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) { void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { - assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction"); + llvm_unreachable("FIXME: Implement SparcJITInfo::" + "replaceMachineCodeForFunction"); } diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp index eb36d2940b76..c2b897c6081a 100644 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp +++ b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "sparc-selectiondag-info" SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { + : TargetSelectionDAGInfo(TM.getDataLayout()) { } SparcSelectionDAGInfo::~SparcSelectionDAGInfo() { diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index b759e9ae75ed..7d043388e8cf 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -44,7 +44,7 @@ class SparcTargetMachine : public LLVMTargetMachine { const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } - const SparcSubtarget *getSubtargetImpl() const override{ return &Subtarget; } + const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; } const SparcRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 71de64fff87e..758be41ce263 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -104,10 +104,6 @@ class SystemZOperand : public MCParsedAsmOperand { MemOp Mem; }; - SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc) - : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) - {} - void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. if (!Expr) @@ -119,40 +115,44 @@ class SystemZOperand : public MCParsedAsmOperand { } public: + SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc) + : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) {} + // Create particular kinds of operand. 
- static SystemZOperand *createInvalid(SMLoc StartLoc, SMLoc EndLoc) { - return new SystemZOperand(KindInvalid, StartLoc, EndLoc); + static std::unique_ptr createInvalid(SMLoc StartLoc, + SMLoc EndLoc) { + return make_unique(KindInvalid, StartLoc, EndLoc); } - static SystemZOperand *createToken(StringRef Str, SMLoc Loc) { - SystemZOperand *Op = new SystemZOperand(KindToken, Loc, Loc); + static std::unique_ptr createToken(StringRef Str, SMLoc Loc) { + auto Op = make_unique(KindToken, Loc, Loc); Op->Token.Data = Str.data(); Op->Token.Length = Str.size(); return Op; } - static SystemZOperand *createReg(RegisterKind Kind, unsigned Num, - SMLoc StartLoc, SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindReg, StartLoc, EndLoc); + static std::unique_ptr + createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique(KindReg, StartLoc, EndLoc); Op->Reg.Kind = Kind; Op->Reg.Num = Num; return Op; } - static SystemZOperand *createAccessReg(unsigned Num, SMLoc StartLoc, - SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindAccessReg, StartLoc, EndLoc); + static std::unique_ptr + createAccessReg(unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique(KindAccessReg, StartLoc, EndLoc); Op->AccessReg = Num; return Op; } - static SystemZOperand *createImm(const MCExpr *Expr, SMLoc StartLoc, - SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindImm, StartLoc, EndLoc); + static std::unique_ptr + createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique(KindImm, StartLoc, EndLoc); Op->Imm = Expr; return Op; } - static SystemZOperand *createMem(RegisterKind RegKind, unsigned Base, - const MCExpr *Disp, unsigned Index, - const MCExpr *Length, SMLoc StartLoc, - SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindMem, StartLoc, EndLoc); + static std::unique_ptr + createMem(RegisterKind RegKind, unsigned Base, const MCExpr *Disp, + unsigned Index, const MCExpr *Length, SMLoc StartLoc, + SMLoc EndLoc) { + auto Op = make_unique(KindMem, StartLoc, EndLoc); Op->Mem.RegKind = RegKind; Op->Mem.Base = Base; Op->Mem.Index = Index; @@ -313,21 +313,19 @@ class SystemZAsmParser : public MCTargetAsmParser { bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs, bool IsAddress = false); - OperandMatchResultTy - parseRegister(SmallVectorImpl &Operands, - RegisterGroup Group, const unsigned *Regs, RegisterKind Kind); + OperandMatchResultTy parseRegister(OperandVector &Operands, + RegisterGroup Group, const unsigned *Regs, + RegisterKind Kind); bool parseAddress(unsigned &Base, const MCExpr *&Disp, unsigned &Index, const MCExpr *&Length, const unsigned *Regs, RegisterKind RegKind); - OperandMatchResultTy - parseAddress(SmallVectorImpl &Operands, - const unsigned *Regs, RegisterKind RegKind, - MemoryKind MemKind); + OperandMatchResultTy parseAddress(OperandVector &Operands, + const unsigned *Regs, RegisterKind RegKind, + MemoryKind MemKind); - bool parseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic); + bool parseOperand(OperandVector &Operands, StringRef Mnemonic); public: SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, @@ -343,87 +341,66 @@ class SystemZAsmParser : public MCTargetAsmParser { // Override MCTargetAsmParser. 
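The SystemZOperand hunks above follow the same pattern as the Sparc and X86 parsers elsewhere in this patch: operand factories return std::unique_ptr instead of raw new'd pointers, and the parsed-operand list becomes an owning OperandVector, so the manual delete calls scattered through the match code can go away. A stand-alone sketch of the ownership model, with Operand, OperandList and createToken as illustrative stand-ins for the real MC classes (std::make_unique standing in for llvm::make_unique):

#include <memory>
#include <string>
#include <vector>

struct Operand {                                   // stand-in for MCParsedAsmOperand
  std::string Token;
  explicit Operand(std::string T) : Token(std::move(T)) {}
};

// Before: `static Operand *createToken(...)`, with callers responsible for delete.
// After: the factory hands ownership straight to the caller.
static std::unique_ptr<Operand> createToken(std::string Str) {
  return std::make_unique<Operand>(std::move(Str));
}

using OperandList = std::vector<std::unique_ptr<Operand>>;  // stand-in for OperandVector

int main() {
  OperandList Operands;
  Operands.push_back(createToken("mov"));   // ownership moves into the vector
  Operands[0] = createToken("int3");        // the replaced operand is freed automatically
  return Operands.size() == 1 ? 0 : 1;
}

This is also why the later X86AsmParser hunks can simply overwrite Operands[1] or Operands.back() where they previously had to delete the old operand first.
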
bool ParseDirective(AsmToken DirectiveID) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) - override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; // Used by the TableGen code to parse particular operand types. - OperandMatchResultTy - parseGR32(SmallVectorImpl &Operands) { + OperandMatchResultTy parseGR32(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg); } - OperandMatchResultTy - parseGRH32(SmallVectorImpl &Operands) { + OperandMatchResultTy parseGRH32(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg); } - OperandMatchResultTy - parseGRX32(SmallVectorImpl &Operands) { + OperandMatchResultTy parseGRX32(OperandVector &Operands) { llvm_unreachable("GRX32 should only be used for pseudo instructions"); } - OperandMatchResultTy - parseGR64(SmallVectorImpl &Operands) { + OperandMatchResultTy parseGR64(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg); } - OperandMatchResultTy - parseGR128(SmallVectorImpl &Operands) { + OperandMatchResultTy parseGR128(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg); } - OperandMatchResultTy - parseADDR32(SmallVectorImpl &Operands) { + OperandMatchResultTy parseADDR32(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg); } - OperandMatchResultTy - parseADDR64(SmallVectorImpl &Operands) { + OperandMatchResultTy parseADDR64(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg); } - OperandMatchResultTy - parseADDR128(SmallVectorImpl &Operands) { + OperandMatchResultTy parseADDR128(OperandVector &Operands) { llvm_unreachable("Shouldn't be used as an operand"); } - OperandMatchResultTy - parseFP32(SmallVectorImpl &Operands) { + OperandMatchResultTy parseFP32(OperandVector &Operands) { return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg); } - OperandMatchResultTy - parseFP64(SmallVectorImpl &Operands) { + OperandMatchResultTy parseFP64(OperandVector &Operands) { return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg); } - OperandMatchResultTy - parseFP128(SmallVectorImpl &Operands) { + OperandMatchResultTy parseFP128(OperandVector &Operands) { return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg); } - OperandMatchResultTy - parseBDAddr32(SmallVectorImpl &Operands) { + OperandMatchResultTy parseBDAddr32(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR32Regs, ADDR32Reg, BDMem); } - OperandMatchResultTy - parseBDAddr64(SmallVectorImpl &Operands) { + OperandMatchResultTy parseBDAddr64(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDMem); } - OperandMatchResultTy - parseBDXAddr64(SmallVectorImpl &Operands) { + OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDXMem); } - OperandMatchResultTy - parseBDLAddr64(SmallVectorImpl &Operands) { + OperandMatchResultTy 
parseBDLAddr64(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem); } - OperandMatchResultTy - parseAccessReg(SmallVectorImpl &Operands); - OperandMatchResultTy - parsePCRel(SmallVectorImpl &Operands, - int64_t MinVal, int64_t MaxVal); - OperandMatchResultTy - parsePCRel16(SmallVectorImpl &Operands) { + OperandMatchResultTy parseAccessReg(OperandVector &Operands); + OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal, + int64_t MaxVal); + OperandMatchResultTy parsePCRel16(OperandVector &Operands) { return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1); } - OperandMatchResultTy - parsePCRel32(SmallVectorImpl &Operands) { + OperandMatchResultTy parsePCRel32(OperandVector &Operands) { return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1); } }; @@ -497,9 +474,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group, // Parse a register and add it to Operands. The other arguments are as above. SystemZAsmParser::OperandMatchResultTy -SystemZAsmParser::parseRegister(SmallVectorImpl &Operands, - RegisterGroup Group, const unsigned *Regs, - RegisterKind Kind) { +SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group, + const unsigned *Regs, RegisterKind Kind) { if (Parser.getTok().isNot(AsmToken::Percent)) return MatchOperand_NoMatch; @@ -566,9 +542,8 @@ bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp, // Parse a memory operand and add it to Operands. The other arguments // are as above. SystemZAsmParser::OperandMatchResultTy -SystemZAsmParser::parseAddress(SmallVectorImpl &Operands, - const unsigned *Regs, RegisterKind RegKind, - MemoryKind MemKind) { +SystemZAsmParser::parseAddress(OperandVector &Operands, const unsigned *Regs, + RegisterKind RegKind, MemoryKind MemKind) { SMLoc StartLoc = Parser.getTok().getLoc(); unsigned Base, Index; const MCExpr *Disp; @@ -622,9 +597,9 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, return false; } -bool SystemZAsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) { +bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { Operands.push_back(SystemZOperand::createToken(Name, NameLoc)); // Read the remaining operands. @@ -655,9 +630,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, return false; } -bool SystemZAsmParser:: -parseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic) { +bool SystemZAsmParser::parseOperand(OperandVector &Operands, + StringRef Mnemonic) { // Check if the current operand has a custom associated parser, if so, try to // custom parse the operand, or fallback to the general approach. 
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -700,11 +674,11 @@ parseOperand(SmallVectorImpl &Operands, return false; } -bool SystemZAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; unsigned MatchResult; @@ -739,7 +713,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((SystemZOperand*)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((SystemZOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -753,8 +727,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Unexpected match type"); } -SystemZAsmParser::OperandMatchResultTy SystemZAsmParser:: -parseAccessReg(SmallVectorImpl &Operands) { +SystemZAsmParser::OperandMatchResultTy +SystemZAsmParser::parseAccessReg(OperandVector &Operands) { if (Parser.getTok().isNot(AsmToken::Percent)) return MatchOperand_NoMatch; @@ -768,9 +742,9 @@ parseAccessReg(SmallVectorImpl &Operands) { return MatchOperand_Success; } -SystemZAsmParser::OperandMatchResultTy SystemZAsmParser:: -parsePCRel(SmallVectorImpl &Operands, - int64_t MinVal, int64_t MaxVal) { +SystemZAsmParser::OperandMatchResultTy +SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, + int64_t MaxVal) { MCContext &Ctx = getContext(); MCStreamer &Out = getStreamer(); const MCExpr *Expr; diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 97abee303b09..528227bd3c0a 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -20,7 +20,7 @@ using namespace llvm; SystemZSelectionDAGInfo:: SystemZSelectionDAGInfo(const SystemZTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { + : TargetSelectionDAGInfo(TM.getDataLayout()) { } SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() { diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 4c9ce29c7e2a..1fca067ad2e2 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -65,7 +65,8 @@ bool SystemZPassConfig::addInstSelector() { } bool SystemZPassConfig::addPreSched2() { - if (getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond()) + if (getOptLevel() != CodeGenOpt::None && + getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond()) addPass(&IfConverterID); return true; } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index f79cdfd0a791..95c8cb66f402 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -28,23 +28,6 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; -//--------------------------------------------------------------------------- -// Command-line options that tend to be useful on more than one back-end. 
-// - -namespace llvm { - bool AsmVerbosityDefault(false); -} - -static cl::opt -DataSections("fdata-sections", - cl::desc("Emit data into separate sections"), - cl::init(false)); -static cl::opt -FunctionSections("ffunction-sections", - cl::desc("Emit functions into separate sections"), - cl::init(false)); - //--------------------------------------------------------------------------- // TargetMachine Class // @@ -105,8 +88,8 @@ CodeModel::Model TargetMachine::getCodeModel() const { } /// Get the IR-specified TLS model for Var. -static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) { - switch (Var->getThreadLocalMode()) { +static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) { + switch (GV->getThreadLocalMode()) { case GlobalVariable::NotThreadLocal: llvm_unreachable("getSelectedTLSModel for non-TLS variable"); break; @@ -123,19 +106,13 @@ static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) { } TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast(GV)) - GV = GA->getAliasee(); - const GlobalVariable *Var = cast(GV); - - bool isLocal = Var->hasLocalLinkage(); - bool isDeclaration = Var->isDeclaration(); + bool isLocal = GV->hasLocalLinkage(); + bool isDeclaration = GV->isDeclaration(); bool isPIC = getRelocationModel() == Reloc::PIC_; bool isPIE = Options.PositionIndependentExecutable; // FIXME: what should we do for protected and internal visibility? // For variables, is internal different from hidden? - bool isHidden = Var->hasHiddenVisibility(); + bool isHidden = GV->hasHiddenVisibility(); TLSModel::Model Model; if (isPIC && !isPIE) { @@ -151,7 +128,7 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { } // If the user specified a more specific model, use that. 
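The comment this hunk ends on ("If the user specified a more specific model, use that") is the key rule: getTLSModel derives a conservative model from the relocation settings, then lets an explicit thread_local(...) model from the IR win only when it is stricter, via the SelectedModel > Model check that follows. A stand-alone sketch of that policy, with visibility handling omitted for brevity; the enum order below matches LLVM's TLSModel (GeneralDynamic < LocalDynamic < InitialExec < LocalExec), which is what makes the > comparison mean "more specific":

enum Model { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

static Model pickTLSModel(bool IsPIC, bool IsPIE, bool IsLocal, bool IsDecl,
                          Model IRSelected) {
  // Conservative default from the codegen flags.
  Model M;
  if (IsPIC && !IsPIE)
    M = IsLocal ? LocalDynamic : GeneralDynamic;      // PIC code
  else
    M = IsDecl ? InitialExec : LocalExec;             // static or PIE code
  // "If the user specified a more specific model, use that."
  return IRSelected > M ? IRSelected : M;
}

// e.g. pickTLSModel(/*PIC*/true, /*PIE*/false, /*local*/false, /*decl*/true,
//                   InitialExec) yields InitialExec rather than GeneralDynamic.
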
- TLSModel::Model SelectedModel = getSelectedTLSModel(Var); + TLSModel::Model SelectedModel = getSelectedTLSModel(GV); if (SelectedModel > Model) return SelectedModel; @@ -171,28 +148,28 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { CodeGenInfo->setOptLevel(Level); } -bool TargetMachine::getAsmVerbosityDefault() { - return AsmVerbosityDefault; +bool TargetMachine::getAsmVerbosityDefault() const { + return Options.MCOptions.AsmVerbose; } void TargetMachine::setAsmVerbosityDefault(bool V) { - AsmVerbosityDefault = V; + Options.MCOptions.AsmVerbose = V; } -bool TargetMachine::getFunctionSections() { - return FunctionSections; +bool TargetMachine::getFunctionSections() const { + return Options.FunctionSections; } -bool TargetMachine::getDataSections() { - return DataSections; +bool TargetMachine::getDataSections() const { + return Options.DataSections; } void TargetMachine::setFunctionSections(bool V) { - FunctionSections = V; + Options.FunctionSections = V; } void TargetMachine::setDataSections(bool V) { - DataSections = V; + Options.DataSections = V; } void TargetMachine::getNameWithPrefix(SmallVectorImpl &Name, diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 3ca13dac035d..0c388f8fb26c 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -43,6 +43,10 @@ bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } +bool TargetSubtargetInfo::enablePostMachineScheduler() const { + return false; +} + bool TargetSubtargetInfo::enablePostRAScheduler( CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 85c7bf009726..623479361c29 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -46,21 +47,21 @@ class X86AddressSanitizer : public X86AsmInstrumentation { virtual ~X86AddressSanitizer() {} // X86AsmInstrumentation implementation: - virtual void InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override { + virtual void InstrumentInstruction(const MCInst &Inst, + OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { InstrumentMOV(Inst, Operands, Ctx, MII, Out); } // Should be implemented differently in x86_32 and x86_64 subclasses. 
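A few hunks above, TargetMachine.cpp drops the file-static cl::opt globals for -ffunction-sections, -fdata-sections and the asm-verbosity default and reads the equivalent fields out of TargetOptions/MCTargetOptions instead; the practical effect is that these switches become per-TargetMachine state rather than process-wide globals. A rough sketch of that direction with stand-in struct names, not the real LLVM types:

struct MCOptionsStub { bool AsmVerbose = false; };
struct OptionsStub {
  bool FunctionSections = false;
  bool DataSections = false;
  MCOptionsStub MCOptions;
};

class TargetMachineStub {
  OptionsStub Options;
public:
  // Two TargetMachines in the same process no longer share these flags.
  bool getDataSections() const { return Options.DataSections; }
  void setDataSections(bool V) { Options.DataSections = V; }
  bool getAsmVerbosityDefault() const { return Options.MCOptions.AsmVerbose; }
  void setAsmVerbosityDefault(bool V) { Options.MCOptions.AsmVerbose = V; }
};
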
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, + virtual void InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) = 0; - void InstrumentMemOperand(MCParsedAsmOperand *Op, unsigned AccessSize, + void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out); - void InstrumentMOV(const MCInst &Inst, - SmallVectorImpl &Operands, + void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst) { Out.EmitInstruction(Inst, STI); @@ -70,24 +71,26 @@ class X86AddressSanitizer : public X86AsmInstrumentation { const MCSubtargetInfo &STI; }; -void X86AddressSanitizer::InstrumentMemOperand( - MCParsedAsmOperand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - assert(Op && Op->isMem() && "Op should be a memory operand."); +void X86AddressSanitizer::InstrumentMemOperand(MCParsedAsmOperand &Op, + unsigned AccessSize, + bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + assert(Op.isMem() && "Op should be a memory operand."); assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && "AccessSize should be a power of two, less or equal than 16."); - X86Operand *MemOp = static_cast(Op); + X86Operand &MemOp = static_cast(Op); // FIXME: get rid of this limitation. - if (IsStackReg(MemOp->getMemBaseReg()) || IsStackReg(MemOp->getMemIndexReg())) + if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg())) return; InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out); } -void X86AddressSanitizer::InstrumentMOV( - const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) { +void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, + OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) { // Access size in bytes. unsigned AccessSize = 0; @@ -124,8 +127,9 @@ void X86AddressSanitizer::InstrumentMOV( const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { - MCParsedAsmOperand *Op = Operands[Ix]; - if (Op && Op->isMem()) + assert(Operands[Ix]); + MCParsedAsmOperand &Op = *Operands[Ix]; + if (Op.isMem()) InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out); } } @@ -136,21 +140,23 @@ class X86AddressSanitizer32 : public X86AddressSanitizer { : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer32() {} - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, + virtual void InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) override; }; -void X86AddressSanitizer32::InstrumentMemOperandImpl( - X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { +void X86AddressSanitizer32::InstrumentMemOperandImpl(X86Operand &Op, + unsigned AccessSize, + bool IsWrite, + MCContext &Ctx, + MCStreamer &Out) { // FIXME: emit .cfi directives for correct stack unwinding. 
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); { MCInst Inst; Inst.setOpcode(X86::LEA32r); Inst.addOperand(MCOperand::CreateReg(X86::EAX)); - Op->addMemOperands(Inst, 5); + Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); @@ -161,8 +167,7 @@ void X86AddressSanitizer32::InstrumentMemOperandImpl( MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr)); } - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri).addReg(X86::ESP) - .addReg(X86::ESP).addImm(4)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } @@ -172,12 +177,12 @@ class X86AddressSanitizer64 : public X86AddressSanitizer { : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer64() {} - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, + virtual void InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) override; }; -void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op, +void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, @@ -202,7 +207,7 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op, MCInst Inst; Inst.setOpcode(X86::LEA64r); Inst.addOperand(MCOperand::CreateReg(X86::RDI)); - Op->addMemOperands(Inst, 5); + Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } { @@ -233,9 +238,11 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op, X86AsmInstrumentation::X86AsmInstrumentation() {} X86AsmInstrumentation::~X86AsmInstrumentation() {} -void X86AsmInstrumentation::InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {} +void X86AsmInstrumentation::InstrumentInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) {} X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 0369b14d98d2..1bc3c0930153 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -12,6 +12,8 @@ #include "llvm/ADT/SmallVector.h" +#include + namespace llvm { class MCContext; @@ -35,10 +37,9 @@ class X86AsmInstrumentation { // Instruments Inst. Should be called just before the original // instruction is sent to Out. 
virtual void InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out); + const MCInst &Inst, + SmallVectorImpl> &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); protected: friend X86AsmInstrumentation * diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d3e695e31ab1..b30eeeb0e037 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -618,52 +618,52 @@ class X86AsmParser : public MCTargetAsmParser { return Error(L, Msg, Ranges, MatchingInlineAsm); } - X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) { + std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { Error(Loc, Msg); return nullptr; } - X86Operand *DefaultMemSIOperand(SMLoc Loc); - X86Operand *DefaultMemDIOperand(SMLoc Loc); - X86Operand *ParseOperand(); - X86Operand *ParseATTOperand(); - X86Operand *ParseIntelOperand(); - X86Operand *ParseIntelOffsetOfOperator(); + std::unique_ptr DefaultMemSIOperand(SMLoc Loc); + std::unique_ptr DefaultMemDIOperand(SMLoc Loc); + std::unique_ptr ParseOperand(); + std::unique_ptr ParseATTOperand(); + std::unique_ptr ParseIntelOperand(); + std::unique_ptr ParseIntelOffsetOfOperator(); bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); - X86Operand *ParseIntelOperator(unsigned OpKind); - X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); - X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, - unsigned Size); + std::unique_ptr ParseIntelOperator(unsigned OpKind); + std::unique_ptr + ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + std::unique_ptr + ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); - X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, unsigned Size); + std::unique_ptr ParseIntelBracExpression(unsigned SegReg, + SMLoc Start, + int64_t ImmDisp, + unsigned Size); bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End); - X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + std::unique_ptr ParseMemOperand(unsigned SegReg, SMLoc StartLoc); - X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info); + std::unique_ptr + CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc Start, + SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool processInstruction(MCInst &Inst, - const SmallVectorImpl &Ops); + bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds /// instrumentation around Inst. 
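The ErrorOperand hunk above changes its return type to std::nullptr_t, a small trick worth spelling out: a nullptr_t value converts implicitly to std::unique_ptr<T> for any T, so the one error helper keeps working from every Parse* routine even though those routines now return different unique_ptr types. A stand-alone sketch with made-up OperandA/OperandB/parse* names:

#include <cstddef>
#include <memory>

struct OperandA {};
struct OperandB {};

static std::nullptr_t errorOperand(const char * /*Msg*/) {
  // report the diagnostic somewhere, then signal failure
  return nullptr;
}

static std::unique_ptr<OperandA> parseA(bool Fail) {
  if (Fail)
    return errorOperand("bad A");   // converts to unique_ptr<OperandA>
  return std::make_unique<OperandA>();
}

static std::unique_ptr<OperandB> parseB(bool Fail) {
  if (Fail)
    return errorOperand("bad B");   // same helper, different return type
  return std::make_unique<OperandB>();
}

int main() {
  return (parseA(true) == nullptr && parseB(false) != nullptr) ? 0 : 1;
}
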
- void EmitInstruction(MCInst &Inst, - SmallVectorImpl &Operands, - MCStreamer &Out); + void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; /// doSrcDstMatch - Returns true if operands are matching in their @@ -674,8 +674,8 @@ class X86AsmParser : public MCTargetAsmParser { /// Parses AVX512 specific operand primitives: masked registers ({%k}, {z}) /// and memory broadcasting ({1to}) primitives, updating Operands vector if required. /// \return \c true if no parsing errors occurred, \c false otherwise. - bool HandleAVX512Operand(SmallVectorImpl &Operands, - const MCParsedAsmOperand &Op); + bool HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op); bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? @@ -725,9 +725,8 @@ class X86AsmParser : public MCTargetAsmParser { bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; }; @@ -908,7 +907,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } -X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { +std::unique_ptr X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); @@ -916,7 +915,7 @@ X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { +std::unique_ptr X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); @@ -924,7 +923,7 @@ X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -X86Operand *X86AsmParser::ParseOperand() { +std::unique_ptr X86AsmParser::ParseOperand() { if (isParsingIntelSyntax()) return ParseIntelOperand(); return ParseATTOperand(); @@ -946,12 +945,10 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { return Size; } -X86Operand * -X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info){ +std::unique_ptr X86AsmParser::CreateMemForInlineAsm( + unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info) { // If this is not a VarDecl then assume it is a FuncDecl or some other label // reference. We need an 'r' constraint here, so we need to create register // operand to ensure proper matching. 
Just pick a GPR based on the size of @@ -1164,9 +1161,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return false; } -X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, - unsigned Size) { +std::unique_ptr +X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size) { const AsmToken &Tok = Parser.getTok(); SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); if (getLexer().isNot(AsmToken::LBrac)) @@ -1270,9 +1267,9 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, } /// \brief Parse intel style segment override. -X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, - SMLoc Start, - unsigned Size) { +std::unique_ptr +X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, + unsigned Size) { assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); const AsmToken &Tok = Parser.getTok(); // Eat colon. if (Tok.isNot(AsmToken::Colon)) @@ -1321,8 +1318,9 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, } /// ParseIntelMemOperand - Parse intel style memory operand. -X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, - unsigned Size) { +std::unique_ptr X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, + SMLoc Start, + unsigned Size) { const AsmToken &Tok = Parser.getTok(); SMLoc End; @@ -1425,7 +1423,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, /// Parse the 'offset' operator. This operator is used to specify the /// location rather then the content of a variable. -X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { +std::unique_ptr X86AsmParser::ParseIntelOffsetOfOperator() { const AsmToken &Tok = Parser.getTok(); SMLoc OffsetOfLoc = Tok.getLoc(); Parser.Lex(); // Eat offset. @@ -1462,7 +1460,7 @@ enum IntelOperatorKind { /// variable. A variable's size is the product of its LENGTH and TYPE. The /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. -X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { +std::unique_ptr X86AsmParser::ParseIntelOperator(unsigned OpKind) { const AsmToken &Tok = Parser.getTok(); SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. @@ -1495,7 +1493,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { return X86Operand::CreateImm(Imm, Start, End); } -X86Operand *X86AsmParser::ParseIntelOperand() { +std::unique_ptr X86AsmParser::ParseIntelOperand() { const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -1577,7 +1575,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { return ParseIntelMemOperand(/*Disp=*/0, Start, Size); } -X86Operand *X86AsmParser::ParseATTOperand() { +std::unique_ptr X86AsmParser::ParseATTOperand() { switch (getLexer().getKind()) { default: // Parse a memory operand with no segment register. @@ -1613,9 +1611,8 @@ X86Operand *X86AsmParser::ParseATTOperand() { } } -bool -X86AsmParser::HandleAVX512Operand(SmallVectorImpl &Operands, - const MCParsedAsmOperand &Op) { +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op) { if(STI.getFeatureBits() & X86::FeatureAVX512) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. 
@@ -1653,8 +1650,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl &Operands } else { // Parse mask register {%k1} Operands.push_back(X86Operand::CreateToken("{", consumedToken)); - if (X86Operand *Op = ParseOperand()) { - Operands.push_back(Op); + if (std::unique_ptr Op = ParseOperand()) { + Operands.push_back(std::move(Op)); if (!getLexer().is(AsmToken::RCurly)) return !ErrorAndEatStatement(getLexer().getLoc(), "Expected } at this point"); @@ -1682,7 +1679,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl &Operands /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix /// has already been parsed if present. -X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { +std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, + SMLoc MemStart) { // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The @@ -1845,9 +1843,8 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { MemStart, MemEnd); } -bool X86AsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands) { +bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { InstInfo = &Info; StringRef PatchedName = Name; @@ -1940,9 +1937,9 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // Read the operands. while(1) { - if (X86Operand *Op = ParseOperand()) { - Operands.push_back(Op); - if (!HandleAVX512Operand(Operands, *Op)) + if (std::unique_ptr Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!HandleAVX512Operand(Operands, *Operands.back())) return true; } else { Parser.eatToEndOfStatement(); @@ -1973,27 +1970,25 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // documented form in various unofficial manuals, so a lot of code uses it. if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.back(); + X86Operand &Op = (X86Operand &)*Operands.back(); if (Op.isMem() && Op.Mem.SegReg == 0 && isa(Op.Mem.Disp) && cast(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { SMLoc Loc = Op.getEndLoc(); Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); - delete &Op; } } // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". 
if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op = (X86Operand &)*Operands[1]; if (Op.isMem() && Op.Mem.SegReg == 0 && isa(Op.Mem.Disp) && cast(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { SMLoc Loc = Op.getEndLoc(); - Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); - delete &Op; + Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); } } @@ -2060,8 +2055,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.push_back(DefaultMemSIOperand(NameLoc)); } } else if (Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; - X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; if (!doSrcDstMatch(Op, Op2)) return Error(Op.getStartLoc(), "mismatching source and destination index registers"); @@ -2076,10 +2071,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, (Name == "smov" || Name == "smovb" || Name == "smovw" || Name == "smovl" || Name == "smovd" || Name == "smovq"))) { if (Operands.size() == 1) { - if (Name == "movsd") { - delete Operands.back(); + if (Name == "movsd") Operands.back() = X86Operand::CreateToken("movsl", NameLoc); - } if (isParsingIntelSyntax()) { Operands.push_back(DefaultMemDIOperand(NameLoc)); Operands.push_back(DefaultMemSIOperand(NameLoc)); @@ -2088,8 +2081,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.push_back(DefaultMemDIOperand(NameLoc)); } } else if (Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; - X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; if (!doSrcDstMatch(Op, Op2)) return Error(Op.getStartLoc(), "mismatching source and destination index registers"); @@ -2105,31 +2098,26 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.size() == 3) { if (isParsingIntelSyntax()) { // Intel syntax - X86Operand *Op1 = static_cast(Operands[2]); - if (Op1->isImm() && isa(Op1->getImm()) && - cast(Op1->getImm())->getValue() == 1) { - delete Operands[2]; + X86Operand &Op1 = static_cast(*Operands[2]); + if (Op1.isImm() && isa(Op1.getImm()) && + cast(Op1.getImm())->getValue() == 1) Operands.pop_back(); - } } else { - X86Operand *Op1 = static_cast(Operands[1]); - if (Op1->isImm() && isa(Op1->getImm()) && - cast(Op1->getImm())->getValue() == 1) { - delete Operands[1]; + X86Operand &Op1 = static_cast(*Operands[1]); + if (Op1.isImm() && isa(Op1.getImm()) && + cast(Op1.getImm())->getValue() == 1) Operands.erase(Operands.begin() + 1); - } } } // Transforms "int $3" into "int3" as a size optimization. We can't write an // instalias with an immediate operand yet. 
if (Name == "int" && Operands.size() == 2) { - X86Operand *Op1 = static_cast(Operands[1]); - if (Op1->isImm() && isa(Op1->getImm()) && - cast(Op1->getImm())->getValue() == 3) { - delete Operands[1]; + X86Operand &Op1 = static_cast(*Operands[1]); + if (Op1.isImm() && isa(Op1.getImm()) && + cast(Op1.getImm())->getValue() == 3) { Operands.erase(Operands.begin() + 1); - static_cast(Operands[0])->setTokenValue("int3"); + static_cast(*Operands[0]).setTokenValue("int3"); } } @@ -2175,9 +2163,7 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); } -bool X86AsmParser:: -processInstruction(MCInst &Inst, - const SmallVectorImpl &Ops) { +bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); @@ -2258,51 +2244,47 @@ processInstruction(MCInst &Inst, static const char *getSubtargetFeatureName(unsigned Val); -void X86AsmParser::EmitInstruction( - MCInst &Inst, SmallVectorImpl &Operands, - MCStreamer &Out) { +void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, + MCStreamer &Out) { Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII, Out); Out.EmitInstruction(Inst, STI); } -bool X86AsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand *Op = static_cast(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + X86Operand &Op = static_cast(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); ArrayRef EmptyRanges = None; // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. 
- if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" || - Op->getToken() == "fstsww" || Op->getToken() == "fstcww" || - Op->getToken() == "finit" || Op->getToken() == "fsave" || - Op->getToken() == "fstenv" || Op->getToken() == "fclex") { + if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" || + Op.getToken() == "fstsww" || Op.getToken() == "fstcww" || + Op.getToken() == "finit" || Op.getToken() == "fsave" || + Op.getToken() == "fstenv" || Op.getToken() == "fclex") { MCInst Inst; Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); if (!MatchingInlineAsm) EmitInstruction(Inst, Operands, Out); - const char *Repl = - StringSwitch(Op->getToken()) - .Case("finit", "fninit") - .Case("fsave", "fnsave") - .Case("fstcw", "fnstcw") - .Case("fstcww", "fnstcw") - .Case("fstenv", "fnstenv") - .Case("fstsw", "fnstsw") - .Case("fstsww", "fnstsw") - .Case("fclex", "fnclex") - .Default(nullptr); + const char *Repl = StringSwitch(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); assert(Repl && "Unknown wait-prefixed instruction"); - delete Operands[0]; Operands[0] = X86Operand::CreateToken(Repl, IDLoc); } @@ -2355,11 +2337,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // following hack. // Change the operand to point to a temporary token. - StringRef Base = Op->getToken(); + StringRef Base = Op.getToken(); SmallString<16> Tmp; Tmp += Base; Tmp += ' '; - Op->setTokenValue(Tmp.str()); + Op.setTokenValue(Tmp.str()); // If this instruction starts with an 'f', then it is a floating point stack // instruction. These come in up to three forms for 32-bit, 64-bit, and @@ -2400,7 +2382,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ErrorInfoMissingFeature = ErrorInfoIgnore; // Restore the old token. - Op->setTokenValue(Base); + Op.setTokenValue(Base); // If exactly one matched, then we treat that as a successful match (and the // instruction will already have been filled in correctly, since the failing @@ -2450,8 +2432,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { - ArrayRef Ranges = MatchingInlineAsm ? EmptyRanges : - Op->getLocRange(); + ArrayRef Ranges = + MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Ranges, MatchingInlineAsm); } @@ -2462,10 +2444,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "too few operands for instruction", EmptyRanges, MatchingInlineAsm); - X86Operand *Operand = (X86Operand*)Operands[ErrorInfo]; - if (Operand->getStartLoc().isValid()) { - SMRange OperandRange = Operand->getLocRange(); - return Error(Operand->getStartLoc(), "invalid operand for instruction", + X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo]; + if (Operand.getStartLoc().isValid()) { + SMRange OperandRange = Operand.getLocRange(); + return Error(Operand.getStartLoc(), "invalid operand for instruction", OperandRange, MatchingInlineAsm); } } diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index de3be38c1d5a..1bbfc1192447 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -13,6 +13,7 @@ #include "X86AsmParserCommon.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -410,20 +411,19 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); } - static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { + static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); - X86Operand *Res = new X86Operand(Token, Loc, EndLoc); + auto Res = llvm::make_unique(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; } - static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, - bool AddressOf = false, - SMLoc OffsetOfLoc = SMLoc(), - StringRef SymName = StringRef(), - void *OpDecl = nullptr) { - X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); + static std::unique_ptr + CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; @@ -432,17 +432,18 @@ struct X86Operand : public MCParsedAsmOperand { return Res; } - static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ - X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); + static std::unique_ptr CreateImm(const MCExpr *Val, + SMLoc StartLoc, SMLoc EndLoc) { + auto Res = llvm::make_unique(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; return Res; } /// Create an absolute memory operand. - static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = nullptr) { - X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + static std::unique_ptr + CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; @@ -456,12 +457,11 @@ struct X86Operand : public MCParsedAsmOperand { } /// Create a generalized memory operand. 
- static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, - StringRef SymName = StringRef(), - void *OpDecl = nullptr) { + static std::unique_ptr + CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -469,7 +469,7 @@ struct X86Operand : public MCParsedAsmOperand { // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && "Invalid scale!"); - X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + auto Res = llvm::make_unique(Memory, StartLoc, EndLoc); Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index e63036c1a557..5e29e5c359ac 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -197,14 +197,13 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, } } -unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::x86_64) +unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { + if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; - if (TheTriple.isOSDarwin()) + if (TT.isOSDarwin()) return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic; - if (TheTriple.isOSCygMing()) + if (TT.isOSCygMing()) // Unsupported by now, just quick fallback return DWARFFlavour::X86_32_Generic; return DWARFFlavour::X86_32_Generic; @@ -251,8 +250,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitX86MCRegisterInfo(X, RA, - X86_MC::getDwarfRegFlavour(TT, false), - X86_MC::getDwarfRegFlavour(TT, true), + X86_MC::getDwarfRegFlavour(TheTriple, false), + X86_MC::getDwarfRegFlavour(TheTriple, true), RA); X86_MC::InitLLVM2SEHRegisterMapping(X); return X; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 8fe40fd23620..ebe74cfeaddc 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -28,6 +28,7 @@ class MCSubtargetInfo; class MCRelocationInfo; class MCStreamer; class Target; +class Triple; class StringRef; class raw_ostream; @@ -64,7 +65,7 @@ namespace X86_MC { void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); - unsigned getDwarfRegFlavour(StringRef TT, bool isEH); + unsigned getDwarfRegFlavour(Triple TT, bool isEH); void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 18e6845efec0..64e8ea834f47 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -30,9 +30,9 @@ class X86TargetMachine; FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createGlobalBaseRegPass - This pass initializes a global base +/// createX86GlobalBaseRegPass - This pass initializes a global base /// register for PIC on x86-32. 
-FunctionPass* createGlobalBaseRegPass(); +FunctionPass* createX86GlobalBaseRegPass(); /// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses /// to local-dynamic TLS variables so that the TLS base address for the module diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 6912b579b12c..0ac801a9b0a0 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -168,6 +168,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -228,7 +230,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, FeaturePCLMUL, FeatureAES, FeatureCallRegIndirect, FeaturePRFCHW, - FeatureSlowLEA, + FeatureSlowLEA, FeatureSlowIncDec, FeatureSlowBTMem, FeatureFastUAMem]>; // "Arrandale" along with corei3 and corei5 def : ProcessorModel<"corei7", SandyBridgeModel, diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 68e136bd776c..1dca5689adee 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -670,16 +670,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { DLLExportedGlobals.push_back(getSymbol(&Global)); for (const auto &Alias : M.aliases()) { - const GlobalValue *GV = &Alias; - if (!GV->hasDLLExportStorageClass()) + if (!Alias.hasDLLExportStorageClass()) continue; - while (const GlobalAlias *A = dyn_cast(GV)) - GV = A->getAliasee(); - - if (isa(GV)) + if (Alias.getType()->getElementType()->isFunctionTy()) DLLExportedFns.push_back(getSymbol(&Alias)); - else if (isa(GV)) + else DLLExportedGlobals.push_back(getSymbol(&Alias)); } diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 76718d0b6877..b275a9cc3e48 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -1113,9 +1113,14 @@ void Emitter::emitInstruction(MachineInstr &MI, case TargetOpcode::INLINEASM: // We allow inline assembler nodes with empty bodies - they can // implicitly define registers, which is ok for JIT. 
- if (MI.getOperand(0).getSymbolName()[0]) + if (MI.getOperand(0).getSymbolName()[0]) { + DebugLoc DL = MI.getDebugLoc(); + DL.print(MI.getParent()->getParent()->getFunction()->getContext(), + llvm::errs()); report_fatal_error("JIT does not support inline asm!"); + } break; + case TargetOpcode::DBG_VALUE: case TargetOpcode::CFI_INSTRUCTION: break; case TargetOpcode::GC_LABEL: diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 56bcfa30ff90..9557d96d5309 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -16,10 +16,12 @@ #include "X86.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" +#include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -78,12 +80,14 @@ class X86FastISel final : public FastISel { private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); - bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, - bool Aligned = false); - bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, - bool Aligned = false); + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -151,6 +155,84 @@ class X86FastISel final : public FastISel { } // end anonymous namespace. +static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { + // If both operands are the same, then try to optimize or fold the cmp. 
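The comment this line ends on introduces optimizeCmpPredicate, whose switch continues below: when a compare sees the same value on both operands, the predicate folds to a constant or to a pure ordered/unordered check. A small illustration (plain C++, not LLVM code) of the facts the table encodes, namely that x == x is always true for integers, x < x is always false, and for floats x == x and x != x reduce to NaN tests, which is why ICMP_EQ maps to FCMP_TRUE, ICMP_SLT to FCMP_FALSE, FCMP_OEQ to FCMP_ORD and FCMP_UNE to FCMP_UNO:

#include <cassert>
#include <cmath>

int main() {
  double x = std::nan("");
  assert(!(x == x));           // x == x fails exactly when x is NaN: an "ordered" check
  assert(x != x);              // x != x holds exactly when x is NaN: an "unordered" check
  int i = 7;
  assert(i == i && !(i < i));  // integer eq/lt on identical operands fold to true/false outright
  return 0;
}
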
+ CmpInst::Predicate Predicate = CI->getPredicate(); + if (CI->getOperand(0) != CI->getOperand(1)) + return Predicate; + + switch (Predicate) { + default: llvm_unreachable("Invalid predicate!"); + case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break; + + case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break; + } + + return Predicate; +} + +static std::pair +getX86ConditonCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = 
TLI.getValueType(Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) @@ -180,7 +262,7 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - unsigned &ResultReg) { + MachineMemOperand *MMO, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -228,8 +310,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, } ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), ResultReg), AM); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } @@ -237,9 +322,9 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. -bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, - const X86AddressMode &AM, bool Aligned) { +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -249,7 +334,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. @@ -288,13 +374,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, break; } - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM).addReg(ValReg); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM, bool Aligned) { + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa(Val)) Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); @@ -317,10 +408,12 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, } if (Opc) { - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM) - .addImm(Signed ? (uint64_t) CI->getSExtValue() : - CI->getZExtValue()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? 
(uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } } @@ -329,7 +422,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM, Aligned); + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -355,17 +449,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { return false; // Can't handle TLS yet. - if (const GlobalVariable *GVar = dyn_cast(GV)) - if (GVar->isThreadLocal()) - return false; - - // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how - // it works...). - if (const GlobalAlias *GA = dyn_cast(GV)) - if (const GlobalVariable *GVar = - dyn_cast_or_null(GA->getAliasee())) - if (GVar->isThreadLocal()) - return false; + if (GV->isThreadLocal()) + return false; // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the @@ -749,19 +834,24 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (S->isAtomic()) return false; - unsigned SABIAlignment = - DL.getABITypeAlignment(S->getValueOperand()->getType()); - bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); MVT VT; - if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(1), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -896,25 +986,29 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { /// X86SelectLoad - Select and emit code to implement load instructions. /// -bool X86FastISel::X86SelectLoad(const Instruction *I) { +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast(I); + // Atomic loads need special handling. 
- if (cast(I)->isAtomic()) + if (LI->isAtomic()) return false; MVT VT; - if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) return false; + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; unsigned ResultReg = 0; - if (X86FastEmitLoad(VT, AM, ResultReg)) { - UpdateValueMap(I, ResultReg); - return true; - } - return false; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + return false; + + UpdateValueMap(I, ResultReg); + return true; } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { @@ -994,73 +1088,89 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; - unsigned ResultReg = createResultReg(&X86::GR8RegClass); - unsigned SetCCOpc; - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - switch (CI->getPredicate()) { - case CmpInst::FCMP_OEQ: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } - unsigned EReg = createResultReg(&X86::GR8RegClass); - unsigned NPReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::SETNPr), NPReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); + if (ResultReg) { UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_UNE: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
+ static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - unsigned NEReg = createResultReg(&X86::GR8RegClass); - unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg) - .addReg(PReg).addReg(NEReg); + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break; - case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break; - case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; - case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; - case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; - case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break; - case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break; - case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break; - case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break; - default: - return false; - } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditonCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected conditon code."); + unsigned Opc = X86::getSETFromCond(CC); + if (SwapArgs) - std::swap(Op0, Op1); + std::swap(LHS, RHS); - // Emit a compare of Op0/Op1. - if (!X86FastEmitCompare(Op0, Op1, VT)) + // Emit a compare of LHS/RHS. 
+ if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); UpdateValueMap(I, ResultReg); return true; } @@ -1130,69 +1240,84 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + // Try to take advantage of fallthrough opportunities. - CmpInst::Predicate Predicate = CI->getPredicate(); if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); Predicate = CmpInst::getInversePredicate(Predicate); } - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA" - + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. Instead two branch instructions are required to check all + // the flags. First we change the predicate to a supported condition code, + // which will be the first branch. Later on we will emit the second + // branch.
+ bool NeedExtraBranch = false; switch (Predicate) { + default: break; case CmpInst::FCMP_OEQ: - std::swap(TrueMBB, FalseMBB); - Predicate = CmpInst::FCMP_UNE; - // FALL THROUGH - case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break; - case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; - case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break; - case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break; - case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break; - case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break; - case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break; - case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break; - case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break; - case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; - case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break; - case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break; - case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break; - case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; - case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; - case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break; - case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break; - case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break; - case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break; - default: - return false; + std::swap(TrueMBB, FalseMBB); // fall-through + case CmpInst::FCMP_UNE: + NeedExtraBranch = true; + Predicate = CmpInst::FCMP_ONE; + break; } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + X86::CondCode CC; + bool SwapArgs; + unsigned BranchOpc; + std::tie(CC, SwapArgs) = getX86ConditonCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected conditon code."); + + BranchOpc = X86::GetCondBranchFromCond(CC); if (SwapArgs) - std::swap(Op0, Op1); + std::swap(CmpLHS, CmpRHS); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(Op0, Op1, VT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT)) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - if (Predicate == CmpInst::FCMP_UNE) { - // X86 requires a second branch to handle UNE (and OEQ, - // which is mapped to UNE above). + // X86 requires a second branch to handle UNE (and OEQ, which is mapped + // to UNE above). + if (NeedExtraBranch) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4)) .addMBB(TrueMBB); } + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + // Emits an unconditional branch to the FalseBB, obtains the branch + // weight, andd adds it to the successor list. 
FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1224,7 +1349,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); return true; } } @@ -1241,7 +1370,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TrueMBB); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); return true; } @@ -1633,8 +1766,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, } unsigned Reg; - bool RV = X86FastEmitLoad(VT, SrcAM, Reg); - RV &= X86FastEmitStore(VT, Reg, DestAM); + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; @@ -1646,10 +1779,74 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } +static bool isCommutativeIntrinsic(IntrinsicInst const &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + return true; + default: + return false; + } +} + bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::frameaddress: { + Type *RetTy = I.getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getFrameRegister, otherwise we get + // the wrong frame register. + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(TM.getRegisterInfo()); + unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ...
+ unsigned DestReg; + unsigned Depth = cast(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + UpdateValueMap(&I, SrcReg); + return true; + } case Intrinsic::memcpy: { const MemCpyInst &MCI = cast(I); // Don't handle volatile or variable length memcpys. @@ -1726,52 +1923,217 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); return true; } - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // FIXME: Should fold immediates. + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; - // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction. - const Function *Callee = I.getCalledFunction(); - Type *RetTy = - cast(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); + Type *RetTy = I.getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; - const Value *Op1 = I.getArgOperand(0); - const Value *Op2 = I.getArgOperand(1); - unsigned Reg1 = getRegForValue(Op1); - unsigned Reg2 = getRegForValue(Op2); + // Unfortunatelly we can't use FastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. + static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = I.getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); - if (Reg1 == 0 || Reg2 == 0) - // FIXME: Handle values *not* in registers. + if (SrcReg == 0) return false; - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::ADD32rr; - else if (VT == MVT::i64) - OpC = X86::ADD64rr; - else + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + UpdateValueMap(&I, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul folowed by either seto or setb. + const Function *Callee = I.getCalledFunction(); + auto *Ty = cast(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = I.getArgOperand(0); + const Value *RHS = I.getArgOperand(1); + + // Canonicalize immediates to the RHS. 
+ if (isa(LHS) && !isa(RHS) && + isCommutativeIntrinsic(I)) + std::swap(LHS, RHS); + + unsigned BaseOpc, CondOpc; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = ISD::MUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned ResultReg = 0; + // Check if we have an immediate version. + if (auto const *C = dyn_cast(RHS)) { + ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + C->getZExtValue()); + } + + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } - // The call to CreateRegs builds two sequential registers, to store the - // both the returned values. - unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg) - .addReg(Reg1).addReg(Reg2); + // FastISel doesn't have a pattern for X86::MUL*r. Emit it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; - unsigned Opc = X86::SETBr; - if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) - Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), - ResultReg + 1); + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); UpdateValueMap(&I, ResultReg, 2); return true; } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = I.getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = I.getArgOperand(0); + while (auto *IE = dyn_cast(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa(Index)) + break; + unsigned Idx = cast(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + UpdateValueMap(&I, ResultReg); + return true; + } } } @@ -1794,31 +2156,43 @@ bool X86FastISel::FastLowerArguments() { return false; // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. - unsigned Idx = 1; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - if (Idx > 6) - return false; - + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
+ ++Idx; if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; - Type *ArgTy = I->getType(); + Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; EVT ArgVT = TLI.getValueType(ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; case MVT::i32: case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; break; - default: - return false; } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; } static const MCPhysReg GPR32ArgRegs[] = { @@ -1827,24 +2201,33 @@ bool X86FastISel::FastLowerArguments() { static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; - Idx = 0; - const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32); - const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; - const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; - unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(DstReg, getKillRegState(true)); - UpdateValueMap(I, ResultReg); + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + UpdateValueMap(&Arg, ResultReg); } return true; } @@ -2147,7 +2530,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (!X86FastEmitStore(ArgVT, ArgVal, AM)) return false; } else { - if (!X86FastEmitStore(ArgVT, Arg, AM)) + if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM)) return false; } } @@ -2430,7 +2813,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { return 0; } - // Materialize addresses with LEA instructions. + // Materialize addresses with LEA/MOV instructions. if (isa(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { @@ -2440,10 +2823,19 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; - Opc = TLI.getPointerTy() == MVT::i32 ? 
X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement code be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + Opc = X86::MOV64ri; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg).addGlobalAddress(cast(C)); + } else { + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); + } return ResultReg; } return 0; @@ -2544,8 +2936,9 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; - if (!X86SelectAddress(LI->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; const X86InstrInfo &XII = (const X86InstrInfo&)TII; @@ -2553,13 +2946,18 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + SmallVector AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); - if (!Result) return false; + if (!Result) + return false; + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); return true; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 576f39d3c3a3..4be766a19f96 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -32,86 +32,89 @@ using namespace llvm; STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { - class FixupLEAPass : public MachineFunctionPass { - enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; - /// \brief Loop over all of the instructions in the basic block - /// replacing applicable instructions with LEA instructions, - /// where appropriate. - bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); +class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + /// \brief Loop over all of the instructions in the basic block + /// replacing applicable instructions with LEA instructions, + /// where appropriate. + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - const char *getPassName() const override { return "X86 Atom LEA Fixup";} + const char *getPassName() const override { return "X86 Atom LEA Fixup"; } - /// \brief Given a machine register, look for the instruction - /// which writes it in the current basic block. If found, - /// try to replace it with an equivalent LEA instruction. - /// If replacement succeeds, then also process the the newly created - /// instruction. - void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a machine register, look for the instruction + /// which writes it in the current basic block. If found, + /// try to replace it with an equivalent LEA instruction. 
+ /// If replacement succeeds, then also process the the newly created + /// instruction. + void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Given a memory access or LEA instruction - /// whose address mode uses a base and/or index register, look for - /// an opportunity to replace the instruction which sets the base or index - /// register with an equivalent LEA instruction. - void processInstruction(MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a memory access or LEA instruction + /// whose address mode uses a base and/or index register, look for + /// an opportunity to replace the instruction which sets the base or index + /// register with an equivalent LEA instruction. + void processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Given a LEA instruction which is unprofitable - /// on Silvermont try to replace it with an equivalent ADD instruction - void processInstructionForSLM(MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Determine if an instruction references a machine register - /// and, if so, whether it reads or writes the register. - RegUsageState usesRegister(MachineOperand& p, - MachineBasicBlock::iterator I); + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); - /// \brief Step backwards through a basic block, looking - /// for an instruction which writes a register within - /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. - MachineBasicBlock::iterator searchBackwards(MachineOperand& p, - MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief if an instruction can be converted to an - /// equivalent LEA, insert the new instruction into the basic block - /// and return a pointer to it. Otherwise, return zero. - MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI) const; + /// \brief if an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. + MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; - public: - FixupLEAPass() : MachineFunctionPass(ID) {} +public: + FixupLEAPass() : MachineFunctionPass(ID) {} - /// \brief Loop over all of the basic blocks, - /// replacing instructions by equivalent LEA instructions - /// if needed and when possible. - bool runOnMachineFunction(MachineFunction &MF) override; + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible. 
+ bool runOnMachineFunction(MachineFunction &MF) override; - private: - MachineFunction *MF; - const TargetMachine *TM; - const X86InstrInfo *TII; // Machine instruction info. - - }; - char FixupLEAPass::ID = 0; +private: + MachineFunction *MF; + const TargetMachine *TM; + const X86InstrInfo *TII; // Machine instruction info. +}; +char FixupLEAPass::ID = 0; } MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { - MachineInstr* MI = MBBI; - MachineInstr* NewMI; + MachineInstr *MI = MBBI; + MachineInstr *NewMI; switch (MI->getOpcode()) { case X86::MOV32rr: case X86::MOV64rr: { - const MachineOperand& Src = MI->getOperand(1); - const MachineOperand& Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); + const MachineOperand &Dest = MI->getOperand(0); NewMI = BuildMI(*MF, MI->getDebugLoc(), - TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) - .addOperand(Dest) - .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); - MFI->insert(MBBI, NewMI); // Insert the new inst + TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r + : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst return NewMI; } case X86::ADD64ri32: @@ -144,14 +147,16 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, return TII->convertToThreeAddress(MFI, MBBI, nullptr); } -FunctionPass *llvm::createX86FixupLEAs() { - return new FixupLEAPass(); -} +FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; - TM = &MF->getTarget(); - TII = static_cast(TM->getInstrInfo()); + TM = &Func.getTarget(); + const X86Subtarget &ST = TM->getSubtarget(); + if (!ST.LEAusesAG() && !ST.slowLEA()) + return false; + + TII = static_cast(TM->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -162,14 +167,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { return true; } -FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, - MachineBasicBlock::iterator I) { +FixupLEAPass::RegUsageState +FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; - MachineInstr* MI = I; + MachineInstr *MI = I; for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { - MachineOperand& opnd = MI->getOperand(i); - if (opnd.isReg() && opnd.getReg() == p.getReg()){ + MachineOperand &opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()) { if (opnd.isDef()) return RU_Write; RegUsage = RU_Read; @@ -182,23 +187,22 @@ FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, /// block, return a reference to the previous instruction in the block, /// wrapping around to the last instruction of the block if the block /// branches to itself. 
-static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, +static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { if (MFI->isPredecessor(MFI)) { I = --MFI->end(); return true; - } - else + } else return false; } --I; return true; } -MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, - MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI) { +MachineBasicBlock::iterator +FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { int InstrDistance = 1; MachineBasicBlock::iterator CurInst; static const int INSTR_DISTANCE_THRESHOLD = 5; @@ -206,12 +210,12 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, CurInst = I; bool Found; Found = getPreviousInstr(CurInst, MFI); - while( Found && I != CurInst) { + while (Found && I != CurInst) { if (CurInst->isCall() || CurInst->isInlineAsm()) break; if (InstrDistance > INSTR_DISTANCE_THRESHOLD) break; // too far back to make a difference - if (usesRegister(p, CurInst) == RU_Write){ + if (usesRegister(p, CurInst) == RU_Write) { return CurInst; } InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); @@ -220,32 +224,32 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, return nullptr; } -void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. MachineInstr *MI = I; int opcode = MI->getOpcode(); - const MCInstrDesc& Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI->getDesc(); int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); if (AddrOffset >= 0) { AddrOffset += X86II::getOperandBias(Desc); - MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg); if (p.isReg() && p.getReg() != X86::ESP) { seekLEAFixup(p, I, MFI); } - MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg); if (q.isReg() && q.getReg() != X86::ESP) { seekLEAFixup(q, I, MFI); } } } -void FixupLEAPass::seekLEAFixup(MachineOperand& p, - MachineBasicBlock::iterator& I, +void FixupLEAPass::seekLEAFixup(MachineOperand &p, + MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); if (MBI) { - MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); + MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); if (NewMI) { ++NumLEAs; DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); @@ -253,7 +257,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); MFI->erase(MBI); MachineBasicBlock::iterator J = - static_cast (NewMI); + static_cast(NewMI); processInstruction(J, MFI); } } @@ -296,7 +300,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); - MachineInstr *NewMI = 0; + MachineInstr *NewMI = nullptr; const MachineOperand &Dst = MI->getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 
1c1b06623bde..fab0560e3bcd 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -45,7 +45,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || @@ -312,13 +312,14 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86RegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. @@ -400,8 +401,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86RegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo(); bool needsFrameMoves = MMI.hasDebugInfo() || @@ -409,6 +411,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); + const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool IsWin64 = STI.isTargetWin64(); @@ -719,12 +722,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86RegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); + const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool UseLEA = STI.useLeaForSP(); @@ -979,13 +984,15 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - - unsigned SlotSize = STI.is64Bit() ? 
8 : 4; + const X86RegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); unsigned FPReg = TRI->getFrameRegister(MF); unsigned CalleeFrameSize = 0; const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; @@ -1035,6 +1042,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget(); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -1065,9 +1073,10 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, void X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { + RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86RegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo(); @@ -1160,8 +1169,9 @@ void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); uint64_t StackSize; + const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool Is64Bit = STI.is64Bit(); unsigned TlsReg, TlsOffset; DebugLoc DL; @@ -1176,6 +1186,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD()) report_fatal_error("Segmented stacks not supported on this platform."); + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); X86MachineFunctionInfo *X86FI = MF.getInfo(); @@ -1200,11 +1219,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.push_front(allocMBB); MF.push_front(checkMBB); - // Eventually StackSize will be calculated by a link-time pass; which will - // also decide whether checking code needs to be injected into this particular - // prologue. - StackSize = MFI->getStackSize(); - // When the frame size is less than 256 we just compare the stack // boundary directly to the value of the stack pointer, per gcc. 
bool CompareStackPointer = StackSize < kSplitStackAvailable; @@ -1364,9 +1378,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize(); + const unsigned SlotSize = + static_cast(MF.getTarget().getRegisterInfo()) + ->getSlotSize(); + const X86Subtarget &STI = MF.getTarget().getSubtarget(); const bool Is64Bit = STI.is64Bit(); DebugLoc DL; // HiPE-specific values @@ -1495,12 +1512,14 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); - const X86RegisterInfo &RegInfo = *TM.getRegisterInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = + *static_cast(MF.getTarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); bool reseveCallFrame = hasReservedCallFrame(MF); int Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; @@ -1518,7 +1537,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + MF.getTarget().getFrameLowering()->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; MachineInstr *New = nullptr; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 208bb8bd6a34..5c43c1488d4a 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,6 @@ #ifndef X86_FRAMELOWERING_H #define X86_FRAMELOWERING_H -#include "X86Subtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -23,15 +22,9 @@ class MCSymbol; class X86TargetMachine; class X86FrameLowering : public TargetFrameLowering { - const X86TargetMachine &TM; - const X86Subtarget &STI; public: - explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti) - : TargetFrameLowering(StackGrowsDown, - sti.getStackAlignment(), - (sti.is64Bit() ? -8 : -4)), - TM(tm), STI(sti) { - } + explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) + : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 03c9620db4a2..74386d33990d 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -162,6 +162,13 @@ namespace { return "X86 DAG->DAG Instruction Selection"; } + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. 
+ Subtarget = &TM.getSubtarget(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + void EmitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 72743a97abd9..851607eac96e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -178,29 +178,28 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { - const X86Subtarget *Subtarget = &TM.getSubtarget(); - bool is64Bit = Subtarget->is64Bit(); - - if (Subtarget->isTargetMacho()) { - if (is64Bit) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::x86_64) return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } - if (Subtarget->isTargetLinux()) + if (TT.isOSLinux()) return new X86LinuxTargetObjectFile(); - if (Subtarget->isTargetELF()) + if (TT.isOSBinFormatELF()) return new TargetLoweringObjectFileELF(); - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) return new X86WindowsTargetObjectFile(); - if (Subtarget->isTargetCOFF()) + if (TT.isOSBinFormatCOFF()) return new TargetLoweringObjectFileCOFF(); llvm_unreachable("unknown subtarget type"); } +// FIXME: This should stop caching the target machine as soon as +// we can remove resetOperationActions et al. X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); @@ -443,7 +442,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); @@ -575,7 +580,7 @@ void X86TargetLowering::resetOperationActions() { // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; - setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } @@ -596,7 +601,7 @@ void X86TargetLowering::resetOperationActions() { } if (Subtarget->hasCmpxchg16b()) { - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags @@ -861,6 +866,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, 
VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction(VT, @@ -1040,6 +1046,8 @@ void X86TargetLowering::resetOperationActions() { setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { @@ -1431,6 +1439,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + } + // Custom lower several nodes. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1561,6 +1574,7 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1723,7 +1737,7 @@ const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ - assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && + assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. @@ -1822,7 +1836,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1842,7 +1856,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo(); SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2014,7 +2028,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -2164,8 +2178,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, - getTargetMachine().Options.GuaranteedTailCallOpt); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2222,7 +2236,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. 
SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2386,7 +2400,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, TotalNumXMMRegs = 0; if (IsWin64) { - const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -2585,7 +2599,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2600,7 +2614,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (getTargetMachine().Options.GuaranteedTailCallOpt && + else if (MF.getTarget().Options.GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); @@ -2647,7 +2661,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2838,7 +2852,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { + if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit @@ -2862,7 +2876,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_ && + DAG.getTarget().getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && @@ -2904,7 +2918,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // On ELF targets, in either X86-64 or X86-32 mode, direct calls to // external symbols should go through the PLT. if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (!Subtarget->getTargetTriple().isMacOSX() || @@ -2943,7 +2957,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2967,7 +2981,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, - getTargetMachine().Options.GuaranteedTailCallOpt)) + DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && @@ -3138,7 +3152,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); - if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -3150,7 +3164,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3179,7 +3193,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3200,7 +3214,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -3214,12 +3228,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!CCMatch) { SmallVector RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + DAG.getTarget(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + DAG.getTarget(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -3246,7 +3260,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // argument is passed on the stack. 
SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3263,7 +3277,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); + static_cast(DAG.getTarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3286,12 +3300,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!Subtarget->is64Bit() && ((!isa(Callee) && !isa(Callee)) || - getTargetMachine().getRelocationModel() == Reloc::PIC_)) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = - (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3415,7 +3429,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3965,14 +3979,22 @@ static bool isINSERTPSMask(ArrayRef Mask, MVT VT) { unsigned CorrectPosV1 = 0; unsigned CorrectPosV2 = 0; - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { + if (Mask[i] == -1) { + ++CorrectPosV1; + ++CorrectPosV2; + continue; + } + if (Mask[i] == i) ++CorrectPosV1; else if (Mask[i] == i + 4) ++CorrectPosV2; + } if (CorrectPosV1 == 3 || CorrectPosV2 == 3) - // We have 3 elements from one vector, and one from another. + // We have 3 elements (undefs count as elements from any vector) from one + // vector, and one from another. return true; return false; @@ -6040,6 +6062,229 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, dl, VT, Select); } +/// \brief Return true if \p N implements a horizontal binop and return the +/// operands for the horizontal binop into V0 and V1. +/// +/// This is a helper function of PerformBUILD_VECTORCombine. +/// This function checks that the build_vector \p N in input implements a +/// horizontal operation. Parameter \p Opcode defines the kind of horizontal +/// operation to match. +/// For example, if \p Opcode is equal to ISD::ADD, then this function +/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode +/// is equal to ISD::SUB, then this function checks if this is a horizontal +/// arithmetic sub. +/// +/// This function only analyzes elements of \p N whose indices are +/// in range [BaseIdx, LastIdx). 
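For readers less familiar with the x86 horizontal ops, this is the scalar pattern isHorizontalBinOp is after, shown as a standalone illustration in plain C++ (not LLVM code): each lane of the low half of the result is A[2i] + A[2i+1], each lane of the high half is B[2i] + B[2i+1].

    #include <array>
    #include <cstdio>

    int main() {
      std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40}, R;
      for (int i = 0; i < 2; ++i) {
        R[i]     = A[2 * i] + A[2 * i + 1]; // pairs extracted from the first vector
        R[i + 2] = B[2 * i] + B[2 * i + 1]; // pairs extracted from the second vector
      }
      for (float F : R)
        std::printf("%g ", F);              // prints: 3 7 30 70
      std::printf("\n");
      return 0;
    }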
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, + unsigned BaseIdx, unsigned LastIdx, + SDValue &V0, SDValue &V1) { + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); + assert(N->getValueType(0).isVector() && + N->getValueType(0).getVectorNumElements() >= LastIdx && + "Invalid Vector in input!"); + + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); + bool CanFold = true; + unsigned ExpectedVExtractIdx = BaseIdx; + unsigned NumElts = LastIdx - BaseIdx; + + // Check if N implements a horizontal binop. + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { + SDValue Op = N->getOperand(i + BaseIdx); + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); + + if (!CanFold) + break; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) + CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op0.getOperand(0) == Op1.getOperand(0) && + isa(Op0.getOperand(1)) && + isa(Op1.getOperand(1))); + if (!CanFold) + break; + + unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); + unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); + + if (i == 0) + V0 = Op0.getOperand(0); + else if (i * 2 == NumElts) { + V1 = Op0.getOperand(0); + ExpectedVExtractIdx = BaseIdx; + } + + SDValue Expected = (i * 2 < NumElts) ? V0 : V1; + if (I0 == ExpectedVExtractIdx) + CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; + else if (IsCommutable && I1 == ExpectedVExtractIdx) { + // Try to match the following dag sequence: + // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) + CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; + } else + CanFold = false; + + ExpectedVExtractIdx += 2; + } + + return CanFold; +} + +/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by +/// a concat_vector. +/// +/// This is a helper function of PerformBUILD_VECTORCombine. +/// This function expects two 256-bit vectors called V0 and V1. +/// At first, each vector is split into two separate 128-bit vectors. +/// Then, the resulting 128-bit vectors are used to implement two +/// horizontal binary operations. +/// +/// The kind of horizontal binary operation is defined by \p X86Opcode. +/// +/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to +/// the two new horizontal binop. +/// When Mode is set, the first horizontal binop dag node would take as input +/// the lower 128-bit of V0 and the upper 128-bit of V0. The second +/// horizontal binop dag node would take as input the lower 128-bit of V1 +/// and the upper 128-bit of V1. +/// Example: +/// HADD V0_LO, V0_HI +/// HADD V1_LO, V1_HI +/// +/// Otherwise, the first horizontal binop dag node takes as input the lower +/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop +/// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. 
+/// Example: +/// HADD V0_LO, V1_LO +/// HADD V0_HI, V1_HI +static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, + SDLoc DL, SelectionDAG &DAG, + unsigned X86Opcode, bool Mode) { + EVT VT = V0.getValueType(); + assert(VT.is256BitVector() && VT == V1.getValueType() && + "Invalid nodes in input!"); + + unsigned NumElts = VT.getVectorNumElements(); + SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); + SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); + SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); + SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); + EVT NewVT = V0_LO.getValueType(); + + SDValue LO, HI; + if (Mode) { + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + } else { + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_HI, V1_HI); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); +} + +static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + BuildVectorSDNode *BV = cast(N); + SDValue InVec0, InVec1; + + // Try to match horizontal ADD/SUB. + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { + // Try to match an SSE3 float HADD/HSUB. + if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { + // Try to match an SSSE3 integer HADD/HSUB. + if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); + } + + if (!Subtarget->hasAVX()) + return SDValue(); + + if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { + // Try to match an AVX horizontal add/sub of packed single/double + // precision floating point values from 256-bit vectors. + SDValue InVec2, InVec3; + if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts/2, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, NumElts/2, NumElts, InVec2, InVec3) && + InVec0.getNode() == InVec2.getNode() && + InVec1.getNode() == InVec3.getNode()) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts/2, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, NumElts/2, NumElts, InVec2, InVec3) && + InVec0.getNode() == InVec2.getNode() && + InVec1.getNode() == InVec3.getNode()) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { + // Try to match an AVX2 horizontal add/sub of signed integers. 
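The 256-bit forms operate per 128-bit lane, which is why the AVX paths here match the two halves of the BUILD_VECTOR separately and then insist on InVec0 == InVec2 and InVec1 == InVec3, and why ExpandHorizontalBinOp with Mode unset pairs V0_LO with V1_LO and V0_HI with V1_HI, per its comment above. A scalar model of that lane behaviour:

    #include <array>
    #include <cstdio>

    int main() {
      std::array<float, 8> A, B, R;
      for (int i = 0; i < 8; ++i) { A[i] = i; B[i] = 100 + i; }
      for (int Lane = 0; Lane < 2; ++Lane) {
        int O = Lane * 4;                 // first element of this 128-bit lane
        R[O + 0] = A[O + 0] + A[O + 1];
        R[O + 1] = A[O + 2] + A[O + 3];
        R[O + 2] = B[O + 0] + B[O + 1];
        R[O + 3] = B[O + 2] + B[O + 3];
      }
      for (float F : R)
        std::printf("%g ", F);            // prints: 1 5 201 205 9 13 209 213
      std::printf("\n");
      return 0;
    }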
+ SDValue InVec2, InVec3; + unsigned X86Opcode; + bool CanFold = true; + + if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts/2, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, NumElts/2, NumElts, InVec2, InVec3) && + InVec0.getNode() == InVec2.getNode() && + InVec1.getNode() == InVec3.getNode()) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts/2, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, NumElts/2, NumElts, InVec2, InVec3) && + InVec0.getNode() == InVec2.getNode() && + InVec1.getNode() == InVec3.getNode()) + X86Opcode = X86ISD::HSUB; + else + CanFold = false; + + if (CanFold) { + // Fold this build_vector into a single horizontal add/sub. + // Do this only if the target has AVX2. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + + // Convert this build_vector into a pair of horizontal binop followed by + // a concat vector. + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false); + } + } + + if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || + VT == MVT::v16i16) && Subtarget->hasAVX()) { + unsigned X86Opcode; + if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HSUB; + else if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHADD; + else if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHSUB; + else + return SDValue(); + + // Convert this build_vector into two horizontal add/sub followed by + // a concat vector. + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true); + } + + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -6427,38 +6672,30 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return LowerAVXCONCAT_VECTORS(Op, DAG); } -// Try to lower a shuffle node into a simple blend instruction. -static SDValue -LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - MVT VT = SVOp->getSimpleValueType(0); +static bool isBlendMask(ArrayRef MaskVals, MVT VT, bool hasSSE41, + bool hasInt256, unsigned *MaskOut = nullptr) { MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); // There is no blend with immediate in AVX-512. if (VT.is512BitVector()) - return SDValue(); + return false; - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) - return SDValue(); + if (!hasSSE41 || EltVT == MVT::i8) + return false; + if (!hasInt256 && VT == MVT::v16i16) + return false; - // Check the mask for BLEND and build the value. unsigned MaskValue = 0; + unsigned NumElems = VT.getVectorNumElements(); // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems-1)/8 + 1; + unsigned NumLanes = (NumElems - 1) / 8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { - int SndLaneEltIdx = (NumLanes == 2) ? - SVOp->getMaskElt(i + NumElemsInLane) : -1; - int EltIdx = SVOp->getMaskElt(i); + int SndLaneEltIdx = (NumLanes == 2) ? 
MaskVals[i + NumElemsInLane] : -1; + int EltIdx = MaskVals[i]; if ((EltIdx < 0 || EltIdx == (int)i) && (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) @@ -6467,11 +6704,34 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, if (((unsigned)EltIdx == (i + NumElems)) && (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) - MaskValue |= (1<getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), + Subtarget->hasInt256() && "Trying to lower a " + "VECTOR_SHUFFLE to a Blend but " + "with the wrong mask")); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + SDLoc dl(SVOp); + unsigned NumElems = VT.getVectorNumElements(); + // Convert i32 vectors to floating point if it is not AVX2. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. MVT BlendVT = VT; @@ -7448,8 +7708,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, assert((VT == MVT::v4f32 || VT == MVT::v4i32) && "unsupported vector type for insertps/pinsrd"); - int FromV1 = std::count_if(Mask.begin(), Mask.end(), - [](const int &i) { return i < 4; }); + auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; + auto FromV2Predicate = [](const int &i) { return i >= 4; }; + int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); SDValue From; SDValue To; @@ -7457,15 +7718,17 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, if (FromV1 == 1) { From = V1; To = V2; - DestIndex = std::find_if(Mask.begin(), Mask.end(), - [](const int &i) { return i < 4; }) - + DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - Mask.begin(); } else { + assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && + "More than one element from V1 and from V2, or no elements from one " + "of the vectors. This case should not have returned true from " + "isINSERTPSMask"); From = V2; To = V1; - DestIndex = std::find_if(Mask.begin(), Mask.end(), - [](const int &i) { return i >= 4; }) - - Mask.begin(); + DestIndex = + std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } if (MayFoldLoad(From)) { @@ -7908,9 +8171,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); - if (BlendOp.getNode()) - return BlendOp; + unsigned MaskValue; + if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), + &MaskValue)) + return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); @@ -7980,7 +8244,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// This function assumes its argument is a BUILD_VECTOR of constand or +// This function assumes its argument is a BUILD_VECTOR of constants or // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is // true. static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, @@ -8004,9 +8268,13 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, Lane2Cond = !isZero(SndLaneEltCond); if (Lane1Cond == Lane2Cond || Lane2Cond < 0) - MaskValue |= !!Lane1Cond << i; + // Lane1Cond != 0, means we want the first argument. + // Lane1Cond == 0, means we want the second argument. 
+ // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition. + MaskValue |= !Lane1Cond << i; else if (Lane1Cond < 0) - MaskValue |= !!Lane2Cond << i; + MaskValue |= !Lane2Cond << i; else return false; } @@ -8524,7 +8792,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -8557,7 +8825,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -8590,7 +8858,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) { @@ -8611,7 +8879,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. - if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, @@ -8633,7 +8901,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference(); - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); @@ -8662,8 +8930,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. 
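Returning to the blend-mask helpers a little further up (isBlendMask and BUILD_VECTORtoBlendMask): the immediate they build follows the hardware encoding in which bit i selects the source for lane i, 0 meaning the first vector and 1 the second. A rough standalone equivalent of that computation, ignoring the 256-bit two-lane symmetry handling:

    #include <cstdio>
    #include <vector>

    // Bit i of the blend immediate: 0 = lane i from V1 (or undef), 1 = lane i
    // from V2 (shuffle mask value i + NumElems); anything else is not a blend.
    static int blendImmediate(const std::vector<int> &Mask) {
      int NumElems = (int)Mask.size(), Imm = 0;
      for (int i = 0; i < NumElems; ++i) {
        if (Mask[i] < 0 || Mask[i] == i) {
          // lane comes from the first source (or is undef): bit stays 0
        } else if (Mask[i] == i + NumElems) {
          Imm |= 1 << i;                  // lane comes from the second source
        } else {
          return -1;                      // not expressible as a blend
        }
      }
      return Imm;
    }

    int main() {
      // v4f32 shuffle <0, 5, 2, 7>: lanes 1 and 3 are taken from V2.
      std::printf("0x%x\n", blendImmediate({0, 5, 2, 7})); // prints: 0xa
      return 0;
    }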
unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - CodeModel::Model M = getTargetMachine().getCodeModel(); + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); + CodeModel::Model M = DAG.getTarget().getCodeModel(); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { @@ -8862,7 +9130,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { - TLSModel::Model model = getTargetMachine().getTLSModel(GV); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -8874,9 +9142,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit(), - getTargetMachine().getRelocationModel() == Reloc::PIC_); + return LowerToTLSExecModel( + GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), + DAG.getTarget().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } @@ -8889,8 +9157,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. - bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && - !Subtarget->is64Bit(); + bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else @@ -8939,10 +9207,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast(GV)) - GV = GA->getAliasee(); SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); @@ -10048,10 +10312,27 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: - case X86::COND_O: case X86::COND_NO: - NeedOF = true; + case X86::COND_O: case X86::COND_NO: { + // Check if we really need to set the + // Overflow flag. If NoSignedWrap is present + // that is not actually needed. + switch (Op->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SHL: { + const BinaryWithFlagsSDNode *BinNode = + cast(Op.getNode()); + if (BinNode->hasNoSignedWrap()) + break; + } + default: + NeedOF = true; + break; + } break; } + } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. @@ -10113,14 +10394,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1) { + if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. 
- if (C->getAPIntValue().isAllOnesValue()) { + if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -10136,7 +10417,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. - if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -11467,8 +11748,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, dl, DAG); + X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; + CC = DAG.getConstant(X86Cond, MVT::i8); + Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -11511,7 +11793,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -11570,7 +11852,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -11679,7 +11961,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!getTargetMachine().Options.UseSoftFloat && + assert(!DAG.getTarget().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->getAttributes() .hasAttribute(AttributeSet::FunctionIndex, @@ -12824,7 +13106,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal; - if (dyn_cast (Hint) == 0 || + if (dyn_cast (Hint) == nullptr || (HintVal = dyn_cast (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); unsigned Opcode = (HintVal ? 
Intr.Opc1 : Intr.Opc0); @@ -12871,7 +13153,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -12893,7 +13175,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -12922,7 +13204,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -12934,7 +13216,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(DAG.getTarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -12981,7 +13263,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -14252,7 +14534,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, - Op.getOperand(2), SDValue()); + Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), @@ -14262,9 +14544,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); + SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); - return cpOut; + SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, + MVT::i32, cpOut.getValue(2)); + SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + + DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); + return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, @@ -14272,19 +14563,31 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT == MVT::v2i32) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || 
SrcVT == MVT::v8i8) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); + SDValue InVec = Op->getOperand(0); SDLoc dl(Op); - SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - Op->getOperand(0), DAG.getIntPtrConstant(0)); - SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - Op->getOperand(0), DAG.getIntPtrConstant(1)); - SDValue Elts[] = {Elt0, Elt1, Elt0, Elt0}; - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Elts); + unsigned NumElts = SrcVT.getVectorNumElements(); + EVT SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + SmallVector Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, + DAG.getIntPtrConstant(i))); + + // Explicitly mark the extra elements as Undef. + SDValue Undef = DAG.getUNDEF(SVT); + for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) + Elts.push_back(Undef); + + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, DAG.getIntPtrConstant(0)); @@ -14432,7 +14735,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -14514,8 +14818,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl &Results, - SelectionDAG &DAG) { + SmallVectorImpl &Results, + SelectionDAG &DAG) { SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); @@ -14524,15 +14828,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, // (The only way to get a 16-byte load is cmpxchg16b) // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 
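The comment above is the whole trick behind ReplaceATOMIC_LOAD: a compare-exchange against zero either stores zero over an existing zero (a no-op) or fails and hands back the current value, and the separate success bit is exactly what the new ATOMIC_CMP_SWAP_WITH_SUCCESS node models. The same idea in portable C++; whether this actually becomes cmpxchg16b depends on the target and flags such as -mcx16, and it may fall back to libatomic:

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    // 16-byte payload: too wide for a single plain load on x86-64.
    struct alignas(16) Pair { std::uint64_t Lo, Hi; };

    static Pair atomicLoadViaCas(std::atomic<Pair> &A) {
      Pair Expected{0, 0};
      // cmpxchg(addr, 0, 0): succeeds (and harmlessly rewrites 0) only if the
      // stored value was 0; otherwise it fails and leaves that value in Expected.
      bool WasZero = A.compare_exchange_strong(Expected, Pair{0, 0});
      (void)WasZero;                      // the "success" result of the cmpxchg
      return Expected;
    }

    int main() {
      std::atomic<Pair> A(Pair{1, 2});
      Pair P = atomicLoadViaCas(A);
      std::printf("%llu %llu\n", (unsigned long long)P.Lo,
                  (unsigned long long)P.Hi);   // prints: 1 2
      return 0;
    }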
SDValue Zero = DAG.getConstant(0, VT); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, - Node->getOperand(0), - Node->getOperand(1), Zero, Zero, - cast(Node)->getMemOperand(), - cast(Node)->getOrdering(), - cast(Node)->getOrdering(), - cast(Node)->getSynchScope()); + SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); + SDValue Swap = + DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, + Node->getOperand(0), Node->getOperand(1), Zero, Zero, + cast(Node)->getMemOperand(), + cast(Node)->getOrdering(), + cast(Node)->getOrdering(), + cast(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(1)); + Results.push_back(Swap.getValue(2)); } static void @@ -14648,7 +14953,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } - case ISD::ATOMIC_CMP_SWAP: { + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; @@ -14690,8 +14995,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); - Results.push_back(cpOutH.getValue(1)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_LOAD_ADD: @@ -14754,17 +15068,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0)->getValueType(0); - if (SrcVT == MVT::f64 && DstVT == MVT::v2i32) { - SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - MVT::v2f64, N->getOperand(0)); - SDValue ToV4I32 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Expanded); - SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - ToV4I32, DAG.getIntPtrConstant(0)); - SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - ToV4I32, DAG.getIntPtrConstant(1)); - SDValue Elts[] = {Elt0, Elt1}; - Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Elts)); - } + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + + SmallVector Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, + ToVecInt, DAG.getIntPtrConstant(i))); + + Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); } } } @@ -14790,6 +15110,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return 
"X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; @@ -14843,6 +15164,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; + case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; @@ -15153,7 +15475,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256())); + isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || + isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); } bool @@ -15378,10 +15701,10 @@ static unsigned getPseudoCMOVOpc(EVT VT) { MachineBasicBlock * X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -15539,7 +15862,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, .addReg(t4); } else { // Promote i8 to i32 to use CMOV32 - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo(); const TargetRegisterClass *RC32 = TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); unsigned SrcReg32 = MRI.createVirtualRegister(RC32); @@ -15651,10 +15974,10 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -16048,7 +16371,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -16304,7 +16627,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. 
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -16387,7 +16710,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -16413,7 +16736,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -16454,9 +16777,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, bool Is64Bit) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); @@ -16526,7 +16849,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = - getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -16574,8 +16897,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, - MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(!Subtarget->isTargetMacho()); @@ -16631,10 +16954,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. + MachineFunction *F = BB->getParent(); const X86InstrInfo *TII - = static_cast(getTargetMachine().getInstrInfo()); + = static_cast(F->getTarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *F = BB->getParent(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); @@ -16643,7 +16966,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. 
const uint32_t *RegMask = - getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -16655,7 +16978,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); - } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) @@ -16687,9 +17010,8 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -16751,8 +17073,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); - Reloc::Model RM = getTargetMachine().getRelocationModel(); - bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + Reloc::Model RM = MF->getTarget().getRelocationModel(); + bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); // Prepare IP either in reg or imm. @@ -16796,7 +17118,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(MF->getTarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -16825,9 +17147,8 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -16843,7 +17164,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = - static_cast(getTargetMachine().getRegisterInfo()); + static_cast(MF->getTarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -17018,12 +17339,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *F = BB->getParent(); + const TargetInstrInfo *TII = F->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. 
- MachineFunction *F = BB->getParent(); int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); @@ -17103,7 +17424,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -17116,15 +17437,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); + return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); + return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // Atomic Lowering. case X86::ATOMAND8: @@ -17458,6 +17779,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc dl(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // Don't create instructions with illegal types after legalize types has run. @@ -17470,6 +17793,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. + // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. 
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && + N0.getOpcode() == ISD::BITCAST) { + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD : + case ISD::FADD : + case ISD::SUB : + case ISD::FSUB : + case ISD::MUL : + case ISD::FMUL : + CanFold = true; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); + } + } + } + // Only handle 128 wide vector from here on. if (!VT.is128BitVector()) return SDValue(); @@ -20622,6 +20996,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); case X86ISD::INSERTPS: return PerformINSERTPSCombine(N, DAG, Subtarget); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); } return SDValue(); @@ -21126,8 +21501,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. - if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, - getTargetMachine()))) + if (isGlobalStubReference( + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), @@ -21405,3 +21780,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, return AM.Scale != 0; return -1; } + +bool X86TargetLowering::isTargetFTOL() const { + return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 9f51b53a00c4..af2e4344e5b6 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -15,13 +15,13 @@ #ifndef X86ISELLOWERING_H #define X86ISELLOWERING_H -#include "X86Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" namespace llvm { + class X86Subtarget; class X86TargetMachine; namespace X86ISD { @@ -766,9 +766,7 @@ namespace llvm { /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine /// for fptoui. - bool isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); - } + bool isTargetFTOL() const; /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be /// used for fptoui to the given type. 
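A concrete integer instance of the fold implemented just above (illustrative only; it is not one of the patch's test cases). Take VT = v4i32 and SVT = v2i64, so the guard SVT.getVectorNumElements() * 2 == NumElts holds, and a mask whose first half is <0, 2> and whose second half is undef passes the per-element checks:

    (v4i32 vector_shuffle (bitcast (v2i64 add A, B)), undef, <0, 2, u, u>)
      --> (v4i32 vector_shuffle (v4i32 add (bitcast A), (bitcast B)), undef, <0, 2, u, u>)

For ADD, SUB, and MUL this is sound because the kept lanes are exactly the low 32 bits of each 64-bit lane, and carries in those operations never propagate downward; opcodes outside the switch above are left untouched.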
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 37bcc5235e4f..7cac5ebbece7 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -517,10 +517,12 @@ def rr : AVX512XS8I, EVEX; } +let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, VK16, v16i32, v16i1>, EVEX_V512; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +} //===----------------------------------------------------------------------===// // AVX-512 - VPERM @@ -1787,6 +1789,46 @@ def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; +//===----------------------------------------------------------------------===// +// AVX-512 - Non-temporals +//===----------------------------------------------------------------------===// + +def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, + (int_x86_avx512_movntdqa addr:$src))]>, + EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// Prefer non-temporal over temporal versions +let AddedComplexity = 400, SchedRW = [WriteStore] in { + +def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), + (ins f512mem:$dst, VR512:$src), + "vmovntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v16f32 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + +def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs), + (ins f512mem:$dst, VR512:$src), + "vmovntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f64 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + +def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8i64 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; +} + //===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic // @@ -3161,6 +3203,10 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; +def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; + def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>; @@ -4343,6 +4389,37 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, (VPCONFLICTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; +let Predicates = [HasCDI] in { +defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, + i512mem, i32mem, "{1to16}">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + + +defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, + i512mem, i64mem, "{1to8}">, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +} + +def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, + GR16:$mask), + (VPLZCNTDrrk VR512:$src1, + (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; + +def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, + GR8:$mask), + (VPLZCNTQrrk VR512:$src1, + (v8i1 
(COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; + +def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))), + (VPLZCNTDrm addr:$src)>; +def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), + (VPLZCNTDrr VR512:$src)>; +def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))), + (VPLZCNTQrm addr:$src)>; +def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), + (VPLZCNTQrr VR512:$src)>; + def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 34d8fb9e1817..9b3dce52f72e 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1696,20 +1696,34 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; // Increment reg. -def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>; -def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; +// Do not make INC if it is slow +def : Pat<(add GR8:$src, 1), + (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>; +def : Pat<(add GR16:$src, 1), + (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR16:$src, 1), + (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR32:$src, 1), + (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR32:$src, 1), + (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR64:$src, 1), + (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>; // Decrement reg. -def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>; -def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +// Do not make DEC if it is slow +def : Pat<(add GR8:$src, -1), + (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>; +def : Pat<(add GR16:$src, -1), + (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR16:$src, -1), + (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR32:$src, -1), + (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR32:$src, -1), + (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR64:$src, -1), + (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>; // or reg/reg. def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index efb5c70c8108..bfc8e2759dcb 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -97,14 +98,11 @@ struct X86OpTblEntry { // Pin the vtable to this file. 
void X86InstrInfo::anchor() {} -X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) - : X86GenInstrInfo((tm.getSubtarget().is64Bit() - ? X86::ADJCALLSTACKDOWN64 - : X86::ADJCALLSTACKDOWN32), - (tm.getSubtarget().is64Bit() - ? X86::ADJCALLSTACKUP64 - : X86::ADJCALLSTACKUP32)), - TM(tm), RI(tm) { +X86InstrInfo::X86InstrInfo(X86Subtarget &STI) + : X86GenInstrInfo( + (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), + (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + Subtarget(STI), RI(STI) { static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, @@ -1472,7 +1470,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - if (!TM.getSubtarget().is64Bit()) + if (!Subtarget.is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; @@ -1950,7 +1948,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); unsigned Opc, leaInReg; - if (TM.getSubtarget().is64Bit()) { + if (Subtarget.is64Bit()) { Opc = X86::LEA64_32r; leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); } else { @@ -2006,7 +2004,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // just a single insert_subreg. addRegReg(MIB, leaInReg, true, leaInReg, false); } else { - if (TM.getSubtarget().is64Bit()) + if (Subtarget.is64Bit()) leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); else leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); @@ -2076,13 +2074,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // we have better subtarget support, enable the 16-bit LEA generation here. // 16-bit LEA is also slow on Core2. bool DisableLEA16 = true; - bool is64Bit = TM.getSubtarget().is64Bit(); + bool is64Bit = Subtarget.is64Bit(); unsigned MIOpc = MI->getOpcode(); switch (MIOpc) { case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!TM.getSubtarget().hasSSE2()) return nullptr; + if (!Subtarget.hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); @@ -2094,7 +2092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHUFPDrri: { assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!TM.getSubtarget().hasSSE2()) return nullptr; + if (!Subtarget.hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); @@ -2672,8 +2670,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { /// getSETFromCond - Return a set opcode for the given condition and /// whether it has memory operand. -static unsigned getSETFromCond(X86::CondCode CC, - bool HasMemoryOperand) { +unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { static const uint16_t Opc[16][2] = { { X86::SETAr, X86::SETAm }, { X86::SETAEr, X86::SETAEm }, @@ -2693,7 +2690,7 @@ static unsigned getSETFromCond(X86::CondCode CC, { X86::SETSr, X86::SETSm } }; - assert(CC < 16 && "Can only handle standard cond codes"); + assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); return Opc[CC][HasMemoryOperand ? 
1 : 0]; } @@ -2976,7 +2973,7 @@ canInsertSelect(const MachineBasicBlock &MBB, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. - if (!TM.getSubtarget().hasCMov()) + if (!Subtarget.hasCMov()) return false; if (Cond.size() != 1) return false; @@ -3027,8 +3024,7 @@ static bool isHReg(unsigned Reg) { // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, - const X86Subtarget& Subtarget) { - + const X86Subtarget &Subtarget) { // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) @@ -3107,8 +3103,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. - bool HasAVX = TM.getSubtarget().hasAVX(); - bool HasAVX512 = TM.getSubtarget().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; @@ -3120,7 +3116,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if ((isHReg(DestReg) || isHReg(SrcReg)) && - TM.getSubtarget().is64Bit()) { + Subtarget.is64Bit()) { Opc = X86::MOV8rr_NOREX; // Both operands must be encodable without an REX prefix. assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && @@ -3137,7 +3133,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; if (!Opc) - Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget()); + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) @@ -3183,9 +3179,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, - const TargetMachine &TM, + const X86Subtarget &STI, bool load) { - if (TM.getSubtarget().hasAVX512()) { + if (STI.hasAVX512()) { if (X86::VK8RegClass.hasSubClassEq(RC) || X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; @@ -3197,13 +3193,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } - bool HasAVX = TM.getSubtarget().hasAVX(); + bool HasAVX = STI.hasAVX(); switch (RC->getSize()) { default: llvm_unreachable("Unknown spill size"); case 1: assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); - if (TM.getSubtarget().is64Bit()) + if (STI.is64Bit()) // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. 
if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) @@ -3270,16 +3266,16 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, static unsigned getStoreRegOpcode(unsigned SrcReg, const TargetRegisterClass *RC, bool isStackAligned, - TargetMachine &TM) { - return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false); + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false); } static unsigned getLoadRegOpcode(unsigned DestReg, const TargetRegisterClass *RC, bool isStackAligned, - const TargetMachine &TM) { - return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true); + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -3291,9 +3287,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max(RC->getSize(), 16); - bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + bool isAligned = + (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) .addReg(SrcReg, getKillRegState(isKill)); @@ -3309,7 +3306,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); for (unsigned i = 0, e = Addr.size(); i != e; ++i) @@ -3327,9 +3324,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(RC->getSize(), 16); - bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + bool isAligned = + (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); } @@ -3343,7 +3341,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); for (unsigned i = 0, e = Addr.size(); i != e; ++i) @@ -3741,7 +3739,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, continue; // EFLAGS is used by this instruction. - X86::CondCode OldCC; + X86::CondCode OldCC = X86::COND_INVALID; bool OpcIsSET = false; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. 
@@ -3964,7 +3962,7 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, } bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - bool HasAVX = TM.getSubtarget().hasAVX(); + bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); switch (MI->getOpcode()) { case X86::MOV32r0: @@ -4075,7 +4073,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned Size, unsigned Align) const { const DenseMap > *OpcodeTablePtr = nullptr; - bool isCallRegIndirect = TM.getSubtarget().callRegIndirect(); + bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; // Atom favors register form of call. So, we do not fold loads into calls @@ -4316,7 +4314,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = TM.getSubtarget().hasAVX(); + bool HasAVX = Subtarget.hasAVX(); unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); @@ -4352,7 +4350,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) - Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment()); + Alignment = std::min( + Alignment, MF.getTarget().getFrameLowering()->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4453,14 +4452,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. - if (TM.getCodeModel() != CodeModel::Small && - TM.getCodeModel() != CodeModel::Kernel) + if (MF.getTarget().getCodeModel() != CodeModel::Small && + MF.getTarget().getCodeModel() != CodeModel::Kernel) return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; - if (TM.getRelocationModel() == Reloc::PIC_) { - if (TM.getSubtarget().is64Bit()) + if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { + if (Subtarget.is64Bit()) PICBase = X86::RIP; else // FIXME: PICBase = getGlobalBaseReg(&MF); @@ -4600,7 +4599,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !TM.getSubtarget().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. @@ -4748,13 +4747,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !TM.getSubtarget().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned load. return false; unsigned Alignment = RC->getSize() == 32 ? 
32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); @@ -4791,15 +4790,15 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !TM.getSubtarget().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned store. return false; unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; - SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, - isAligned, TM), - dl, MVT::Other, AddrOps); + SDNode *Store = + DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), + dl, MVT::Other, AddrOps); NewNodes.push_back(Store); // Preserve memory reference information. @@ -4960,7 +4959,7 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, default: // XMM registers. In 64-bit mode we can be a bit more aggressive since we // have 16 of them to play with. - if (TM.getSubtargetImpl()->is64Bit()) { + if (Subtarget.is64Bit()) { if (NumLoads >= 3) return false; } else if (NumLoads) { @@ -4986,7 +4985,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. - if (!TM.getSubtarget().hasAVX()) + if (!Subtarget.hasAVX()) return false; enum { @@ -5168,7 +5167,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { /// TODO: Eliminate this and move the code to X86MachineFunctionInfo. 
/// unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { - assert(!TM.getSubtarget().is64Bit() && + assert(!Subtarget.is64Bit() && "X86-64 PIC uses RIP relative addressing"); X86MachineFunctionInfo *X86FI = MF->getInfo(); @@ -5271,7 +5270,7 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { std::pair X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - bool hasAVX2 = TM.getSubtarget().hasAVX2(); + bool hasAVX2 = Subtarget.hasAVX2(); uint16_t validDomains = 0; if (domain && lookup(MI->getOpcode(), domain)) validDomains = 0xe; @@ -5286,7 +5285,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { assert(dom && "Not an SSE instruction"); const uint16_t *table = lookup(MI->getOpcode(), dom); if (!table) { // try the other table - assert((TM.getSubtarget().hasAVX2() || Domain < 3) && + assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); table = lookupAVX2(MI->getOpcode(), dom); } @@ -5299,6 +5298,16 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } +void X86InstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + Branch.setOpcode(X86::JMP_4); + Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); +} + +void X86InstrInfo::getTrap(MCInst &MI) const { + MI.setOpcode(X86::TRAP); +} + bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; @@ -5395,8 +5404,10 @@ namespace { const X86TargetMachine *TM = static_cast(&MF.getTarget()); - assert(!TM->getSubtarget().is64Bit() && - "X86-64 PIC uses RIP relative addressing"); + // Don't do anything if this is 64-bit as 64-bit PIC + // uses RIP relative addressing. + if (TM->getSubtarget().is64Bit()) + return false; // Only emit a global base reg in PIC mode. if (TM->getRelocationModel() != Reloc::PIC_) @@ -5451,7 +5462,7 @@ namespace { char CGBR::ID = 0; FunctionPass* -llvm::createGlobalBaseRegPass() { return new CGBR(); } +llvm::createX86GlobalBaseRegPass() { return new CGBR(); } namespace { struct LDTLSCleanup : public MachineFunctionPass { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5f3491536d8a..d76c52ce47df 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -24,7 +24,7 @@ namespace llvm { class X86RegisterInfo; - class X86TargetMachine; + class X86Subtarget; namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in @@ -46,6 +46,7 @@ namespace X86 { COND_O = 13, COND_P = 14, COND_S = 15, + LAST_VALID_COND = COND_S, // Artificial condition codes. These are used by AnalyzeBranch // to indicate a block terminated with two conditional branches to @@ -61,12 +62,16 @@ namespace X86 { // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); + /// \brief Return a set opcode for the given condition and whether it has + /// a memory operand. + unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + // Turn CMov opcode into condition code. CondCode getCondFromCMovOpc(unsigned Opc); /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. 
- CondCode GetOppositeBranchCondition(X86::CondCode CC); + CondCode GetOppositeBranchCondition(CondCode CC); } // end namespace X86; @@ -129,7 +134,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) { } class X86InstrInfo final : public X86GenInstrInfo { - X86TargetMachine &TM; + X86Subtarget &Subtarget; const X86RegisterInfo RI; /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, @@ -156,7 +161,7 @@ class X86InstrInfo final : public X86GenInstrInfo { virtual void anchor(); public: - explicit X86InstrInfo(X86TargetMachine &tm); + explicit X86InstrInfo(X86Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -396,6 +401,12 @@ class X86InstrInfo final : public X86GenInstrInfo { const SmallVectorImpl &MOs, unsigned Size, unsigned Alignment) const; + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const InstrItineraryData *ItinData, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0d97669b2259..5d34c326ace6 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -795,6 +795,7 @@ def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; +def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. 
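The header hunk above promotes getSETFromCond to a public helper in the X86 namespace (with HasMemoryOperand defaulted to false) and adds LAST_VALID_COND so the table lookup in X86InstrInfo.cpp can assert a proper bound. A hypothetical caller, only to show the intended shape of use (this function is not part of the patch):

    #include "X86InstrInfo.h"
    #include <cassert>
    using namespace llvm;

    // Pick the SETcc opcode that materializes a condition into a byte register.
    // The table is indexed by CondCode, so COND_E maps to X86::SETEr here.
    static unsigned setccOpcodeFor(X86::CondCode CC) {
      assert(CC <= X86::LAST_VALID_COND && "artificial cond codes have no SETcc");
      return X86::getSETFromCond(CC); // HasMemoryOperand defaults to false
    }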
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1eb04851b721..043b2f32c6ff 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7969,6 +7969,16 @@ class avx_broadcast opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; +class avx_broadcast_no_int opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : + AVX8I, + Sched<[Sched]>, VEX { + let mayLoad = 1; +} + // AVX2 adds register forms class avx2_broadcast_reg opc, string OpcodeStr, RegisterClass RC, Intrinsic Int, SchedWrite Sched> : @@ -7977,16 +7987,15 @@ class avx2_broadcast_reg opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcast_ss, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256, - WriteFShuffleLd>, VEX_L; + def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256, - WriteFShuffleLd>, VEX_L; +def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, loadf64, WriteFShuffleLd>, VEX_L; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256, WriteFShuffleLd>, VEX_L; @@ -8543,13 +8552,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), } let Predicates = [HasAVX] in { -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. let AddedComplexity = 20 in { diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index e969ef2cf3d7..a082c4f8b0ef 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -432,7 +432,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { // SSE Callback should be called for SSE-enabled LLVM. 
return X86CompilationCallback_SSE; #else - if (Subtarget->hasSSE1()) + if (useSSE) return X86CompilationCallback_SSE; #endif #endif @@ -440,8 +440,8 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { return X86CompilationCallback; } -X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) { - Subtarget = &TM.getSubtarget(); +X86JITInfo::X86JITInfo(bool UseSSE) { + useSSE = UseSSE; useGOT = 0; TLSOffset = nullptr; } diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h index 4d279de3425f..564343ffa3fa 100644 --- a/lib/Target/X86/X86JITInfo.h +++ b/lib/Target/X86/X86JITInfo.h @@ -19,16 +19,14 @@ #include "llvm/Target/TargetJITInfo.h" namespace llvm { - class X86TargetMachine; class X86Subtarget; class X86JITInfo : public TargetJITInfo { - X86TargetMachine &TM; - const X86Subtarget *Subtarget; uintptr_t PICBase; - char* TLSOffset; + char *TLSOffset; + bool useSSE; public: - explicit X86JITInfo(X86TargetMachine &tm); + explicit X86JITInfo(bool UseSSE); /// replaceMachineCodeForFunction - Make it so that calling the function /// whose machine code is at OLD turns into a call to NEW, perhaps by diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index 84521ccee481..6639875d07e3 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -17,6 +17,7 @@ #include "X86.h" #include "X86InstrInfo.h" +#include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -101,6 +102,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { } TM = &MF.getTarget(); + if (!TM->getSubtarget().padShortFunctions()) + return false; + TII = TM->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index a83e1e487b82..236e1a4d949a 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -53,20 +53,18 @@ static cl::opt EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) - : X86GenRegisterInfo((tm.getSubtarget().is64Bit() - ? X86::RIP : X86::EIP), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), - (tm.getSubtarget().is64Bit() - ? X86::RIP : X86::EIP)), - TM(tm) { +X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) + : X86GenRegisterInfo( + (STI.is64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), + X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), + (STI.is64Bit() ? X86::RIP : X86::EIP)), + Subtarget(STI) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. 
- const X86Subtarget *Subtarget = &TM.getSubtarget(); - Is64Bit = Subtarget->is64Bit(); - IsWin64 = Subtarget->isTargetWin64(); + Is64Bit = Subtarget.is64Bit(); + IsWin64 = Subtarget.isTargetWin64(); if (Is64Bit) { SlotSize = 8; @@ -173,9 +171,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ } const TargetRegisterClass * -X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { - const X86Subtarget &Subtarget = TM.getSubtarget(); +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -225,7 +222,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: - return TM.getSubtarget().is64Bit() ? 10 : 4; + return Subtarget.is64Bit() ? 10 : 4; case X86::VR64RegClassID: return 4; } @@ -233,8 +230,8 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - bool HasAVX = TM.getSubtarget().hasAVX(); - bool HasAVX512 = TM.getSubtarget().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { @@ -287,8 +284,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - bool HasAVX = TM.getSubtarget().hasAVX(); - bool HasAVX512 = TM.getSubtarget().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); switch (CC) { case CallingConv::GHC: @@ -406,7 +403,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } } - if (!Is64Bit || !TM.getSubtarget().hasAVX512()) { + if (!Is64Bit || !Subtarget.hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); @@ -459,7 +456,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 2289d91960c8..ba346c85a7f9 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -22,11 +22,11 @@ namespace llvm { class Type; class TargetInstrInfo; - class X86TargetMachine; + class X86Subtarget; class X86RegisterInfo final : public X86GenRegisterInfo { public: - X86TargetMachine &TM; + const X86Subtarget &Subtarget; private: /// Is64Bit - Is the target 64-bits. 
@@ -55,7 +55,7 @@ class X86RegisterInfo final : public X86GenRegisterInfo { unsigned BasePtr; public: - X86RegisterInfo(X86TargetMachine &tm); + X86RegisterInfo(const X86Subtarget &STI); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 744890d50c83..b6ecdf8ecf63 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -11,21 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "X86TargetMachine.h" +#include "X86InstrInfo.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86SelectionDAGInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" -X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : - TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()), - TLI(*TM.getTargetLowering()) { -} +X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} -X86SelectionDAGInfo::~X86SelectionDAGInfo() { -} +X86SelectionDAGInfo::~X86SelectionDAGInfo() {} SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, @@ -35,6 +37,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, bool isVolatile, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast(Size); + const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget(); // If to a segment-relative address space, use the default lowering. if (DstPtrInfo.getAddrSpace() >= 256) @@ -43,16 +46,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // If not DWORD aligned or size is more than the threshold, call the library. // The libc version is likely to be faster for these cases. It can use the // address value and run time information about the CPU. - if ((Align & 3) != 0 || - !ConstantSize || - ConstantSize->getZExtValue() > - Subtarget->getMaxInlineSizeThreshold()) { + if ((Align & 3) != 0 || !ConstantSize || + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast(Src); if (const char *bzeroEntry = V && - V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) { - EVT IntPtr = TLI.getPointerTy(); + V->isNullValue() ? 
Subtarget.getBZeroEntry() : nullptr) { + EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -68,7 +69,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) .setDiscardResult(); - std::pair CallResult = TLI.LowerCallTo(CLI); + std::pair CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); return CallResult.second; } @@ -99,7 +100,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ValReg = X86::EAX; Val = (Val << 8) | Val; Val = (Val << 16) | Val; - if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned AVT = MVT::i64; ValReg = X86::RAX; Val = (Val << 32) | Val; @@ -128,13 +129,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, InFlag = Chain.getValue(1); } - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -182,10 +181,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast(Size); + const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget(); if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); /// If not DWORD aligned, it is more efficient to call the library. However @@ -218,7 +218,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, AVT = MVT::i32; else // QWORD aligned - AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; unsigned UBytes = AVT.getSizeInBits() / 8; unsigned CountVal = SizeVal / UBytes; @@ -226,15 +226,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : X86::ESI, Src, InFlag); InFlag = Chain.getValue(1); diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index 0d5dc38ae898..c12555a59617 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -23,14 +23,8 @@ class X86TargetMachine; class X86Subtarget; class X86SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can - /// make the right decision when generating code for different targets. - const X86Subtarget *Subtarget; - - const X86TargetLowering &TLI; - public: - explicit X86SelectionDAGInfo(const X86TargetMachine &TM); + explicit X86SelectionDAGInfo(const DataLayout &DL); ~X86SelectionDAGInfo(); SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index b94bd712ff72..79b7e68c320b 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -16,6 +16,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" @@ -35,6 +36,13 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "X86GenSubtargetInfo.inc" +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt +X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, + cl::desc("Enable early if-conversion on X86")); + + /// ClassifyBlockAddressReference - Classify a blockaddress reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. @@ -283,13 +291,60 @@ void X86Subtarget::initializeEnvironment() { CallRegIndirect = false; LEAUsesAG = false; SlowLEA = false; + SlowIncDec = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; } +static std::string computeDataLayout(const X86Subtarget &ST) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); + // X86 and x32 have 32 bit pointers. + if (ST.isTarget64BitILP32() || !ST.is64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (ST.isTargetNaCl()) + ; // No f80 + else if (ST.is64Bit() || ST.isTargetDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (ST.is64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. 
+ if (!ST.is64Bit() && ST.isOSWindows()) + Ret += "-S32"; + else + Ret += "-S128"; + + return Ret; +} + +X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + return *this; +} + X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, unsigned StackAlignOverride) + const std::string &FS, X86TargetMachine &TM, + unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), StackAlignOverride(StackAlignOverride), @@ -297,10 +352,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16) { - initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); -} + TargetTriple.getEnvironment() == Triple::CODE16), + DL(computeDataLayout(*this)), TSInfo(DL), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), + is64Bit() ? -8 : -4), + JITInfo(hasSSE1()) {} bool X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, @@ -310,3 +367,8 @@ X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, CriticalPathRCs.clear(); return PostRAScheduler && OptLevel >= CodeGenOpt::Default; } + +bool +X86Subtarget::enableEarlyIfConversion() const { + return hasCMov() && X86EarlyIfConv; +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 8ec680efff83..09db0ebc5a9b 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -14,6 +14,11 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86SelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -40,6 +45,7 @@ enum Style { } class X86Subtarget final : public X86GenSubtargetInfo { + protected: enum X86SSEEnum { NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F @@ -181,6 +187,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// SlowLEA - True if the LEA instruction with certain arguments is slow bool SlowLEA; + /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags + bool SlowIncDec; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -217,14 +226,31 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; + // Calculates type size & alignment + const DataLayout DL; + X86SelectionDAGInfo TSInfo; + // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which + // X86TargetLowering needs. + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86FrameLowering FrameLowering; + X86JITInfo JITInfo; + public: /// This constructor initializes the data members to match that /// of the specified triple. 
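For reference, walking the relocated computeDataLayout above for two common ELF configurations (illustrative, and assuming DataLayout::getManglingComponent yields "-m:e" for ELF triples; the Windows and Darwin branches differ as shown in the code):

    x86_64-unknown-linux-gnu : e-m:e-i64:64-f80:128-n8:16:32:64-S128
    i686-unknown-linux-gnu   : e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128

Note that the moved copy also replaces the old isTargetCygMing()/isTargetKnownWindowsMSVC() checks with isOSWindows(); the original in X86TargetMachine.cpp is deleted in a later hunk of this patch.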
/// X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, + const std::string &FS, X86TargetMachine &TM, unsigned StackAlignOverride); + const X86TargetLowering *getTargetLowering() const { return &TLInfo; } + const X86InstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const X86FrameLowering *getFrameLowering() const { return &FrameLowering; } + const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + X86JITInfo *getJITInfo() { return &JITInfo; } + /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. @@ -241,6 +267,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// \brief Reset the features for the X86 target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: + /// \brief Initialize the full set of dependencies so we can use an initializer + /// list for X86Subtarget. + X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); public: @@ -319,6 +348,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } + bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } @@ -430,6 +460,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool postRAScheduler() const { return PostRAScheduler; } + bool enableEarlyIfConversion() const override; + /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index d0449f42320c..74ce31a15d5c 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -29,61 +29,14 @@ extern "C" void LLVMInitializeX86Target() { void X86TargetMachine::anchor() { } -static std::string computeDataLayout(const X86Subtarget &ST) { - // X86 is little endian - std::string Ret = "e"; - - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); - // X86 and x32 have 32 bit pointers. - if (ST.isTarget64BitILP32() || !ST.is64Bit()) - Ret += "-p:32:32"; - - // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - if (ST.is64Bit() || ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC() || - ST.isTargetNaCl()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // Some ABIs align long double to 128 bits, others to 32. - if (ST.isTargetNaCl()) - ; // No f80 - else if (ST.is64Bit() || ST.isTargetDarwin()) - Ret += "-f80:128"; - else - Ret += "-f80:32"; - - // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (ST.is64Bit()) - Ret += "-n8:16:32:64"; - else - Ret += "-n8:16:32"; - - // The stack is aligned to 32 bits on some ABIs and 128 bits on others. - if (!ST.is64Bit() && (ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC())) - Ret += "-S32"; - else - Ret += "-S128"; - - return Ret; -} - /// X86TargetMachine ctor - Create an X86 target. 
/// -X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, Options.StackAlignmentOverride), - FrameLowering(*this, Subtarget), - InstrItins(Subtarget.getInstrItineraryData()), - DL(computeDataLayout(*getSubtargetImpl())), - InstrInfo(*this), - TLInfo(*this), - TSInfo(*this), - JITInfo(*this) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // Determine the PICStyle based on the target selected. if (getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. @@ -126,12 +79,6 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::desc("Minimize AVX to SSE transition penalty"), cl::init(true)); -// Temporary option to control early if-conversion for x86 while adding machine -// models. -static cl::opt -X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, - cl::desc("Enable early if-conversion on X86")); - //===----------------------------------------------------------------------===// // X86 Analysis Pass Setup //===----------------------------------------------------------------------===// @@ -184,19 +131,14 @@ bool X86PassConfig::addInstSelector() { if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); - // For 32-bit, prepend instructions to set the "global base reg" for PIC. - if (!getX86Subtarget().is64Bit()) - addPass(createGlobalBaseRegPass()); + addPass(createX86GlobalBaseRegPass()); return false; } bool X86PassConfig::addILPOpts() { - if (X86EarlyIfConv && getX86Subtarget().hasCMov()) { - addPass(&EarlyIfConverterID); - return true; - } - return false; + addPass(&EarlyIfConverterID); + return true; } bool X86PassConfig::addPreRegAlloc() { @@ -215,19 +157,13 @@ bool X86PassConfig::addPreEmitPass() { ShouldPrint = true; } - if (getX86Subtarget().hasAVX() && UseVZeroUpper) { + if (UseVZeroUpper) { addPass(createX86IssueVZeroUpperPass()); ShouldPrint = true; } - if (getOptLevel() != CodeGenOpt::None && - getX86Subtarget().padShortFunctions()) { + if (getOptLevel() != CodeGenOpt::None) { addPass(createX86PadShortFunctions()); - ShouldPrint = true; - } - if (getOptLevel() != CodeGenOpt::None && - (getX86Subtarget().LEAusesAG() || - getX86Subtarget().slowLEA())){ addPass(createX86FixupLEAs()); ShouldPrint = true; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 57e6eda6bc15..87ddaf42a71e 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -18,7 +18,6 @@ #include "X86ISelLowering.h" #include "X86InstrInfo.h" #include "X86JITInfo.h" -#include "X86SelectionDAGInfo.h" #include "X86Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -30,13 +29,6 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { virtual void anchor(); X86Subtarget Subtarget; - X86FrameLowering FrameLowering; - InstrItineraryData InstrItins; - const DataLayout DL; // Calculates type size & alignment - X86InstrInfo InstrInfo; - X86TargetLowering TLInfo; - X86SelectionDAGInfo TSInfo; - X86JITInfo JITInfo; public: X86TargetMachine(const Target &T, StringRef TT, @@ 
-44,28 +36,28 @@ class X86TargetMachine final : public LLVMTargetMachine { Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - const DataLayout *getDataLayout() const override { return &DL; } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } const X86InstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - X86JITInfo *getJITInfo() override { - return &JITInfo; + return getSubtargetImpl()->getFrameLowering(); } + X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } const X86TargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const X86SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } const X86RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } /// \brief Register X86 analysis passes with a pass manager. diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 69f34a16b6f7..299f9a581b8d 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; #define DEBUG_TYPE "x86tti" // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. namespace llvm { void initializeX86TTIPass(PassRegistry &); @@ -102,6 +102,8 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo { unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const override; + unsigned getIntImmCost(int64_t) const; + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, @@ -808,6 +810,19 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise); } +/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// method might only calculate a fraction of a larger immediate. Therefore it +/// is valid to return a cost of ZERO. +unsigned X86TTI::getIntImmCost(int64_t Val) const { + if (Val == 0) + return TCC_Free; + + if (isInt<32>(Val)) + return TCC_Basic; + + return 2 * TCC_Basic; +} + unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); @@ -825,11 +840,21 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { if (Imm == 0) return TCC_Free; - if (Imm.getBitWidth() <= 64 && - (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue()))) - return TCC_Basic; - else - return 2 * TCC_Basic; + // Sign-extend all constants to a multiple of 64-bit. + APInt ImmVal = Imm; + if (BitSize & 0x3f) + ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + + // Split the constant into 64-bit chunks and calculate the cost for each + // chunk. 
+ unsigned Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { + APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); + int64_t Val = Tmp.getSExtValue(); + Cost += getIntImmCost(Val); + } + // We need at least one instruction to materialze the constant. + return std::max(1U, Cost); } unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, @@ -889,9 +914,13 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, break; } - if ((Idx == ImmIdx) && - Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) - return TCC_Free; + if (Idx == ImmIdx) { + unsigned NumConstants = (BitSize + 63) / 64; + unsigned Cost = X86TTI::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TCC_Basic) + ? static_cast(TCC_Free) + : Cost; + } return X86TTI::getIntImmCost(Imm, Ty); } diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 945ea3e88179..0bb5f990cae7 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -247,7 +247,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// vzero upper instructions before function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { - if (MF.getTarget().getSubtarget().hasAVX512()) + const X86Subtarget &ST = MF.getTarget().getSubtarget(); + if (!ST.hasAVX() || ST.hasAVX512()) return false; TII = MF.getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 638d8701fad1..5e763aead97a 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -92,15 +92,12 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) // XCore does not have the NodeTypes below. 
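The chunked immediate costing above can be mirrored with plain integers to see how a wide constant is priced. This is a simplified sketch, not the APInt-based implementation, and the TCC_* values are assumptions used only for illustration.

#include <algorithm>
#include <cstdint>
#include <iostream>

enum { TCC_Free = 0, TCC_Basic = 1 };

static unsigned chunkCost(int64_t Val) {
  if (Val == 0) return TCC_Free;                               // nothing to materialize
  if (Val >= INT32_MIN && Val <= INT32_MAX) return TCC_Basic;  // fits a 32-bit immediate
  return 2 * TCC_Basic;                                        // needs a wider sequence
}

int main() {
  // A 128-bit constant, sign-extended and split into two 64-bit chunks.
  const int64_t Chunks[2] = {0x1234, 0};  // cheap low chunk, zero high chunk
  unsigned Cost = 0;
  for (int64_t C : Chunks) Cost += chunkCost(C);
  std::cout << std::max(1u, Cost) << "\n";  // at least one instruction: prints 1
}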
setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::ADDC, MVT::i32, Expand); setOperationAction(ISD::ADDE, MVT::i32, Expand); setOperationAction(ISD::SUBC, MVT::i32, Expand); setOperationAction(ISD::SUBE, MVT::i32, Expand); - // Stop the combiner recombining select and set_cc - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - // 64bit setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SUB, MVT::i64, Custom); @@ -217,7 +214,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::SMUL_LOHI: return LowerSMUL_LOHI(Op, DAG); @@ -258,35 +254,21 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N, // Misc Lower Operation implementation //===----------------------------------------------------------------------===// -SDValue XCoreTargetLowering:: -LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const -{ - SDLoc dl(Op); - SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2), - Op.getOperand(3), Op.getOperand(4)); - return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0), - Op.getOperand(1)); -} - -SDValue XCoreTargetLowering:: -getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV, - SelectionDAG &DAG) const -{ +SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA, + const GlobalValue *GV, + SelectionDAG &DAG) const { // FIXME there is no actual debug info here SDLoc dl(GA); - const GlobalValue *UnderlyingGV = GV; - // If GV is an alias then use the aliasee to determine the wrapper type - if (const GlobalAlias *GA = dyn_cast(GV)) - UnderlyingGV = GA->getAliasee(); - if (const GlobalVariable *GVar = dyn_cast(UnderlyingGV)) { - if ( ( GVar->isConstant() && - UnderlyingGV->isLocalLinkage(GV->getLinkage()) ) - || ( GVar->hasSection() && - StringRef(GVar->getSection()).startswith(".cp.") ) ) - return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA); - return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA); - } - return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA); + + if (GV->getType()->getElementType()->isFunctionTy()) + return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA); + + const auto *GVar = dyn_cast(GV); + if ((GV->hasSection() && StringRef(GV->getSection()).startswith(".cp.")) || + (GVar && GVar->isConstant() && GV->hasLocalLinkage())) + return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA); + + return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA); } static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) { diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index d28715b7178f..81d91875100b 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -157,7 +157,6 @@ namespace llvm { SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG 
&DAG) const; SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index 5a6bbe7b1d3e..b72c520d84a5 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -17,8 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "xcore-selectiondag-info" XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} + : TargetSelectionDAGInfo(TM.getDataLayout()) {} XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() { } diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp index 079be1fc0abc..80d193d1c26e 100644 --- a/lib/Target/XCore/XCoreTargetTransformInfo.cpp +++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; #define DEBUG_TYPE "xcoretti" // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. namespace llvm { void initializeXCoreTTIPass(PassRegistry &); diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index fed8839e6f0f..8174df9ec069 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -449,14 +449,29 @@ determinePointerReadAttrs(Argument *A, case Instruction::Call: case Instruction::Invoke: { + bool Captures = true; + + if (I->getType()->isVoidTy()) + Captures = false; + + auto AddUsersToWorklistIfCapturing = [&] { + if (Captures) + for (Use &UU : I->uses()) + if (Visited.insert(&UU)) + Worklist.push_back(&UU); + }; + CallSite CS(I); - if (CS.doesNotAccessMemory()) + if (CS.doesNotAccessMemory()) { + AddUsersToWorklistIfCapturing(); continue; + } Function *F = CS.getCalledFunction(); if (!F) { if (CS.onlyReadsMemory()) { IsRead = true; + AddUsersToWorklistIfCapturing(); continue; } return Attribute::None; @@ -471,6 +486,7 @@ determinePointerReadAttrs(Argument *A, "More params than args in non-varargs call."); return Attribute::None; } + Captures &= !CS.doesNotCapture(A - B); if (SCCNodes.count(AI)) continue; if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B)) @@ -479,6 +495,7 @@ determinePointerReadAttrs(Argument *A, IsRead = true; } } + AddUsersToWorklistIfCapturing(); break; } diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 9decddcff80d..57481e1e429a 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -62,7 +62,7 @@ static bool isEmptyFunction(Function *F) { if (Entry.size() != 1 || !isa(Entry.front())) return false; ReturnInst &RI = cast(Entry.front()); - return RI.getReturnValue() == NULL; + return RI.getReturnValue() == nullptr; } char GlobalDCE::ID = 0; diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 10b20cfc917f..9087ab23bb70 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -301,6 +301,13 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { return thres; } +static void emitAnalysis(CallSite CS, const Twine &Msg) { + Function *Caller = CS.getCaller(); + LLVMContext &Ctx = Caller->getContext(); + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg); +} + /// shouldInline - Return true if 
the inliner should attempt to inline /// at the given CallSite. bool Inliner::shouldInline(CallSite CS) { @@ -309,12 +316,16 @@ bool Inliner::shouldInline(CallSite CS) { if (IC.isAlways()) { DEBUG(dbgs() << " Inlining: cost=always" << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) + + " should always be inlined (cost=always)"); return true; } if (IC.isNever()) { DEBUG(dbgs() << " NOT Inlining: cost=never" << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() + + " should never be inlined (cost=never)")); return false; } @@ -323,6 +334,10 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() + + " too costly to inline (cost=") + + Twine(IC.getCost()) + ", threshold=" + + Twine(IC.getCostDelta() + IC.getCost()) + ")"); return false; } @@ -390,6 +405,11 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); + emitAnalysis( + CS, Twine("Not inlining. Cost of inlining " + + CS.getCalledFunction()->getName() + + " increases the cost of inlining " + + CS.getCaller()->getName() + " in other contexts")); return false; } } @@ -397,6 +417,10 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << '\n'); + emitAnalysis( + CS, CS.getCalledFunction()->getName() + Twine(" can be inlined into ") + + CS.getCaller()->getName() + " with cost=" + Twine(IC.getCost()) + + " (threshold=" + Twine(IC.getCostDelta() + IC.getCost()) + ")"); return true; } @@ -518,24 +542,35 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) continue; - - // If the policy determines that we should inline this function, - // try to do so. - if (!shouldInline(CS)) - continue; + LLVMContext &CallerCtx = Caller->getContext(); // Get DebugLoc to report. CS will be invalid after Inliner. DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + // If the policy determines that we should inline this function, + // try to do so. + if (!shouldInline(CS)) { + emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + + " will not be inlined into " + + Caller->getName())); + continue; + } + // Attempt to inline the function. if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, - InlineHistoryID, InsertLifetime, DL)) + InlineHistoryID, InsertLifetime, DL)) { + emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + + " will not be inlined into " + + Caller->getName())); continue; + } ++NumInlined; // Report the inline decision. 
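The emitAnalysis helper and the emitOptimizationRemark/emitOptimizationRemarkMissed calls added above make inline decisions visible as optimization remarks rather than only as DEBUG output. They are typically surfaced with the remark options of the era, e.g. clang's -Rpass=inline / -Rpass-missed=inline or opt's -pass-remarks=inline (flag names assumed, not taken from this patch). A trivial source sketch that should produce an "inlined into" remark under optimization:

// Assumed invocation: clang++ -O2 -Rpass=inline -Rpass-missed=inline example.cpp -c
static int square(int x) { return x * x; }

int caller(int v) {
  return square(v) + 1;  // expected remark: "square inlined into caller"
}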
- Caller->getContext().emitOptimizationRemark( - DEBUG_TYPE, *Caller, DLoc, + emitOptimizationRemark( + CallerCtx, DEBUG_TYPE, *Caller, DLoc, Twine(Callee->getName() + " inlined into " + Caller->getName())); // If inlining this function gave us any new call sites, throw them diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index c3a2b1205c1a..c9b4af769136 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -847,6 +847,9 @@ int FunctionComparator::cmpOperation(const Instruction *L, if (int Res = cmpNumbers(CXI->isVolatile(), cast(R)->isVolatile())) return Res; + if (int Res = cmpNumbers(CXI->isWeak(), + cast(R)->isWeak())) + return Res; if (int Res = cmpNumbers(CXI->getSuccessOrdering(), cast(R)->getSuccessOrdering())) return Res; diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 10c51ab553a7..c20c717de5e7 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -53,6 +53,10 @@ static cl::opt RunLoopRerolling("reroll-loops", cl::Hidden, cl::desc("Run the loop rerolling pass")); +static cl::opt RunLoadCombine("combine-loads", cl::init(false), + cl::Hidden, + cl::desc("Run the load combining pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -65,6 +69,7 @@ PassManagerBuilder::PassManagerBuilder() { SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; + LoadCombine = RunLoadCombine; } PassManagerBuilder::~PassManagerBuilder() { @@ -157,6 +162,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createDeadArgEliminationPass()); // Dead argument elimination MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } @@ -183,6 +189,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Combine silly seq's + addExtensionsToPM(EP_Peephole, MPM); if (!DisableTailCalls) MPM.add(createTailCallEliminationPass()); // Eliminate tail calls @@ -208,6 +215,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores @@ -222,6 +230,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (BBVectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1 && UseGVNAfterVectorization) MPM.add(createGVNPass()); // Remove redundancies else @@ -232,9 +241,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopUnrollPass()); } + if (LoadCombine) + MPM.add(createLoadCombinePass()); + MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. 
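The addExtensionsToPM(EP_Peephole, ...) calls threaded through populateModulePassManager give front ends a hook that runs after each instcombine. A sketch of how a client could use it through the existing PassManagerBuilder::addExtension API; createMyPeepholePass and registerMyPeephole are hypothetical names, not part of this patch.

#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

using namespace llvm;

static void addMyPeephole(const PassManagerBuilder &, PassManagerBase &PM) {
  // PM.add(createMyPeepholePass());  // hypothetical pass, would run after each instcombine
}

void registerMyPeephole(PassManagerBuilder &PMB) {
  PMB.addExtension(PassManagerBuilder::EP_Peephole, addMyPeephole);
}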
+ addExtensionsToPM(EP_Peephole, MPM); // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC // pass manager that we are specifically trying to avoid. To prevent this @@ -247,6 +260,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // as function calls, so that we can only pass them when the vectorizer // changed the code. MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); if (!DisableUnrollLoops) @@ -299,6 +313,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // function pointers. When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); // Inline small functions if (RunInliner) @@ -317,6 +332,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // The IPO passes may leave cruft around. Clean up after them. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); // Break up allocas @@ -344,8 +360,12 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // More scalar chains could be vectorized due to more alias information PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + if (LoadCombine) + PM.add(createLoadCombinePass()); + // Cleanup and simplify the code after the scalar optimizations. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index e04b1be53dfb..ab4dc1ce23e6 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -37,8 +37,9 @@ enum SelectPatternFlavor { SPF_SMIN, SPF_UMIN, SPF_SMAX, - SPF_UMAX - // SPF_ABS - TODO. + SPF_UMAX, + SPF_ABS, + SPF_NABS }; /// getComplexity: Assign a complexity or rank value to LLVM Values... @@ -246,6 +247,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner bool DoXform = true); Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS); + bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS); Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef Mask); diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index c37a9cf2ef9f..f6658964ae85 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -889,29 +889,93 @@ static inline Value *dyn_castFoldableMul(Value *V, Constant *&CST) { return nullptr; } +// If one of the operands only has one non-zero bit, and if the other +// operand has a known-zero bit in a more significant place than it (not +// including the sign bit) the ripple may go up to and fill the zero, but +// won't change the sign. For example, (X & ~4) + 1. +static bool checkRippleForAdd(const APInt &Op0KnownZero, + const APInt &Op1KnownZero) { + APInt Op1MaybeOne = ~Op1KnownZero; + // Make sure that one of the operand has at most one bit set to 1. + if (Op1MaybeOne.countPopulation() != 1) + return false; + + // Find the most significant known 0 other than the sign bit. 
+ int BitWidth = Op0KnownZero.getBitWidth(); + APInt Op0KnownZeroTemp(Op0KnownZero); + Op0KnownZeroTemp.clearBit(BitWidth - 1); + int Op0ZeroPosition = BitWidth - Op0KnownZeroTemp.countLeadingZeros() - 1; + + int Op1OnePosition = BitWidth - Op1MaybeOne.countLeadingZeros() - 1; + assert(Op1OnePosition >= 0); + + // This also covers the case of no known zero, since in that case + // Op0ZeroPosition is -1. + return Op0ZeroPosition >= Op1OnePosition; +} /// WillNotOverflowSignedAdd - Return true if we can prove that: /// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out. +/// TODO: Handle this for Vectors. bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) { // There are different heuristics we can use for this. Here are some simple // ones. - // Add has the property that adding any two 2's complement numbers can only - // have one carry bit which can change a sign. As such, if LHS and RHS each - // have at least two sign bits, we know that the addition of the two values - // will sign extend fine. + // If LHS and RHS each have at least two sign bits, the addition will look + // like + // + // XX..... + + // YY..... + // + // If the carry into the most significant position is 0, X and Y can't both + // be 1 and therefore the carry out of the addition is also 0. + // + // If the carry into the most significant position is 1, X and Y can't both + // be 0 and therefore the carry out of the addition is also 1. + // + // Since the carry into the most significant position is always equal to + // the carry out of the addition, there is no signed overflow. if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1) return true; + if (IntegerType *IT = dyn_cast(LHS->getType())) { + int BitWidth = IT->getBitWidth(); + APInt LHSKnownZero(BitWidth, 0); + APInt LHSKnownOne(BitWidth, 0); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); - // If one of the operands only has one non-zero bit, and if the other operand - // has a known-zero bit in a more significant place than it (not including the - // sign bit) the ripple may go up to and fill the zero, but won't change the - // sign. For example, (X & ~4) + 1. + APInt RHSKnownZero(BitWidth, 0); + APInt RHSKnownOne(BitWidth, 0); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); + + // Addition of two 2's compliment numbers having opposite signs will never + // overflow. + if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) || + (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1])) + return true; + + // Check if carry bit of addition will not cause overflow. + if (checkRippleForAdd(LHSKnownZero, RHSKnownZero)) + return true; + if (checkRippleForAdd(RHSKnownZero, LHSKnownZero)) + return true; + } + return false; +} - // TODO: Implement. +/// WillNotOverflowUnsignedAdd - Return true if we can prove that: +/// (zext (add LHS, RHS)) === (add (zext LHS), (zext RHS)) +bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS) { + // There are different heuristics we can use for this. Here is a simple one. + // If the sign bit of LHS and that of RHS are both zero, no unsigned wrap. 
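The ripple argument quoted above for (X & ~4) + 1 can be sanity-checked exhaustively on a small domain: the known-zero bit at position 2 absorbs the single incoming carry before it can reach the sign bit. A brute-force sketch over all 8-bit values:

#include <cassert>
#include <cstdint>

int main() {
  for (int X = -128; X <= 127; ++X) {
    int LHS = static_cast<int8_t>(X) & ~4;  // bit 2 is known zero
    int Sum = LHS + 1;                      // RHS has exactly one set bit
    assert(Sum >= -128 && Sum <= 127);      // never signed-overflows in 8 bits
  }
  return 0;
}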
+ bool LHSKnownNonNegative, LHSKnownNegative; + bool RHSKnownNonNegative, RHSKnownNegative; + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0); + ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0); + if (LHSKnownNonNegative && RHSKnownNonNegative) + return true; return false; } @@ -1191,6 +1255,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateOr(A, B); } + // TODO(jingyue): Consider WillNotOverflowSignedAdd and + // WillNotOverflowUnsignedAdd to reduce the number of invocations of + // computeKnownBits. + if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS)) { + Changed = true; + I.setHasNoSignedWrap(true); + } + if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS)) { + Changed = true; + I.setHasNoUnsignedWrap(true); + } + return Changed ? &I : nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 18a0f9c167a9..d4bdd75fa82a 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -718,6 +718,46 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // Convert blendv* to vector selects if the mask is constant. + // This optimization is convoluted because the intrinsic is defined as + // getting a vector of floats or doubles for the ps and pd versions. + // FIXME: That should be changed. + Value *Mask = II->getArgOperand(2); + if (auto C = dyn_cast(Mask)) { + auto Tyi1 = Builder->getInt1Ty(); + auto SelectorType = cast(Mask->getType()); + auto EltTy = SelectorType->getElementType(); + unsigned Size = SelectorType->getNumElements(); + unsigned BitWidth = + EltTy->isFloatTy() + ? 32 + : (EltTy->isDoubleTy() ? 64 : EltTy->getIntegerBitWidth()); + assert((BitWidth == 64 || BitWidth == 32 || BitWidth == 8) && + "Wrong arguments for variable blend intrinsic"); + SmallVector Selectors; + for (unsigned I = 0; I < Size; ++I) { + // The intrinsics only read the top bit + uint64_t Selector; + if (BitWidth == 8) + Selector = C->getElementAsInteger(I); + else + Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue(); + Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); + } + auto NewSelector = ConstantVector::get(Selectors); + return SelectInst::Create(NewSelector, II->getArgOperand(1), + II->getArgOperand(0), "blendv"); + } else { + break; + } + } + case Intrinsic::x86_avx_vpermilvar_ps: case Intrinsic::x86_avx_vpermilvar_ps_256: case Intrinsic::x86_avx_vpermilvar_pd: @@ -760,6 +800,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + // Note that ppc_altivec_vperm has a big-endian bias, so when creating + // a vectorshuffle for little endian, we must undo the transformation + // performed on vec_perm in altivec.h. That is, we must complement + // the permutation mask with respect to 31 and reverse the order of + // V1 and V2. 
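For the blendv* hunks earlier in this file: the fold is valid because the intrinsics only consult the top bit of each mask lane, so a constant mask can be rewritten as a plain per-lane select of operand 1 over operand 0. A scalar sketch of that semantics with made-up lane values:

#include <cstdint>
#include <iostream>

int main() {
  const uint32_t A[4]    = {1, 2, 3, 4};                              // ArgOperand(0)
  const uint32_t B[4]    = {10, 20, 30, 40};                          // ArgOperand(1)
  const uint32_t Mask[4] = {0x80000000u, 0u, 0xFFFFFFFFu, 0x7FFFFFFFu};
  for (int I = 0; I < 4; ++I) {
    const bool TopBit = (Mask[I] >> 31) != 0;    // only the sign bit of the lane matters
    std::cout << (TopBit ? B[I] : A[I]) << ' ';  // prints: 10 2 30 4
  }
  std::cout << '\n';
}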
if (Constant *Mask = dyn_cast(II->getArgOperand(2))) { assert(Mask->getType()->getVectorNumElements() == 16 && "Bad type for intrinsic!"); @@ -792,10 +837,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { unsigned Idx = cast(Mask->getAggregateElement(i))->getZExtValue(); Idx &= 31; // Match the hardware behavior. + if (DL && DL->isLittleEndian()) + Idx = 31 - Idx; if (!ExtractedElts[Idx]) { + Value *Op0ToUse = (DL && DL->isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL && DL->isLittleEndian()) ? Op0 : Op1; ExtractedElts[Idx] = - Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, + Builder->CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, Builder->getInt32(Idx&15)); } @@ -836,8 +885,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: { + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: { Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); @@ -848,7 +897,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Check for constant LHS & RHS - in this case we just simplify. bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || - II->getIntrinsicID() == Intrinsic::arm64_neon_umull); + II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); VectorType *NewVT = cast(II->getType()); if (Constant *CV0 = dyn_cast(Arg0)) { if (Constant *CV1 = dyn_cast(Arg1)) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 356803ad7caf..ff083d7926cc 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1434,7 +1434,12 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { if (GetElementPtrInst *GEP = dyn_cast(Src)) { // If casting the result of a getelementptr instruction with no offset, turn // this into a cast of the original pointer! - if (GEP->hasAllZeroIndices()) { + if (GEP->hasAllZeroIndices() && + // If CI is an addrspacecast and GEP changes the poiner type, merging + // GEP into CI would undo canonicalizing addrspacecast with different + // pointer types, causing infinite loops. + (!isa(CI) || + GEP->getType() == GEP->getPointerOperand()->getType())) { // Changing the cast operand is usually not a good idea but it is safe // here because the pointer operand is being replaced with another // pointer operand so the opcode doesn't need to change. @@ -1904,5 +1909,24 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { + // If the destination pointer element type is not the the same as the source's + // do the addrspacecast to the same type, and then the bitcast in the new + // address space. This allows the cast to be exposed to other transforms. + Value *Src = CI.getOperand(0); + PointerType *SrcTy = cast(Src->getType()->getScalarType()); + PointerType *DestTy = cast(CI.getType()->getScalarType()); + + Type *DestElemTy = DestTy->getElementType(); + if (SrcTy->getElementType() != DestElemTy) { + Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace()); + if (VectorType *VT = dyn_cast(CI.getType())) { + // Handle vectors of pointers. 
+ MidTy = VectorType::get(MidTy, VT->getNumElements()); + } + + Value *NewBitCast = Builder->CreateBitCast(Src, MidTy); + return new AddrSpaceCastInst(NewBitCast, CI.getType()); + } + return commonPointerCastTransforms(CI); } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 02e8bf10135f..639d8319e0de 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -612,9 +612,10 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (ICmpInst::isSigned(Cond)) return nullptr; - // Look through bitcasts. - if (BitCastInst *BCI = dyn_cast(RHS)) - RHS = BCI->getOperand(0); + // Look through bitcasts and addrspacecasts. We do not however want to remove + // 0 GEPs. + if (!isa(RHS)) + RHS = RHS->stripPointerCasts(); Value *PtrBase = GEPLHS->getOperand(0); if (DL && PtrBase == RHS && GEPLHS->isInBounds()) { @@ -655,9 +656,24 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && PtrBase->stripPointerCasts() == GEPRHS->getOperand(0)->stripPointerCasts()) { + Value *LOffset = EmitGEPOffset(GEPLHS); + Value *ROffset = EmitGEPOffset(GEPRHS); + + // If we looked through an addrspacecast between different sized address + // spaces, the LHS and RHS pointers are different sized + // integers. Truncate to the smaller one. + Type *LHSIndexTy = LOffset->getType(); + Type *RHSIndexTy = ROffset->getType(); + if (LHSIndexTy != RHSIndexTy) { + if (LHSIndexTy->getPrimitiveSizeInBits() < + RHSIndexTy->getPrimitiveSizeInBits()) { + ROffset = Builder->CreateTrunc(ROffset, LHSIndexTy); + } else + LOffset = Builder->CreateTrunc(LOffset, RHSIndexTy); + } + Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond), - EmitGEPOffset(GEPLHS), - EmitGEPOffset(GEPRHS)); + LOffset, ROffset); return ReplaceInstUsesWith(I, Cmp); } @@ -2523,7 +2539,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // bit is set. If the comparison is against zero, then this is a check // to see if *that* bit is set. APInt Op0KnownZeroInverted = ~Op0KnownZero; - if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) { + if (~Op1KnownZero == 0) { // If the LHS is an AND with the same constant, look through it. Value *LHS = nullptr; ConstantInt *LHSC = nullptr; @@ -2533,11 +2549,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) == 0" into "x != 3". + // or turn "((1 << x)&7) == 0" into "x > 2". Value *X = nullptr; if (match(LHS, m_Shl(m_One(), m_Value(X)))) { - unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros(); - return new ICmpInst(ICmpInst::ICMP_NE, X, - ConstantInt::get(X->getType(), CmpVal)); + APInt ValToCheck = Op0KnownZeroInverted; + if (ValToCheck.isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros(); + return new ICmpInst(ICmpInst::ICMP_NE, X, + ConstantInt::get(X->getType(), CmpVal)); + } else if ((++ValToCheck).isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros() - 1; + return new ICmpInst(ICmpInst::ICMP_UGT, X, + ConstantInt::get(X->getType(), CmpVal)); + } } // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1, @@ -2560,7 +2584,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // bit is set. If the comparison is against zero, then this is a check // to see if *that* bit is set. 
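The new "((1 << x) & 7) == 0 becomes x > 2" fold above (and its "!= 0 becomes x < 3" counterpart in the next hunk) relies on the mask plus one being a power of two. A small exhaustive sketch check for the mask 7:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 31; ++X) {
    const bool MaskedIsZero = ((1u << X) & 7u) == 0;
    assert(MaskedIsZero == (X > 2));
    assert(!MaskedIsZero == (X < 3));
  }
  return 0;
}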
APInt Op0KnownZeroInverted = ~Op0KnownZero; - if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) { + if (~Op1KnownZero == 0) { // If the LHS is an AND with the same constant, look through it. Value *LHS = nullptr; ConstantInt *LHSC = nullptr; @@ -2570,11 +2594,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) != 0" into "x == 3". + // or turn "((1 << x)&7) != 0" into "x < 3". Value *X = nullptr; if (match(LHS, m_Shl(m_One(), m_Value(X)))) { - unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros(); - return new ICmpInst(ICmpInst::ICMP_EQ, X, - ConstantInt::get(X->getType(), CmpVal)); + APInt ValToCheck = Op0KnownZeroInverted; + if (ValToCheck.isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros(); + return new ICmpInst(ICmpInst::ICMP_EQ, X, + ConstantInt::get(X->getType(), CmpVal)); + } else if ((++ValToCheck).isPowerOf2()) { + unsigned CmpVal = ValToCheck.countTrailingZeros(); + return new ICmpInst(ICmpInst::ICMP_ULT, X, + ConstantInt::get(X->getType(), CmpVal)); + } } // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1, diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 9a41e4b9406a..06c9e290c6ea 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -31,13 +31,18 @@ MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { ICmpInst *ICI = dyn_cast(SI->getCondition()); if (!ICI) return SPF_UNKNOWN; - LHS = ICI->getOperand(0); - RHS = ICI->getOperand(1); + ICmpInst::Predicate Pred = ICI->getPredicate(); + Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + + LHS = CmpLHS; + RHS = CmpRHS; // (icmp X, Y) ? X : Y - if (SI->getTrueValue() == ICI->getOperand(0) && - SI->getFalseValue() == ICI->getOperand(1)) { - switch (ICI->getPredicate()) { + if (TrueVal == CmpLHS && FalseVal == CmpRHS) { + switch (Pred) { default: return SPF_UNKNOWN; // Equality. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return SPF_UMAX; @@ -51,18 +56,35 @@ MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { } // (icmp X, Y) ? Y : X - if (SI->getTrueValue() == ICI->getOperand(1) && - SI->getFalseValue() == ICI->getOperand(0)) { - switch (ICI->getPredicate()) { - default: return SPF_UNKNOWN; // Equality. - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMIN; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMIN; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMAX; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMAX; + if (TrueVal == CmpRHS && FalseVal == CmpLHS) { + switch (Pred) { + default: return SPF_UNKNOWN; // Equality. + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: return SPF_UMIN; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: return SPF_SMIN; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: return SPF_UMAX; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: return SPF_SMAX; + } + } + + if (ConstantInt *C1 = dyn_cast(CmpRHS)) { + if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) || + (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) { + + // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X + // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? 
-X : X + if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) { + return (CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS; + } + + // ABS(X) ==> (X (X isZero() || C1->isOne())) { + return (CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS; + } } } @@ -365,7 +387,15 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, /// 1. The icmp predicate is inverted /// 2. The select operands are reversed /// 3. The magnitude of C2 and C1 are flipped -static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, +/// +/// This also tries to turn +/// --- Single bit tests: +/// if ((x & C) == 0) x |= C to x |= C +/// if ((x & C) != 0) x ^= C to x &= ~C +/// if ((x & C) == 0) x ^= C to x |= C +/// if ((x & C) != 0) x &= ~C to x &= ~C +/// if ((x & C) == 0) x &= ~C to nothing +static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal, Value *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast(SI.getCondition()); @@ -384,6 +414,25 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, return nullptr; const APInt *C2; + if (match(TrueVal, m_Specific(X))) { + // if ((X & C) != 0) X ^= C becomes X &= ~C + if (match(FalseVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2) + return Builder->CreateAnd(X, ~(*C1)); + // if ((X & C) != 0) X &= ~C becomes X &= ~C + if (match(FalseVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2)) + return FalseVal; + } else if (match(FalseVal, m_Specific(X))) { + // if ((X & C) == 0) X ^= C becomes X |= C + if (match(TrueVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2) + return Builder->CreateOr(X, *C1); + // if ((X & C) == 0) X &= ~C becomes nothing + if (match(TrueVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2)) + return X; + // if ((X & C) == 0) X |= C becomes X |= C + if (match(TrueVal, m_Or(m_Specific(X), m_APInt(C2))) && C1 == C2) + return TrueVal; + } + bool OrOnTrueVal = false; bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2))); if (!OrOnFalseVal) @@ -677,6 +726,22 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, } } } + + // ABS(ABS(X)) -> ABS(X) + // NABS(NABS(X)) -> NABS(X) + if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) { + return ReplaceInstUsesWith(Outer, Inner); + } + + // ABS(NABS(X)) -> ABS(X) + // NABS(ABS(X)) -> NABS(X) + if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) || + (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) { + SelectInst *SI = cast(Inner); + Value *NewSI = Builder->CreateSelect( + SI->getCondition(), SI->getFalseValue(), SI->getTrueValue()); + return ReplaceInstUsesWith(Outer, NewSI); + } return nullptr; } @@ -981,7 +1046,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // TODO. // ABS(-X) -> ABS(X) - // ABS(ABS(X)) -> ABS(X) } // See if we can fold the select into a phi node if the condition is a select. diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 4c36887f6285..991ad796a7e3 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1220,6 +1220,91 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (MadeChange) return &GEP; } + // Check to see if the inputs to the PHI node are getelementptr instructions. 
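A source-level sketch of the single-bit test patterns that the new foldSelectICmpAndOr cases above collapse into straight bit operations. Function names are illustrative only; the equivalences are checked by the small driver.

#include <cassert>

unsigned set_bit(unsigned X)    { return (X & 8) == 0 ? (X | 8) : X; }   // -> X | 8
unsigned clear_bit(unsigned X)  { return (X & 8) != 0 ? (X ^ 8) : X; }   // -> X & ~8u
unsigned keep_clear(unsigned X) { return (X & 8) == 0 ? X : (X & ~8u); } // -> X & ~8u

int main() {
  for (unsigned X = 0; X < 32; ++X) {
    assert(set_bit(X)    == (X | 8));
    assert(clear_bit(X)  == (X & ~8u));
    assert(keep_clear(X) == (X & ~8u));
  }
  return 0;
}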
+ if (PHINode *PN = dyn_cast(PtrOp)) { + GetElementPtrInst *Op1 = dyn_cast(PN->getOperand(0)); + if (!Op1) + return nullptr; + + signed DI = -1; + + for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) { + GetElementPtrInst *Op2 = dyn_cast(*I); + if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands()) + return nullptr; + + // Keep track of the type as we walk the GEP. + Type *CurTy = Op1->getOperand(0)->getType()->getScalarType(); + + for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) { + if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType()) + return nullptr; + + if (Op1->getOperand(J) != Op2->getOperand(J)) { + if (DI == -1) { + // We have not seen any differences yet in the GEPs feeding the + // PHI yet, so we record this one if it is allowed to be a + // variable. + + // The first two arguments can vary for any GEP, the rest have to be + // static for struct slots + if (J > 1 && CurTy->isStructTy()) + return nullptr; + + DI = J; + } else { + // The GEP is different by more than one input. While this could be + // extended to support GEPs that vary by more than one variable it + // doesn't make sense since it greatly increases the complexity and + // would result in an R+R+R addressing mode which no backend + // directly supports and would need to be broken into several + // simpler instructions anyway. + return nullptr; + } + } + + // Sink down a layer of the type for the next iteration. + if (J > 0) { + if (CompositeType *CT = dyn_cast(CurTy)) { + CurTy = CT->getTypeAtIndex(Op1->getOperand(J)); + } else { + CurTy = nullptr; + } + } + } + } + + GetElementPtrInst *NewGEP = cast(Op1->clone()); + + if (DI == -1) { + // All the GEPs feeding the PHI are identical. Clone one down into our + // BB so that it can be merged with the current GEP. + GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), + NewGEP); + } else { + // All the GEPs feeding the PHI differ at a single offset. Clone a GEP + // into the current block so it can be merged, and create a new PHI to + // set that index. + Instruction *InsertPt = Builder->GetInsertPoint(); + Builder->SetInsertPoint(PN); + PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + Builder->SetInsertPoint(InsertPt); + + for (auto &I : PN->operands()) + NewPN->addIncoming(cast(I)->getOperand(DI), + PN->getIncomingBlock(I)); + + NewGEP->setOperand(DI, NewPN); + GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(), + NewGEP); + NewGEP->setOperand(DI, NewPN); + } + + GEP.setOperand(0, NewGEP); + PtrOp = NewGEP; + } + // Combine Indices - If the source pointer to this getelementptr instruction // is a getelementptr instruction, combine the indices of the two // getelementptr instructions into a single instruction. @@ -2014,7 +2099,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // Simplify the list of clauses, eg by removing repeated catch clauses // (these are often created by inlining). bool MakeNewInstruction = false; // If true, recreate using the following: - SmallVector NewClauses; // - Clauses for the new instruction; + SmallVector NewClauses; // - Clauses for the new instruction; bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup. SmallPtrSet AlreadyCaught; // Typeinfos known caught already. @@ -2022,8 +2107,8 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { bool isLastClause = i + 1 == e; if (LI.isCatch(i)) { // A catch clause. 
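Returning to the PHI-of-GEPs combine above, a source-level sketch of the shape it targets. The names are illustrative; the transform itself runs on IR where the two address computations reach the load through a phi node and differ in exactly one non-struct index.

int load_row(int (*Rows)[16], bool UseI, int I, int J) {
  int *P = UseI ? &Rows[I][3] : &Rows[J][3];  // two GEPs differing in a single index
  return *P;                                  // the index itself can become the phi,
}                                             // leaving one GEP next to the load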
- Value *CatchClause = LI.getClause(i); - Constant *TypeInfo = cast(CatchClause->stripPointerCasts()); + Constant *CatchClause = LI.getClause(i); + Constant *TypeInfo = CatchClause->stripPointerCasts(); // If we already saw this clause, there is no point in having a second // copy of it. @@ -2052,7 +2137,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // equal (for example if one represents a C++ class, and the other some // class derived from it). assert(LI.isFilter(i) && "Unsupported landingpad clause!"); - Value *FilterClause = LI.getClause(i); + Constant *FilterClause = LI.getClause(i); ArrayType *FilterType = cast(FilterClause->getType()); unsigned NumTypeInfos = FilterType->getNumElements(); @@ -2096,8 +2181,8 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { // catch-alls. If so, the filter can be discarded. bool SawCatchAll = false; for (unsigned j = 0; j != NumTypeInfos; ++j) { - Value *Elt = Filter->getOperand(j); - Constant *TypeInfo = cast(Elt->stripPointerCasts()); + Constant *Elt = Filter->getOperand(j); + Constant *TypeInfo = Elt->stripPointerCasts(); if (isCatchAll(Personality, TypeInfo)) { // This element is a catch-all. Bail out, noting this fact. SawCatchAll = true; @@ -2202,7 +2287,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { continue; // If Filter is a subset of LFilter, i.e. every element of Filter is also // an element of LFilter, then discard LFilter. - SmallVectorImpl::iterator J = NewClauses.begin() + j; + SmallVectorImpl::iterator J = NewClauses.begin() + j; // If Filter is empty then it is a subset of LFilter. if (!FElts) { // Discard LFilter. diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 291ad2ed47e5..6dbcde03cf2d 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/system_error.h" #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -48,6 +47,7 @@ #include "llvm/Transforms/Utils/SpecialCaseList.h" #include #include +#include using namespace llvm; @@ -70,7 +70,7 @@ static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; static const char *const kAsanModuleCtorName = "asan.module_ctor"; static const char *const kAsanModuleDtorName = "asan.module_dtor"; -static const int kAsanCtorAndCtorPriority = 1; +static const int kAsanCtorAndDtorPriority = 1; static const char *const kAsanReportErrorTemplate = "__asan_report_"; static const char *const kAsanReportLoadN = "__asan_report_load_n"; static const char *const kAsanReportStoreN = "__asan_report_store_n"; @@ -80,6 +80,7 @@ static const char *const kAsanUnregisterGlobalsName = static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; static const char *const kAsanInitName = "__asan_init_v3"; +static const char *const kAsanCovModuleInitName = "__sanitizer_cov_module_init"; static const char *const kAsanCovName = "__sanitizer_cov"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; @@ -127,9 +128,8 @@ static cl::opt ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb", // This flag may 
need to be replaced with -f[no]asan-stack. static cl::opt ClStack("asan-stack", cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); -// This flag may need to be replaced with -f[no]asan-use-after-return. static cl::opt ClUseAfterReturn("asan-use-after-return", - cl::desc("Check return-after-free"), cl::Hidden, cl::init(false)); + cl::desc("Check return-after-free"), cl::Hidden, cl::init(true)); // This flag may need to be replaced with -f[no]asan-globals. static cl::opt ClGlobals("asan-globals", cl::desc("Handle global objects"), cl::Hidden, cl::init(true)); @@ -141,7 +141,7 @@ static cl::opt ClCoverageBlockThreshold("asan-coverage-block-threshold", "are more than this number of blocks."), cl::Hidden, cl::init(1500)); static cl::opt ClInitializers("asan-initialization-order", - cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false)); + cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(true)); static cl::opt ClInvalidPointerPairs("asan-detect-invalid-pointer-pair", cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden, cl::init(false)); @@ -156,7 +156,7 @@ static cl::opt ClInstrumentationWithCallsThreshold( cl::desc("If the function being instrumented contains more than " "this number of memory accesses, use callbacks instead of " "inline checks (-1 means never use callbacks)."), - cl::Hidden, cl::init(10000)); + cl::Hidden, cl::init(7000)); static cl::opt ClMemoryAccessCallbackPrefix( "asan-memory-access-callback-prefix", cl::desc("Prefix for memory access callbacks"), cl::Hidden, @@ -224,8 +224,7 @@ class SetOfDynamicallyInitializedGlobals { M.getNamedMetadata("llvm.asan.dynamically_initialized_globals"); if (!DynamicGlobals) return; - for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) { - MDNode *MDN = DynamicGlobals->getOperand(i); + for (const auto MDN : DynamicGlobals->operands()) { assert(MDN->getNumOperands() == 1); Value *VG = MDN->getOperand(0); // The optimizer may optimize away a global entirely, in which case we @@ -305,16 +304,7 @@ static size_t RedzoneSizeForScale(int MappingScale) { /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public FunctionPass { - AddressSanitizer(bool CheckInitOrder = true, - bool CheckUseAfterReturn = false, - bool CheckLifetime = false, - StringRef BlacklistFile = StringRef()) - : FunctionPass(ID), - CheckInitOrder(CheckInitOrder || ClInitializers), - CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn), - CheckLifetime(CheckLifetime || ClCheckLifetime), - BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile - : BlacklistFile) {} + AddressSanitizer() : FunctionPass(ID) {} const char *getPassName() const override { return "AddressSanitizerFunctionPass"; } @@ -343,11 +333,6 @@ struct AddressSanitizer : public FunctionPass { bool InjectCoverage(Function &F, const ArrayRef AllBlocks); void InjectCoverageAtBlock(Function &F, BasicBlock &BB); - bool CheckInitOrder; - bool CheckUseAfterReturn; - bool CheckLifetime; - SmallString<64> BlacklistFile; - LLVMContext *C; const DataLayout *DL; int LongSize; @@ -358,7 +343,6 @@ struct AddressSanitizer : public FunctionPass { Function *AsanHandleNoReturnFunc; Function *AsanCovFunction; Function *AsanPtrCmpFunction, *AsanPtrSubFunction; - std::unique_ptr BL; // This array is indexed by AccessIsWrite and log2(AccessSize). 
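The cl::opt changes above flip ClUseAfterReturn and ClInitializers to default-on and lower the inline-check threshold from 10000 to 7000 memory accesses. The previous behavior stays reachable through the same option names shown in the hunks, for example (assumed invocation, flags passed through -mllvm):

clang -fsanitize=address -O1 \
      -mllvm -asan-use-after-return=0 \
      -mllvm -asan-initialization-order=0 \
      test.cpp -c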
Function *AsanErrorCallback[2][kNumberOfAccessSizes]; Function *AsanMemoryAccessCallback[2][kNumberOfAccessSizes]; @@ -374,12 +358,9 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: - AddressSanitizerModule(bool CheckInitOrder = true, - StringRef BlacklistFile = StringRef()) - : ModulePass(ID), - CheckInitOrder(CheckInitOrder || ClInitializers), - BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile - : BlacklistFile) {} + AddressSanitizerModule(StringRef BlacklistFile = StringRef()) + : ModulePass(ID), BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile + : BlacklistFile) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid const char *getPassName() const override { @@ -389,13 +370,14 @@ class AddressSanitizerModule : public ModulePass { private: void initializeCallbacks(Module &M); + bool InstrumentGlobals(IRBuilder<> &IRB, Module &M); bool ShouldInstrumentGlobal(GlobalVariable *G); + void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName); void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName); size_t MinRedzoneSizeForGlobal() const { return RedzoneSizeForScale(Mapping.Scale); } - bool CheckInitOrder; SmallString<64> BlacklistFile; std::unique_ptr BL; @@ -408,6 +390,7 @@ class AddressSanitizerModule : public ModulePass { Function *AsanUnpoisonGlobals; Function *AsanRegisterGlobals; Function *AsanUnregisterGlobals; + Function *AsanCovModuleInit; }; // Stack poisoning does not play well with exception handling. @@ -495,7 +478,7 @@ struct FunctionStackPoisoner : public InstVisitor { /// \brief Collect lifetime intrinsic calls to check for use-after-scope /// errors. void visitIntrinsicInst(IntrinsicInst &II) { - if (!ASan.CheckLifetime) return; + if (!ClCheckLifetime) return; Intrinsic::ID ID = II.getIntrinsicID(); if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end) @@ -550,20 +533,16 @@ char AddressSanitizer::ID = 0; INITIALIZE_PASS(AddressSanitizer, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) -FunctionPass *llvm::createAddressSanitizerFunctionPass( - bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime, - StringRef BlacklistFile) { - return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn, - CheckLifetime, BlacklistFile); +FunctionPass *llvm::createAddressSanitizerFunctionPass() { + return new AddressSanitizer(); } char AddressSanitizerModule::ID = 0; INITIALIZE_PASS(AddressSanitizerModule, "asan-module", "AddressSanitizer: detects use-after-free and out-of-bounds bugs." "ModulePass", false, false) -ModulePass *llvm::createAddressSanitizerModulePass( - bool CheckInitOrder, StringRef BlacklistFile) { - return new AddressSanitizerModule(CheckInitOrder, BlacklistFile); +ModulePass *llvm::createAddressSanitizerModulePass(StringRef BlacklistFile) { + return new AddressSanitizerModule(BlacklistFile); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -623,26 +602,31 @@ void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { } // If I is an interesting memory access, return the PointerOperand -// and set IsWrite. Otherwise return NULL. -static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { +// and set IsWrite/Alignment. Otherwise return NULL. 
+static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, + unsigned *Alignment) { if (LoadInst *LI = dyn_cast(I)) { if (!ClInstrumentReads) return nullptr; *IsWrite = false; + *Alignment = LI->getAlignment(); return LI->getPointerOperand(); } if (StoreInst *SI = dyn_cast(I)) { if (!ClInstrumentWrites) return nullptr; *IsWrite = true; + *Alignment = SI->getAlignment(); return SI->getPointerOperand(); } if (AtomicRMWInst *RMW = dyn_cast(I)) { if (!ClInstrumentAtomics) return nullptr; *IsWrite = true; + *Alignment = 0; return RMW->getPointerOperand(); } if (AtomicCmpXchgInst *XCHG = dyn_cast(I)) { if (!ClInstrumentAtomics) return nullptr; *IsWrite = true; + *Alignment = 0; return XCHG->getPointerOperand(); } return nullptr; @@ -692,13 +676,14 @@ AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) { void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) { bool IsWrite = false; - Value *Addr = isInterestingMemoryAccess(I, &IsWrite); + unsigned Alignment = 0; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &Alignment); assert(Addr); if (ClOpt && ClOptGlobals) { if (GlobalVariable *G = dyn_cast(Addr)) { // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. - if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) { + if (!ClInitializers || GlobalIsLinkerInitialized(G)) { NumOptimizedAccessesToGlobalVar++; return; } @@ -727,11 +712,14 @@ void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) { else NumInstrumentedReads++; - // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check. - if (TypeSize == 8 || TypeSize == 16 || - TypeSize == 32 || TypeSize == 64 || TypeSize == 128) + unsigned Granularity = 1 << Mapping.Scale; + // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check + // if the data is properly aligned. + if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || + TypeSize == 128) && + (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls); - // Instrument unusual size (but still multiple of 8). + // Instrument unusual size or unusual alignment. // We can not do it with a single check, so we do 1-byte check for the first // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able // to report the actual access size. @@ -840,48 +828,36 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Crash->setDebugLoc(OrigIns->getDebugLoc()); } -void AddressSanitizerModule::createInitializerPoisonCalls( - Module &M, GlobalValue *ModuleName) { - // We do all of our poisoning and unpoisoning within a global constructor. - // These are called _GLOBAL__(sub_)?I_.*. - // TODO: Consider looking through the functions in - // M.getGlobalVariable("llvm.global_ctors") instead of using this stringly - // typed approach. 
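The new alignment-aware condition added to instrumentMop() above boils down to a single predicate: a power-of-two-sized access gets one shadow check only when its alignment guarantees it cannot straddle a shadow granule. A minimal sketch of that predicate, lifted from the logic in the hunk:

#include <cstdint>

static bool canUseSingleShadowCheck(uint64_t TypeSizeInBits, unsigned Alignment,
                                    unsigned MappingScale) {
  unsigned Granularity = 1u << MappingScale; // bytes covered by one shadow byte
  bool PowerOfTwoSize = TypeSizeInBits == 8 || TypeSizeInBits == 16 ||
                        TypeSizeInBits == 32 || TypeSizeInBits == 64 ||
                        TypeSizeInBits == 128;
  // Alignment == 0 means "ABI alignment", which is at least the type size here.
  return PowerOfTwoSize &&
         (Alignment >= Granularity || Alignment == 0 ||
          Alignment >= TypeSizeInBits / 8);
}

Accesses that fail this test fall through to the two 1-byte checks plus __asan_report_*_n path mentioned in the comment that follows.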
- Function *GlobalInit = nullptr; - for (auto &F : M.getFunctionList()) { - StringRef FName = F.getName(); - - const char kGlobalPrefix[] = "_GLOBAL__"; - if (!FName.startswith(kGlobalPrefix)) - continue; - FName = FName.substr(strlen(kGlobalPrefix)); - - const char kOptionalSub[] = "sub_"; - if (FName.startswith(kOptionalSub)) - FName = FName.substr(strlen(kOptionalSub)); - - if (FName.startswith("I_")) { - GlobalInit = &F; - break; - } - } - // If that function is not present, this TU contains no globals, or they have - // all been optimized away - if (!GlobalInit) - return; - +void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit, + GlobalValue *ModuleName) { // Set up the arguments to our poison/unpoison functions. - IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt()); + IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt()); // Add a call to poison all external globals before the given function starts. Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr); // Add calls to unpoison all globals before each return instruction. - for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end(); - I != E; ++I) { - if (ReturnInst *RI = dyn_cast(I->getTerminator())) { + for (auto &BB : GlobalInit.getBasicBlockList()) + if (ReturnInst *RI = dyn_cast(BB.getTerminator())) CallInst::Create(AsanUnpoisonGlobals, "", RI); +} + +void AddressSanitizerModule::createInitializerPoisonCalls( + Module &M, GlobalValue *ModuleName) { + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + + ConstantArray *CA = cast(GV->getInitializer()); + for (Use &OP : CA->operands()) { + if (isa(OP)) + continue; + ConstantStruct *CS = cast(OP); + + // Must have a function or null ptr. + // (CS->getOperand(0) is the init priority.) + if (Function* F = dyn_cast(CS->getOperand(1))) { + if (F->getName() != kAsanModuleCtorName) + poisonOneInitializer(*F, ModuleName); } } } @@ -981,34 +957,23 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) { kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); + AsanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction( + kAsanCovModuleInitName, + IRB.getVoidTy(), IntptrTy, NULL)); + AsanCovModuleInit->setLinkage(Function::ExternalLinkage); } // This function replaces all global variables with new variables that have // trailing redzones. It also creates a function that poisons // redzones and inserts this function into llvm.global_ctors. 
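The rewritten createInitializerPoisonCalls() above walks llvm.global_ctors instead of pattern-matching _GLOBAL__(sub_)?I_ names. A minimal sketch of that traversal, mirroring the patch (entry layout: {priority, constructor, ...}); the helper name is hypothetical:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static void forEachGlobalCtor(Module &M, void (*Visit)(Function &)) {
  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
  if (!GV)
    return; // no static initializers in this TU
  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
  for (Use &OP : CA->operands()) {
    if (isa<ConstantAggregateZero>(OP))
      continue;
    ConstantStruct *CS = cast<ConstantStruct>(OP);
    // Operand 0 is the init priority; operand 1 is the constructor or null.
    if (Function *F = dyn_cast<Function>(CS->getOperand(1)))
      Visit(*F);
  }
}

The pass additionally skips its own module constructor (kAsanModuleCtorName) to avoid poisoning around the ASan init code itself.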
-bool AddressSanitizerModule::runOnModule(Module &M) { - if (!ClGlobals) return false; - - DataLayoutPass *DLP = getAnalysisIfAvailable(); - if (!DLP) - return false; - DL = &DLP->getDataLayout(); - - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); - if (BL->isIn(M)) return false; - C = &(M.getContext()); - int LongSize = DL->getPointerSizeInBits(); - IntptrTy = Type::getIntNTy(*C, LongSize); - Mapping = getShadowMapping(M, LongSize); - initializeCallbacks(M); +bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { DynamicallyInitializedGlobals.Init(M); SmallVector GlobalsToChange; - for (Module::GlobalListType::iterator G = M.global_begin(), - E = M.global_end(); G != E; ++G) { - if (ShouldInstrumentGlobal(G)) - GlobalsToChange.push_back(G); + for (auto &G : M.globals()) { + if (ShouldInstrumentGlobal(&G)) + GlobalsToChange.push_back(&G); } size_t n = GlobalsToChange.size(); @@ -1027,10 +992,6 @@ bool AddressSanitizerModule::runOnModule(Module &M) { IntptrTy, IntptrTy, NULL); SmallVector Initializers(n); - Function *CtorFunc = M.getFunction(kAsanModuleCtorName); - assert(CtorFunc); - IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); - bool HasDynamicallyInitializedGlobals = false; // We shouldn't merge same module names, as this string serves as unique @@ -1059,8 +1020,6 @@ bool AddressSanitizerModule::runOnModule(Module &M) { // Determine whether this global should be poisoned in initialization. bool GlobalHasDynamicInitializer = DynamicallyInitializedGlobals.Contains(G); - // Don't check initialization order if this global is blacklisted. - GlobalHasDynamicInitializer &= !BL->isIn(*G, "init"); StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL); Constant *NewInitializer = ConstantStruct::get( @@ -1100,7 +1059,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { NULL); // Populate the first and last globals declared in this TU. - if (CheckInitOrder && GlobalHasDynamicInitializer) + if (ClInitializers && GlobalHasDynamicInitializer) HasDynamicallyInitializedGlobals = true; DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); @@ -1112,7 +1071,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); // Create calls for poisoning before initializers run and unpoisoning after. 
- if (CheckInitOrder && HasDynamicallyInitializedGlobals) + if (HasDynamicallyInitializedGlobals) createInitializerPoisonCalls(M, ModuleName); IRB.CreateCall2(AsanRegisterGlobals, IRB.CreatePointerCast(AllGlobals, IntptrTy), @@ -1128,12 +1087,42 @@ bool AddressSanitizerModule::runOnModule(Module &M) { IRB_Dtor.CreateCall2(AsanUnregisterGlobals, IRB.CreatePointerCast(AllGlobals, IntptrTy), ConstantInt::get(IntptrTy, n)); - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndCtorPriority); + appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority); DEBUG(dbgs() << M); return true; } +bool AddressSanitizerModule::runOnModule(Module &M) { + DataLayoutPass *DLP = getAnalysisIfAvailable(); + if (!DLP) + return false; + DL = &DLP->getDataLayout(); + BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); + C = &(M.getContext()); + int LongSize = DL->getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + Mapping = getShadowMapping(M, LongSize); + initializeCallbacks(M); + + bool Changed = false; + + Function *CtorFunc = M.getFunction(kAsanModuleCtorName); + assert(CtorFunc); + IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); + + if (ClCoverage > 0) { + Function *CovFunc = M.getFunction(kAsanCovName); + int nCov = CovFunc ? CovFunc->getNumUses() : 0; + IRB.CreateCall(AsanCovModuleInit, ConstantInt::get(IntptrTy, nCov)); + Changed = true; + } + + if (ClGlobals && !BL->isIn(M)) Changed |= InstrumentGlobals(IRB, M); + + return Changed; +} + void AddressSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); // Create __asan_report* callbacks. @@ -1197,7 +1186,6 @@ bool AddressSanitizer::doInitialization(Module &M) { report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); DynamicallyInitializedGlobals.Init(M); C = &(M.getContext()); @@ -1217,7 +1205,7 @@ bool AddressSanitizer::doInitialization(Module &M) { Mapping = getShadowMapping(M, LongSize); - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority); + appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); return true; } @@ -1248,7 +1236,9 @@ void AddressSanitizer::InjectCoverageAtBlock(Function &F, BasicBlock &BB) { break; } + DebugLoc EntryLoc = IP->getDebugLoc().getFnDebugLoc(*C); IRBuilder<> IRB(IP); + IRB.SetCurrentDebugLocation(EntryLoc); Type *Int8Ty = IRB.getInt8Ty(); GlobalVariable *Guard = new GlobalVariable( *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage, @@ -1260,10 +1250,10 @@ void AddressSanitizer::InjectCoverageAtBlock(Function &F, BasicBlock &BB) { Instruction *Ins = SplitBlockAndInsertIfThen( Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); IRB.SetInsertPoint(Ins); + IRB.SetCurrentDebugLocation(EntryLoc); // We pass &F to __sanitizer_cov. We could avoid this and rely on // GET_CALLER_PC, but having the PC of the first instruction is just nice. 
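The coverage injection in InjectCoverageAtBlock() above amounts to a per-block guard byte checked on a cold path. Roughly what the instrumented block does at run time, written as C++ rather than IR; the __sanitizer_cov signature is an assumption of this sketch:

#include <atomic>

extern "C" void __sanitizer_cov(); // assumed runtime interface

static std::atomic<unsigned char> Guard{0}; // one private i8 guard per block

inline void coverageHit() {
  if (Guard.load(std::memory_order_relaxed) == 0) { // branch weighted 1:100000
    __sanitizer_cov();                              // record this block once
    Guard.store(1, std::memory_order_relaxed);      // monotonic i8 store in IR
  }
}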
- Instruction *Call = IRB.CreateCall(AsanCovFunction); - Call->setDebugLoc(IP->getDebugLoc()); + IRB.CreateCall(AsanCovFunction); StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard); Store->setAtomic(Monotonic); Store->setAlignment(1); @@ -1297,14 +1287,13 @@ bool AddressSanitizer::InjectCoverage(Function &F, (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) { InjectCoverageAtBlock(F, F.getEntryBlock()); } else { - for (size_t i = 0, n = AllBlocks.size(); i < n; i++) - InjectCoverageAtBlock(F, *AllBlocks[i]); + for (auto BB : AllBlocks) + InjectCoverageAtBlock(F, *BB); } return true; } bool AddressSanitizer::runOnFunction(Function &F) { - if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); @@ -1328,31 +1317,31 @@ bool AddressSanitizer::runOnFunction(Function &F) { SmallVector PointerComparisonsOrSubtracts; int NumAllocas = 0; bool IsWrite; + unsigned Alignment; // Fill the set of memory operations to instrument. - for (Function::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) { - AllBlocks.push_back(FI); + for (auto &BB : F) { + AllBlocks.push_back(&BB); TempsToInstrument.clear(); int NumInsnsPerBB = 0; - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - if (LooksLikeCodeInBug11395(BI)) return false; - if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) { + for (auto &Inst : BB) { + if (LooksLikeCodeInBug11395(&Inst)) return false; + if (Value *Addr = + isInterestingMemoryAccess(&Inst, &IsWrite, &Alignment)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr)) continue; // We've seen this temp in the current BB. } } else if (ClInvalidPointerPairs && - isInterestingPointerComparisonOrSubtraction(BI)) { - PointerComparisonsOrSubtracts.push_back(BI); + isInterestingPointerComparisonOrSubtraction(&Inst)) { + PointerComparisonsOrSubtracts.push_back(&Inst); continue; - } else if (isa(BI)) { + } else if (isa(Inst)) { // ok, take it. } else { - if (isa(BI)) + if (isa(Inst)) NumAllocas++; - CallSite CS(BI); + CallSite CS(&Inst); if (CS) { // A call inside BB. TempsToInstrument.clear(); @@ -1361,7 +1350,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { } continue; } - ToInstrument.push_back(BI); + ToInstrument.push_back(&Inst); NumInsnsPerBB++; if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break; @@ -1386,11 +1375,10 @@ bool AddressSanitizer::runOnFunction(Function &F) { // Instrument. int NumInstrumented = 0; - for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { - Instruction *Inst = ToInstrument[i]; + for (auto Inst : ToInstrument) { if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isInterestingMemoryAccess(Inst, &IsWrite)) + if (isInterestingMemoryAccess(Inst, &IsWrite, &Alignment)) instrumentMop(Inst, UseCalls); else instrumentMemIntrinsic(cast(Inst)); @@ -1403,14 +1391,13 @@ bool AddressSanitizer::runOnFunction(Function &F) { // We must unpoison the stack before every NoReturn call (throw, _exit, etc). // See e.g. 
http://code.google.com/p/address-sanitizer/issues/detail?id=37 - for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) { - Instruction *CI = NoReturnCalls[i]; + for (auto CI : NoReturnCalls) { IRBuilder<> IRB(CI); IRB.CreateCall(AsanHandleNoReturnFunc); } - for (size_t i = 0, n = PointerComparisonsOrSubtracts.size(); i != n; i++) { - instrumentPointerComparisonOrSubtraction(PointerComparisonsOrSubtracts[i]); + for (auto Inst : PointerComparisonsOrSubtracts) { + instrumentPointerComparisonOrSubtraction(Inst); NumInstrumented++; } @@ -1523,12 +1510,10 @@ void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined( } static DebugLoc getFunctionEntryDebugLocation(Function &F) { - BasicBlock::iterator I = F.getEntryBlock().begin(), - E = F.getEntryBlock().end(); - for (; I != E; ++I) - if (!isa(I)) - break; - return I->getDebugLoc(); + for (const auto &Inst : F.getEntryBlock()) + if (!isa(Inst)) + return Inst.getDebugLoc(); + return DebugLoc(); } void FunctionStackPoisoner::poisonStack() { @@ -1542,8 +1527,7 @@ void FunctionStackPoisoner::poisonStack() { SmallVector SVD; SVD.reserve(AllocaVec.size()); - for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { - AllocaInst *AI = AllocaVec[i]; + for (AllocaInst *AI : AllocaVec) { ASanStackVariableDescription D = { AI->getName().data(), getAllocaSizeInBytes(AI), AI->getAlignment(), AI, 0}; @@ -1557,7 +1541,7 @@ void FunctionStackPoisoner::poisonStack() { DEBUG(dbgs() << L.DescriptionString << " --- " << L.FrameSize << "\n"); uint64_t LocalStackSize = L.FrameSize; bool DoStackMalloc = - ASan.CheckUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; + ClUseAfterReturn && LocalStackSize <= kMaxStackMallocSize; Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); AllocaInst *MyAlloca = @@ -1598,8 +1582,7 @@ void FunctionStackPoisoner::poisonStack() { // Insert poison calls for lifetime intrinsics for alloca. bool HavePoisonedAllocas = false; - for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) { - const AllocaPoisonCall &APC = AllocaPoisonCallVec[i]; + for (const auto &APC : AllocaPoisonCallVec) { assert(APC.InsBefore); assert(APC.AI); IRBuilder<> IRB(APC.InsBefore); @@ -1608,11 +1591,10 @@ void FunctionStackPoisoner::poisonStack() { } // Replace Alloca instructions with base+offset. - for (size_t i = 0, n = SVD.size(); i < n; i++) { - AllocaInst *AI = SVD[i].AI; + for (const auto &Desc : SVD) { + AllocaInst *AI = Desc.AI; Value *NewAllocaPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(LocalStackBase, - ConstantInt::get(IntptrTy, SVD[i].Offset)), + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)), AI->getType()); replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB); AI->replaceAllUsesWith(NewAllocaPtr); @@ -1645,8 +1627,7 @@ void FunctionStackPoisoner::poisonStack() { poisonRedZones(L.ShadowBytes, IRB, ShadowBase, true); // (Un)poison the stack before all ret instructions. - for (size_t i = 0, n = RetVec.size(); i < n; i++) { - Instruction *Ret = RetVec[i]; + for (auto Ret : RetVec) { IRBuilder<> IRBRet(Ret); // Mark the current frame as retired. IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), @@ -1700,8 +1681,8 @@ void FunctionStackPoisoner::poisonStack() { } // We are done. Remove the old unused alloca instructions. 
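The stack poisoning above replaces each alloca with a fixed offset into one combined (and possibly heap-backed) frame, with redzones between the slots. A plain C++ picture of the base+offset rewrite; field names follow the ASanStackVariableDescription fields visible in the hunk, the rest is illustrative:

#include <cstddef>
#include <cstdint>

struct StackVarSlot {
  size_t Size;      // getAllocaSizeInBytes(AI)
  size_t Alignment; // AI->getAlignment()
  size_t Offset;    // assigned by the stack frame layout computation
};

static inline void *slotAddress(uintptr_t LocalStackBase,
                                const StackVarSlot &Desc) {
  // Corresponds to IntToPtr(Add(LocalStackBase, Desc.Offset)) in the pass.
  return reinterpret_cast<void *>(LocalStackBase + Desc.Offset);
}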
- for (size_t i = 0, n = AllocaVec.size(); i < n; i++) - AllocaVec[i]->eraseFromParent(); + for (auto AI : AllocaVec) + AI->eraseFromParent(); } void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp index 18bda1a3208f..1bfef574ddb9 100644 --- a/lib/Transforms/Instrumentation/DebugIR.cpp +++ b/lib/Transforms/Instrumentation/DebugIR.cpp @@ -352,6 +352,7 @@ class DIUpdater : public InstVisitor { } std::string getTypeName(Type *T) { + assert(T != nullptr && "Expecting non-null Type"); std::string TypeName; raw_string_ostream TypeStream(TypeName); T->print(TypeStream); diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 8330a9bc3351..cfeb62eb1f9f 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -211,6 +211,7 @@ namespace { class GCOVLines : public GCOVRecord { public: void addLine(uint32_t Line) { + assert(Line != 0 && "Line zero is not a valid real line number."); Lines.push_back(Line); } @@ -453,10 +454,17 @@ static bool functionHasLines(Function *F) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa(I)) continue; + const DebugLoc &Loc = I->getDebugLoc(); if (Loc.isUnknown()) continue; - if (Loc.getLine() != 0) - return true; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0) continue; + + return true; } } return false; @@ -515,8 +523,16 @@ void GCOVProfiler::emitProfileNotes() { uint32_t Line = 0; for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa(I)) continue; + const DebugLoc &Loc = I->getDebugLoc(); if (Loc.isUnknown()) continue; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0) continue; + if (Line == Loc.getLine()) continue; Line = Loc.getLine(); if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue; diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b8e632ead10a..75c56c2d4300 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -10,8 +10,6 @@ /// This file is a part of MemorySanitizer, a detector of uninitialized /// reads. /// -/// Status: early prototype. -/// /// The algorithm of the tool is similar to Memcheck /// (http://goo.gl/QKbem). 
We associate a few shadow bits with every /// byte of the application memory, poison the shadow of the malloc-ed @@ -117,7 +115,6 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Transforms/Utils/SpecialCaseList.h" using namespace llvm; @@ -178,10 +175,6 @@ static cl::opt ClDumpStrictInstructions("msan-dump-strict-instructions", cl::desc("print out instructions with default strict semantics"), cl::Hidden, cl::init(false)); -static cl::opt ClBlacklistFile("msan-blacklist", - cl::desc("File containing the list of functions where MemorySanitizer " - "should not report bugs"), cl::Hidden); - static cl::opt ClInstrumentationWithCallThreshold( "msan-instrumentation-with-call-threshold", cl::desc( @@ -211,13 +204,11 @@ namespace { /// uninitialized reads. class MemorySanitizer : public FunctionPass { public: - MemorySanitizer(int TrackOrigins = 0, - StringRef BlacklistFile = StringRef()) + MemorySanitizer(int TrackOrigins = 0) : FunctionPass(ID), TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)), DL(nullptr), WarningFn(nullptr), - BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile), WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {} const char *getPassName() const override { return "MemorySanitizer"; } bool runOnFunction(Function &F) override; @@ -282,10 +273,6 @@ class MemorySanitizer : public FunctionPass { MDNode *ColdCallWeights; /// \brief Branch weights for origin store. MDNode *OriginStoreWeights; - /// \brief Path to blacklist file. - SmallString<64> BlacklistFile; - /// \brief The blacklist. - std::unique_ptr BL; /// \brief An empty volatile inline asm that prevents callback merge. InlineAsm *EmptyAsm; @@ -305,9 +292,8 @@ INITIALIZE_PASS(MemorySanitizer, "msan", "MemorySanitizer: detects uninitialized reads.", false, false) -FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins, - StringRef BlacklistFile) { - return new MemorySanitizer(TrackOrigins, BlacklistFile); +FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins) { + return new MemorySanitizer(TrackOrigins); } /// \brief Create a non-const global initialized with the given string. @@ -431,7 +417,6 @@ bool MemorySanitizer::doInitialization(Module &M) { report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); C = &(M.getContext()); unsigned PtrSize = DL->getPointerSizeInBits(/* AddressSpace */0); switch (PtrSize) { @@ -544,9 +529,8 @@ struct MemorySanitizerVisitor : public InstVisitor { MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { - bool SanitizeFunction = !MS.BL->isIn(F) && F.getAttributes().hasAttribute( - AttributeSet::FunctionIndex, - Attribute::SanitizeMemory); + bool SanitizeFunction = F.getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::SanitizeMemory); InsertChecks = SanitizeFunction; LoadShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; @@ -599,26 +583,26 @@ struct MemorySanitizerVisitor : public InstVisitor { } void materializeStores(bool InstrumentWithCalls) { - for (size_t i = 0, n = StoreList.size(); i < n; i++) { - StoreInst &I = *dyn_cast(StoreList[i]); + for (auto Inst : StoreList) { + StoreInst &SI = *dyn_cast(Inst); - IRBuilder<> IRB(&I); - Value *Val = I.getValueOperand(); - Value *Addr = I.getPointerOperand(); - Value *Shadow = I.isAtomic() ? 
getCleanShadow(Val) : getShadow(Val); + IRBuilder<> IRB(&SI); + Value *Val = SI.getValueOperand(); + Value *Addr = SI.getPointerOperand(); + Value *Shadow = SI.isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); StoreInst *NewSI = - IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); + IRB.CreateAlignedStore(Shadow, ShadowPtr, SI.getAlignment()); DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; - if (ClCheckAccessAddress) insertShadowCheck(Addr, &I); + if (ClCheckAccessAddress) insertShadowCheck(Addr, &SI); - if (I.isAtomic()) I.setOrdering(addReleaseOrdering(I.getOrdering())); + if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering())); if (MS.TrackOrigins) { - unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); + unsigned Alignment = std::max(kMinOriginAlignment, SI.getAlignment()); storeOrigin(IRB, Addr, Shadow, getOrigin(Val), Alignment, InstrumentWithCalls); } @@ -662,18 +646,17 @@ struct MemorySanitizerVisitor : public InstVisitor { } void materializeChecks(bool InstrumentWithCalls) { - for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { - Instruction *OrigIns = InstrumentationList[i].OrigIns; - Value *Shadow = InstrumentationList[i].Shadow; - Value *Origin = InstrumentationList[i].Origin; + for (const auto &ShadowData : InstrumentationList) { + Instruction *OrigIns = ShadowData.OrigIns; + Value *Shadow = ShadowData.Shadow; + Value *Origin = ShadowData.Origin; materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls); } DEBUG(dbgs() << "DONE:\n" << F); } void materializeIndirectCalls() { - for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) { - CallSite CS = IndirectCallList[i]; + for (auto &CS : IndirectCallList) { Instruction *I = CS.getInstruction(); BasicBlock *B = I->getParent(); IRBuilder<> IRB(I); @@ -732,8 +715,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // Finalize PHI nodes. - for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) { - PHINode *PN = ShadowPHINodes[i]; + for (PHINode *PN : ShadowPHINodes) { PHINode *PNS = cast(getShadow(PN)); PHINode *PNO = MS.TrackOrigins ? cast(getOrigin(PN)) : nullptr; size_t NumValues = PN->getNumIncomingValues(); @@ -950,22 +932,21 @@ struct MemorySanitizerVisitor : public InstVisitor { Function *F = A->getParent(); IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI()); unsigned ArgOffset = 0; - for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - if (!AI->getType()->isSized()) { + for (auto &FArg : F->args()) { + if (!FArg.getType()->isSized()) { DEBUG(dbgs() << "Arg is not sized\n"); continue; } - unsigned Size = AI->hasByValAttr() - ? MS.DL->getTypeAllocSize(AI->getType()->getPointerElementType()) - : MS.DL->getTypeAllocSize(AI->getType()); - if (A == AI) { - Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset); - if (AI->hasByValAttr()) { + unsigned Size = FArg.hasByValAttr() + ? MS.DL->getTypeAllocSize(FArg.getType()->getPointerElementType()) + : MS.DL->getTypeAllocSize(FArg.getType()); + if (A == &FArg) { + Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); + if (FArg.hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. 
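The byval-argument handling that continues below copies the argument's shadow with a carefully chosen alignment: an unannotated parameter falls back to the ABI alignment of its pointee, and the copy may not assume more than the shadow TLS alignment. A standalone model of that choice; the TLS alignment value is an assumption of this sketch, not taken from the patch:

#include <algorithm>

static const unsigned kShadowTLSAlignmentGuess = 8; // assumed for the sketch

static unsigned byvalShadowCopyAlign(unsigned ParamAlign,
                                     unsigned ABITypeAlign) {
  unsigned ArgAlign = ParamAlign ? ParamAlign : ABITypeAlign;
  return std::min(ArgAlign, kShadowTLSAlignmentGuess);
}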
- unsigned ArgAlign = AI->getParamAlignment(); + unsigned ArgAlign = FArg.getParamAlignment(); if (ArgAlign == 0) { Type *EltType = A->getType()->getPointerElementType(); ArgAlign = MS.DL->getABITypeAlignment(EltType); @@ -980,10 +961,11 @@ struct MemorySanitizerVisitor : public InstVisitor { } else { *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment); } - DEBUG(dbgs() << " ARG: " << *AI << " ==> " << + DEBUG(dbgs() << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n"); if (MS.TrackOrigins) { - Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset); + Value *OriginPtr = + getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); } } @@ -1320,10 +1302,14 @@ struct MemorySanitizerVisitor : public InstVisitor { if (!Origin) { Origin = OpOrigin; } else { - Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); - Value *Cond = IRB.CreateICmpNE(FlatShadow, - MSV->getCleanShadow(FlatShadow)); - Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + Constant *ConstOrigin = dyn_cast(OpOrigin); + // No point in adding something that might result in 0 origin value. + if (!ConstOrigin || !ConstOrigin->isNullValue()) { + Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); + Value *Cond = + IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow)); + Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + } } } return *this; @@ -1411,13 +1397,61 @@ struct MemorySanitizerVisitor : public InstVisitor { SC.Done(&I); } + // \brief Handle multiplication by constant. + // + // Handle a special case of multiplication by constant that may have one or + // more zeros in the lower bits. This makes corresponding number of lower bits + // of the result zero as well. We model it by shifting the other operand + // shadow left by the required number of bits. Effectively, we transform + // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B). + // We use multiplication by 2**N instead of shift to cover the case of + // multiplication by 0, which may occur in some elements of a vector operand. 
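The multiply-by-constant shadow rule described in the comment above (and implemented just below) can be stated in scalar form: a constant with B trailing zero bits forces the low B bits of the product to zero, so the shadow of the other operand is multiplied by 2**B. A standalone model:

#include <cstdint>

static uint64_t mulByConstShadow(uint64_t OtherShadow, uint64_t ConstArg) {
  unsigned B = ConstArg ? __builtin_ctzll(ConstArg) : 64;   // trailing zeros
  uint64_t PowerOfTwo = B >= 64 ? 0 : (uint64_t{1} << B);   // 2**B
  // Multiplication (rather than a shift) also covers ConstArg == 0, where the
  // result is fully defined and the shadow collapses to 0.
  return OtherShadow * PowerOfTwo;
}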
+ void handleMulByConstant(BinaryOperator &I, Constant *ConstArg, + Value *OtherArg) { + Constant *ShadowMul; + Type *Ty = ConstArg->getType(); + if (Ty->isVectorTy()) { + unsigned NumElements = Ty->getVectorNumElements(); + Type *EltTy = Ty->getSequentialElementType(); + SmallVector Elements; + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + ConstantInt *Elt = + dyn_cast(ConstArg->getAggregateElement(Idx)); + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + Elements.push_back(ConstantInt::get(EltTy, V2)); + } + ShadowMul = ConstantVector::get(Elements); + } else { + ConstantInt *Elt = dyn_cast(ConstArg); + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + ShadowMul = ConstantInt::get(Elt->getType(), V2); + } + + IRBuilder<> IRB(&I); + setShadow(&I, + IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst")); + setOrigin(&I, getOrigin(OtherArg)); + } + + void visitMul(BinaryOperator &I) { + Constant *constOp0 = dyn_cast(I.getOperand(0)); + Constant *constOp1 = dyn_cast(I.getOperand(1)); + if (constOp0 && !constOp1) + handleMulByConstant(I, constOp0, I.getOperand(1)); + else if (constOp1 && !constOp0) + handleMulByConstant(I, constOp1, I.getOperand(0)); + else + handleShadowOr(I); + } + void visitFAdd(BinaryOperator &I) { handleShadowOr(I); } void visitFSub(BinaryOperator &I) { handleShadowOr(I); } void visitFMul(BinaryOperator &I) { handleShadowOr(I); } void visitAdd(BinaryOperator &I) { handleShadowOr(I); } void visitSub(BinaryOperator &I) { handleShadowOr(I); } void visitXor(BinaryOperator &I) { handleShadowOr(I); } - void visitMul(BinaryOperator &I) { handleShadowOr(I); } void handleDiv(Instruction &I) { IRBuilder<> IRB(&I); @@ -1946,6 +1980,120 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } + // \brief Get an X86_MMX-sized vector type. + Type *getMMXVectorTy(unsigned EltSizeInBits) { + const unsigned X86_MMXSizeInBits = 64; + return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits), + X86_MMXSizeInBits / EltSizeInBits); + } + + // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack + // intrinsic. + Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) { + switch (id) { + case llvm::Intrinsic::x86_sse2_packsswb_128: + case llvm::Intrinsic::x86_sse2_packuswb_128: + return llvm::Intrinsic::x86_sse2_packsswb_128; + + case llvm::Intrinsic::x86_sse2_packssdw_128: + case llvm::Intrinsic::x86_sse41_packusdw: + return llvm::Intrinsic::x86_sse2_packssdw_128; + + case llvm::Intrinsic::x86_avx2_packsswb: + case llvm::Intrinsic::x86_avx2_packuswb: + return llvm::Intrinsic::x86_avx2_packsswb; + + case llvm::Intrinsic::x86_avx2_packssdw: + case llvm::Intrinsic::x86_avx2_packusdw: + return llvm::Intrinsic::x86_avx2_packssdw; + + case llvm::Intrinsic::x86_mmx_packsswb: + case llvm::Intrinsic::x86_mmx_packuswb: + return llvm::Intrinsic::x86_mmx_packsswb; + + case llvm::Intrinsic::x86_mmx_packssdw: + return llvm::Intrinsic::x86_mmx_packssdw; + default: + llvm_unreachable("unexpected intrinsic id"); + } + } + + // \brief Instrument vector pack instrinsic. + // + // This function instruments intrinsics like x86_mmx_packsswb, that + // packs elements of 2 input vectors into half as many bits with saturation. + // Shadow is propagated with the signed variant of the same intrinsic applied + // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer). + // EltSizeInBits is used only for x86mmx arguments. 
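Per lane, the pack-intrinsic shadow rule documented above (and implemented just below) reduces to: an output lane is fully poisoned iff any bit of the corresponding input lane's shadow is set, because saturation makes bit-precise propagation meaningless. A scalar model for the 16-to-8-bit case:

#include <cstdint>

static int8_t packLaneShadow(int16_t InputLaneShadow) {
  // sext(icmp ne S, 0): all ones if any shadow bit is set, all zeros otherwise.
  return InputLaneShadow != 0 ? int8_t(-1) : int8_t(0);
}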
+ void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) { + assert(I.getNumArgOperands() == 2); + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + IRBuilder<> IRB(&I); + Value *S1 = getShadow(&I, 0); + Value *S2 = getShadow(&I, 1); + assert(isX86_MMX || S1->getType()->isVectorTy()); + + // SExt and ICmpNE below must apply to individual elements of input vectors. + // In case of x86mmx arguments, cast them to appropriate vector types and + // back. + Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType(); + if (isX86_MMX) { + S1 = IRB.CreateBitCast(S1, T); + S2 = IRB.CreateBitCast(S2, T); + } + Value *S1_ext = IRB.CreateSExt( + IRB.CreateICmpNE(S1, llvm::Constant::getNullValue(T)), T); + Value *S2_ext = IRB.CreateSExt( + IRB.CreateICmpNE(S2, llvm::Constant::getNullValue(T)), T); + if (isX86_MMX) { + Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C); + S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy); + S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy); + } + + Function *ShadowFn = Intrinsic::getDeclaration( + F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); + + Value *S = IRB.CreateCall2(ShadowFn, S1_ext, S2_ext, "_msprop_vector_pack"); + if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // \brief Instrument sum-of-absolute-differencies intrinsic. + void handleVectorSadIntrinsic(IntrinsicInst &I) { + const unsigned SignificantBitsPerResultElement = 16; + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType(); + unsigned ZeroBitsPerResultElement = + ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; + + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateLShr(S, ZeroBitsPerResultElement); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // \brief Instrument multiply-add intrinsic. + void handleVectorPmaddIntrinsic(IntrinsicInst &I, + unsigned EltSizeInBits = 0) { + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ? 
getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case llvm::Intrinsic::bswap: @@ -2062,6 +2210,47 @@ struct MemorySanitizerVisitor : public InstVisitor { // case llvm::Intrinsic::x86_sse2_psll_dq_bs: // case llvm::Intrinsic::x86_sse2_psrl_dq_bs: + case llvm::Intrinsic::x86_sse2_packsswb_128: + case llvm::Intrinsic::x86_sse2_packssdw_128: + case llvm::Intrinsic::x86_sse2_packuswb_128: + case llvm::Intrinsic::x86_sse41_packusdw: + case llvm::Intrinsic::x86_avx2_packsswb: + case llvm::Intrinsic::x86_avx2_packssdw: + case llvm::Intrinsic::x86_avx2_packuswb: + case llvm::Intrinsic::x86_avx2_packusdw: + handleVectorPackIntrinsic(I); + break; + + case llvm::Intrinsic::x86_mmx_packsswb: + case llvm::Intrinsic::x86_mmx_packuswb: + handleVectorPackIntrinsic(I, 16); + break; + + case llvm::Intrinsic::x86_mmx_packssdw: + handleVectorPackIntrinsic(I, 32); + break; + + case llvm::Intrinsic::x86_mmx_psad_bw: + case llvm::Intrinsic::x86_sse2_psad_bw: + case llvm::Intrinsic::x86_avx2_psad_bw: + handleVectorSadIntrinsic(I); + break; + + case llvm::Intrinsic::x86_sse2_pmadd_wd: + case llvm::Intrinsic::x86_avx2_pmadd_wd: + case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw_128: + case llvm::Intrinsic::x86_avx2_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I); + break; + + case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I, 8); + break; + + case llvm::Intrinsic::x86_mmx_pmadd_wd: + handleVectorPmaddIntrinsic(I, 16); + break; + default: if (!handleUnknownIntrinsic(I)) visitInstruction(I); diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8fe9bcae69dc..f3bc36f04cf0 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -46,8 +46,6 @@ using namespace llvm; #define DEBUG_TYPE "tsan" -static cl::opt ClBlacklistFile("tsan-blacklist", - cl::desc("Blacklist file"), cl::Hidden); static cl::opt ClInstrumentMemoryAccesses( "tsan-instrument-memory-accesses", cl::init(true), cl::desc("Instrument memory accesses"), cl::Hidden); @@ -76,11 +74,7 @@ namespace { /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { - ThreadSanitizer(StringRef BlacklistFile = StringRef()) - : FunctionPass(ID), - DL(nullptr), - BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile - : BlacklistFile) { } + ThreadSanitizer() : FunctionPass(ID), DL(nullptr) {} const char *getPassName() const override; bool runOnFunction(Function &F) override; bool doInitialization(Module &M) override; @@ -98,8 +92,6 @@ struct ThreadSanitizer : public FunctionPass { const DataLayout *DL; Type *IntptrTy; - SmallString<64> BlacklistFile; - std::unique_ptr BL; IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. 
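The psadbw shadow handling above follows the same "any poisoned input bit poisons the result element" idea, but only the low 16 bits of each result element can be non-zero, so the all-ones pattern is shifted down. A scalar model for the 64-bit-element (MMX) case:

#include <cstdint>

static uint64_t sadResultShadow(uint64_t Sa, uint64_t Sb) {
  const unsigned SignificantBits = 16;
  uint64_t AnyPoison = (Sa | Sb) != 0 ? ~uint64_t{0} : 0; // sext(icmp ne S, 0)
  return AnyPoison >> (64 - SignificantBits);             // keep the low 16 bits
}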
Function *TsanFuncEntry; @@ -129,8 +121,8 @@ const char *ThreadSanitizer::getPassName() const { return "ThreadSanitizer"; } -FunctionPass *llvm::createThreadSanitizerPass(StringRef BlacklistFile) { - return new ThreadSanitizer(BlacklistFile); +FunctionPass *llvm::createThreadSanitizerPass() { + return new ThreadSanitizer(); } static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { @@ -228,7 +220,6 @@ bool ThreadSanitizer::doInitialization(Module &M) { if (!DLP) report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); - BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); @@ -322,7 +313,6 @@ static bool isAtomic(Instruction *I) { bool ThreadSanitizer::runOnFunction(Function &F) { if (!DL) return false; - if (BL->isIn(F)) return false; initializeCallbacks(*F.getParent()); SmallVector RetVec; SmallVector AllLoadsAndStores; @@ -331,22 +321,20 @@ bool ThreadSanitizer::runOnFunction(Function &F) { SmallVector MemIntrinCalls; bool Res = false; bool HasCalls = false; + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread); // Traverse all instructions, collect loads/stores/returns, check for calls. - for (Function::iterator FI = F.begin(), FE = F.end(); - FI != FE; ++FI) { - BasicBlock &BB = *FI; - for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); - BI != BE; ++BI) { - if (isAtomic(BI)) - AtomicAccesses.push_back(BI); - else if (isa(BI) || isa(BI)) - LocalLoadsAndStores.push_back(BI); - else if (isa(BI)) - RetVec.push_back(BI); - else if (isa(BI) || isa(BI)) { - if (isa(BI)) - MemIntrinCalls.push_back(BI); + for (auto &BB : F) { + for (auto &Inst : BB) { + if (isAtomic(&Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa(Inst)) + RetVec.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) { + if (isa(Inst)) + MemIntrinCalls.push_back(&Inst); HasCalls = true; chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } @@ -358,21 +346,22 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // FIXME: many of these accesses do not need to be checked for races // (e.g. variables that do not escape, etc). - // Instrument memory accesses. - if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread)) - for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { - Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); + // Instrument memory accesses only if we want to report bugs in the function. + if (ClInstrumentMemoryAccesses && SanitizeFunction) + for (auto Inst : AllLoadsAndStores) { + Res |= instrumentLoadOrStore(Inst); } - // Instrument atomic memory accesses. + // Instrument atomic memory accesses in any case (they can be used to + // implement synchronization). if (ClInstrumentAtomics) - for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { - Res |= instrumentAtomic(AtomicAccesses[i]); + for (auto Inst : AtomicAccesses) { + Res |= instrumentAtomic(Inst); } - if (ClInstrumentMemIntrinsics) - for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) { - Res |= instrumentMemIntrinsic(MemIntrinCalls[i]); + if (ClInstrumentMemIntrinsics && SanitizeFunction) + for (auto Inst : MemIntrinCalls) { + Res |= instrumentMemIntrinsic(Inst); } // Instrument function entry/exit points if there were instrumented accesses. 
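The runOnFunction() changes below gate instrumentation on the sanitize_thread attribute: plain loads/stores and mem intrinsics are instrumented only in functions carrying the attribute, while atomics are always instrumented because they may implement synchronization that other, instrumented code relies on. A condensed view of that gating as a standalone predicate:

struct TsanGating {
  bool SanitizeFunction;   // F.hasFnAttribute(Attribute::SanitizeThread)
  bool ClMemoryAccesses;   // -tsan-instrument-memory-accesses
  bool ClAtomics;          // -tsan-instrument-atomics
  bool ClMemIntrinsics;    // -tsan-instrument-memintrinsics

  bool loadsAndStores() const { return ClMemoryAccesses && SanitizeFunction; }
  bool atomics() const { return ClAtomics; } // attribute-independent
  bool memIntrinsics() const { return ClMemIntrinsics && SanitizeFunction; }
};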
@@ -382,8 +371,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) { Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); - for (size_t i = 0, n = RetVec.size(); i < n; ++i) { - IRBuilder<> IRBRet(RetVec[i]); + for (auto RetInst : RetVec) { + IRBuilder<> IRBRet(RetInst); IRBRet.CreateCall(TsanFuncExit); } Res = true; @@ -543,8 +532,14 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false), createOrdering(&IRB, CASI->getSuccessOrdering()), createOrdering(&IRB, CASI->getFailureOrdering())}; - CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef(Args)); - ReplaceInstWithInst(I, C); + CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args); + Value *Success = IRB.CreateICmpEQ(C, CASI->getCompareOperand()); + + Value *Res = IRB.CreateInsertValue(UndefValue::get(CASI->getType()), C, 0); + Res = IRB.CreateInsertValue(Res, Success, 1); + + I->replaceAllUsesWith(Res); + I->eraseFromParent(); } else if (FenceInst *FI = dyn_cast(I)) { Value *Args[] = {createOrdering(&IRB, FI->getOrdering())}; Function *F = FI->getSynchScope() == SingleThread ? diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 3ad1488d00a3..2dcfa237ca33 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -8,10 +8,10 @@ add_llvm_library(LLVMScalarOpts EarlyCSE.cpp FlattenCFGPass.cpp GVN.cpp - GlobalMerge.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp + LoadCombine.cpp LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 6d07dddd3e5e..56781d44aaa0 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1464,6 +1464,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, continue; } + // Loading from calloc (which zero initializes memory) -> zero + if (isCallocLikeFn(DepInst, TLI)) { + ValuesPerBlock.push_back(AvailableValueInBlock::get( + DepBB, Constant::getNullValue(LI->getType()))); + continue; + } + if (StoreInst *S = dyn_cast(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. @@ -1988,6 +1995,15 @@ bool GVN::processLoad(LoadInst *L) { } } + // If this load follows a calloc (which zero initializes memory), + // then the loaded value is zero + if (isCallocLikeFn(DepInst, TLI)) { + L->replaceAllUsesWith(Constant::getNullValue(L->getType())); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + return false; } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 230a381593e0..e501ff29d038 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -158,6 +158,9 @@ bool JumpThreading::runOnFunction(Function &F) { TLI = &getAnalysis(); LVI = &getAnalysis(); + // Remove unreachable blocks from function as they may result in infinite loop. 
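The GVN change above rests on a simple semantic fact: memory returned by calloc reads as zero until it is stored to. An illustrative source pattern that the pass can now fold (assuming TLI recognizes calloc for the target):

#include <cstdlib>

int firstElementIsZero() {
  int *p = static_cast<int *>(calloc(16, sizeof(int)));
  if (!p)
    return 0;
  int v = p[3]; // no intervening store: GVN can replace this load with 0
  free(p);
  return v;
}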
+ removeUnreachableBlocks(F); + FindLoopHeaders(F); bool Changed, EverChanged = false; diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp new file mode 100644 index 000000000000..846aa703c9c3 --- /dev/null +++ b/lib/Transforms/Scalar/LoadCombine.cpp @@ -0,0 +1,268 @@ +//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation combines adjacent loads. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Pass.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "load-combine" + +STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); +STATISTIC(NumLoadsCombined, "Number of loads combined"); + +namespace { +struct PointerOffsetPair { + Value *Pointer; + uint64_t Offset; +}; + +struct LoadPOPPair { + LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O) + : Load(L), POP(P), InsertOrder(O) {} + LoadPOPPair() {} + LoadInst *Load; + PointerOffsetPair POP; + /// \brief The new load needs to be created before the first load in IR order. + unsigned InsertOrder; +}; + +class LoadCombine : public BasicBlockPass { + LLVMContext *C; + const DataLayout *DL; + +public: + LoadCombine() + : BasicBlockPass(ID), + C(nullptr), DL(nullptr) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool doInitialization(Function &) override; + bool runOnBasicBlock(BasicBlock &BB) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { return "LoadCombine"; } + static char ID; + + typedef IRBuilder BuilderTy; + +private: + BuilderTy *Builder; + + PointerOffsetPair getPointerOffsetPair(LoadInst &); + bool combineLoads(DenseMap> &); + bool aggregateLoads(SmallVectorImpl &); + bool combineLoads(SmallVectorImpl &); +}; +} + +bool LoadCombine::doInitialization(Function &F) { + DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); + C = &F.getContext(); + DataLayoutPass *DLP = getAnalysisIfAvailable(); + if (!DLP) { + DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n"); + return false; + } + DL = &DLP->getDataLayout(); + return true; +} + +PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { + PointerOffsetPair POP; + POP.Pointer = LI.getPointerOperand(); + POP.Offset = 0; + while (isa(POP.Pointer) || isa(POP.Pointer)) { + if (auto *GEP = dyn_cast(POP.Pointer)) { + unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType()); + APInt Offset(BitWidth, 0); + if (GEP->accumulateConstantOffset(*DL, Offset)) + POP.Offset += Offset.getZExtValue(); + else + // Can't handle GEPs with variable indices. 
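The new LoadCombine pass (getPointerOffsetPair above, aggregateLoads below) groups loads by reducing each one to a (base pointer, constant byte offset) pair, stripping constant-index GEPs and bitcasts, then extends a sorted run only when a load begins exactly where the previous one ended. A standalone model of those two tests:

#include <cstdint>

struct PointerOffset {
  const void *Base; // pointer after stripping GEP/bitcast chains
  uint64_t Offset;  // accumulated constant GEP offset in bytes
};

static bool sameGroup(const PointerOffset &A, const PointerOffset &B) {
  return A.Base == B.Base; // candidates must share the underlying pointer
}

static bool isAdjacent(uint64_t PrevOffset, uint64_t PrevSizeInBytes,
                       uint64_t NextOffset) {
  return NextOffset == PrevOffset + PrevSizeInBytes;
}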
+ return POP; + POP.Pointer = GEP->getPointerOperand(); + } else if (auto *BC = dyn_cast(POP.Pointer)) + POP.Pointer = BC->getOperand(0); + } + return POP; +} + +bool LoadCombine::combineLoads( + DenseMap> &LoadMap) { + bool Combined = false; + for (auto &Loads : LoadMap) { + if (Loads.second.size() < 2) + continue; + std::sort(Loads.second.begin(), Loads.second.end(), + [](const LoadPOPPair &A, const LoadPOPPair &B) { + return A.POP.Offset < B.POP.Offset; + }); + if (aggregateLoads(Loads.second)) + Combined = true; + } + return Combined; +} + +/// \brief Try to aggregate loads from a sorted list of loads to be combined. +/// +/// It is guaranteed that no writes occur between any of the loads. All loads +/// have the same base pointer. There are at least two loads. +bool LoadCombine::aggregateLoads(SmallVectorImpl &Loads) { + assert(Loads.size() >= 2 && "Insufficient loads!"); + LoadInst *BaseLoad = nullptr; + SmallVector AggregateLoads; + bool Combined = false; + uint64_t PrevOffset = -1ull; + uint64_t PrevSize = 0; + for (auto &L : Loads) { + if (PrevOffset == -1ull) { + BaseLoad = L.Load; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + continue; + } + if (L.Load->getAlignment() > BaseLoad->getAlignment()) + continue; + if (L.POP.Offset > PrevOffset + PrevSize) { + // No other load will be combinable + if (combineLoads(AggregateLoads)) + Combined = true; + AggregateLoads.clear(); + PrevOffset = -1; + continue; + } + if (L.POP.Offset != PrevOffset + PrevSize) + // This load is offset less than the size of the last load. + // FIXME: We may want to handle this case. + continue; + PrevOffset = L.POP.Offset; + PrevSize = DL->getTypeStoreSize(L.Load->getType()); + AggregateLoads.push_back(L); + } + if (combineLoads(AggregateLoads)) + Combined = true; + return Combined; +} + +/// \brief Given a list of combinable load. Combine the maximum number of them. +bool LoadCombine::combineLoads(SmallVectorImpl &Loads) { + // Remove loads from the end while the size is not a power of 2. + unsigned TotalSize = 0; + for (const auto &L : Loads) + TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); + while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) + TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); + if (Loads.size() < 2) + return false; + + DEBUG({ + dbgs() << "***** Combining Loads ******\n"; + for (const auto &L : Loads) { + dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; + } + }); + + // Find first load. This is where we put the new load. 
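The width trimming in combineLoads() above drops loads from the end of a run until the combined width is a power of two, then gives up if fewer than two loads remain. A compact model of that loop:

#include <vector>

static bool isPow2(unsigned X) { return X && (X & (X - 1)) == 0; }

static bool trimToPowerOfTwo(std::vector<unsigned> &BitWidths) {
  unsigned Total = 0;
  for (unsigned W : BitWidths)
    Total += W;
  while (Total != 0 && !isPow2(Total)) {
    Total -= BitWidths.back();
    BitWidths.pop_back();
  }
  return BitWidths.size() >= 2; // otherwise combining is not worthwhile
}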
+ LoadPOPPair FirstLP; + FirstLP.InsertOrder = -1u; + for (const auto &L : Loads) + if (L.InsertOrder < FirstLP.InsertOrder) + FirstLP = L; + + unsigned AddressSpace = + FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); + + Builder->SetInsertPoint(FirstLP.Load); + Value *Ptr = Builder->CreateConstGEP1_64( + Builder->CreatePointerCast(Loads[0].POP.Pointer, + Builder->getInt8PtrTy(AddressSpace)), + Loads[0].POP.Offset); + LoadInst *NewLoad = new LoadInst( + Builder->CreatePointerCast( + Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), + Ptr->getType()->getPointerAddressSpace())), + Twine(Loads[0].Load->getName()) + ".combined", false, + Loads[0].Load->getAlignment(), FirstLP.Load); + + for (const auto &L : Loads) { + Builder->SetInsertPoint(L.Load); + Value *V = Builder->CreateExtractInteger( + *DL, NewLoad, cast(L.Load->getType()), + L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); + L.Load->replaceAllUsesWith(V); + } + + NumLoadsCombined = NumLoadsCombined + Loads.size(); + return true; +} + +bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { + if (skipOptnoneFunction(BB) || !DL) + return false; + + IRBuilder + TheBuilder(BB.getContext(), TargetFolder(DL)); + Builder = &TheBuilder; + + DenseMap> LoadMap; + + bool Combined = false; + unsigned Index = 0; + for (auto &I : BB) { + if (I.mayWriteToMemory() || I.mayThrow()) { + if (combineLoads(LoadMap)) + Combined = true; + LoadMap.clear(); + continue; + } + LoadInst *LI = dyn_cast(&I); + if (!LI) + continue; + ++NumLoadsAnalyzed; + if (!LI->isSimple() || !LI->getType()->isIntegerTy()) + continue; + auto POP = getPointerOffsetPair(*LI); + if (!POP.Pointer) + continue; + LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++)); + } + if (combineLoads(LoadMap)) + Combined = true; + return Combined; +} + +void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +char LoadCombine::ID = 0; + +BasicBlockPass *llvm::createLoadCombinePass() { + return new LoadCombine(); +} + +INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false, + false) diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 26a83dfdc3c0..a12f5a7a0334 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -112,7 +112,7 @@ namespace { /// the variable involved in the comparion is returned. This function will /// be called to see if the precondition and postcondition of the loop /// are in desirable form. - Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; + Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; /// Return true iff the idiom is detected in the loop. and 1) \p CntInst /// is set to the instruction counting the population bit. 2) \p CntPhi @@ -122,7 +122,7 @@ namespace { (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); + void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); /// Create llvm.ctpop.* intrinsic function. 
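For context on the LoopIdiomRecognize declarations touched above (CntInst/CntPhi and createPopcntIntrinsic): the loop shape the pass converts to llvm.ctpop.* is the classic clear-lowest-set-bit population count. A source-level example of the idiom, not taken from the patch:

static int popcountLoop(unsigned long long X) {
  int Cnt = 0;
  while (X) {
    X &= X - 1; // clear the lowest set bit
    ++Cnt;
  }
  return Cnt;
}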
CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 5c747e1ac518..2ce58314f8ef 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -32,7 +33,9 @@ using namespace llvm; #define DEBUG_TYPE "loop-rotate" -#define MAX_HEADER_SIZE 16 +static cl::opt +DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, + cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); namespace { @@ -40,8 +43,12 @@ namespace { class LoopRotate : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopRotate() : LoopPass(ID) { + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); } // LCSSA form makes instruction renaming easier. @@ -62,6 +69,7 @@ namespace { bool rotateLoop(Loop *L, bool SimplifiedLatch); private: + unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; }; @@ -75,7 +83,9 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} /// Rotate Loop L as many times as possible. Return true if /// the loop is rotated at least once. @@ -320,7 +330,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { << " instructions: "; L->dump()); return false; } - if (Metrics.NumInsts > MAX_HEADER_SIZE) + if (Metrics.NumInsts > MaxHeaderSize) return false; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 13e4fceec664..914b56aa8167 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -238,7 +238,15 @@ struct Formula { int64_t Scale; /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, + /// non-empty. The canonical representation of a formula is + /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and + /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). + /// #1 enforces that the scaled register is always used when at least two + /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2. + /// #2 enforces that 1 * reg is reg. + /// This invariant can be temporarly broken while building a formula. + /// However, every formula inserted into the LSRInstance must be in canonical + /// form. SmallVector BaseRegs; /// ScaledReg - The 'scaled' register for this use. 
This should be non-null @@ -256,6 +264,12 @@ struct Formula { void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); + bool isCanonical() const; + + void Canonicalize(); + + bool Unscale(); + size_t getNumRegs() const; Type *getType() const; @@ -346,6 +360,52 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } + Canonicalize(); +} + +/// \brief Check whether or not this formula statisfies the canonical +/// representation. +/// \see Formula::BaseRegs. +bool Formula::isCanonical() const { + if (ScaledReg) + return Scale != 1 || !BaseRegs.empty(); + return BaseRegs.size() <= 1; +} + +/// \brief Helper method to morph a formula into its canonical representation. +/// \see Formula::BaseRegs. +/// Every formula having more than one base register, must use the ScaledReg +/// field. Otherwise, we would have to do special cases everywhere in LSR +/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... +/// On the other hand, 1*reg should be canonicalized into reg. +void Formula::Canonicalize() { + if (isCanonical()) + return; + // So far we did not need this case. This is easy to implement but it is + // useless to maintain dead code. Beside it could hurt compile time. + assert(!BaseRegs.empty() && "1*reg => reg, should not be needed."); + // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg. + ScaledReg = BaseRegs.back(); + BaseRegs.pop_back(); + Scale = 1; + size_t BaseRegsSize = BaseRegs.size(); + size_t Try = 0; + // If ScaledReg is an invariant, try to find a variant expression. + while (Try < BaseRegsSize && !isa(ScaledReg)) + std::swap(ScaledReg, BaseRegs[Try++]); +} + +/// \brief Get rid of the scale in the formula. +/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2. +/// \return true if it was possible to get rid of the scale, false otherwise. +/// \note After this operation the formula may not be in the canonical form. +bool Formula::Unscale() { + if (Scale != 1) + return false; + Scale = 0; + BaseRegs.push_back(ScaledReg); + ScaledReg = nullptr; + return true; } /// getNumRegs - Return the total number of register operands used by this @@ -776,9 +836,18 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl &DeadInsts) { namespace { class LSRUse; } -// Check if it is legal to fold 2 base registers. -static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F); + +/// \brief Check if the addressing mode defined by \p F is completely +/// folded in \p LU at isel time. +/// This includes address-mode folding and special icmp tricks. +/// This function returns true if \p LU can accommodate what \p F +/// defines and up to 1 base + 1 scaled + offset. +/// In other words, if \p F has several base registers, this function may +/// still return true. Therefore, users still need to account for +/// additional base registers and/or unfolded offsets to derive an +/// accurate cost model. +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); @@ -922,6 +991,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, SmallPtrSet *LoserRegs) { + assert(F.isCanonical() && "Cost is accurate only for canonical formula"); // Tally up the registers. 
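The canonical-form invariant documented above (more than one base register forces use of ScaledReg, and Scale == 1 with an otherwise empty formula must collapse back to a plain register) is easy to lose track of. Below is a small standalone sketch that mirrors isCanonical()/Canonicalize()/Unscale(), with string names standing in for SCEV registers; it is an illustration, not the in-tree Formula class:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Simplified stand-in for LSR's Formula: registers are just names here.
struct FormulaSketch {
  std::vector<std::string> BaseRegs;
  std::string ScaledReg; // empty means "no scaled register"
  int64_t Scale = 0;

  // Rule 1: more than one base register requires a scaled register.
  // Rule 2: a scaled register with Scale == 1 must be accompanied by at
  //         least one base register, otherwise 1*reg should just be reg.
  bool isCanonical() const {
    if (!ScaledReg.empty())
      return Scale != 1 || !BaseRegs.empty();
    return BaseRegs.size() <= 1;
  }

  // reg1 + reg2  ==>  reg1 + 1*reg2
  void canonicalize() {
    if (isCanonical())
      return;
    ScaledReg = BaseRegs.back();
    BaseRegs.pop_back();
    Scale = 1;
  }

  // reg1 + 1*reg2  ==>  reg1 + reg2 (may leave the sketch non-canonical,
  // just like Formula::Unscale).
  bool unscale() {
    if (Scale != 1)
      return false;
    Scale = 0;
    BaseRegs.push_back(ScaledReg);
    ScaledReg.clear();
    return true;
  }
};

int main() {
  FormulaSketch F;
  F.BaseRegs = {"reg1", "reg2"};
  assert(!F.isCanonical());          // two base regs, no scaled reg
  F.canonicalize();                  // reg1 + 1*reg2
  assert(F.isCanonical() && F.Scale == 1 && F.ScaledReg == "reg2");
  F.unscale();                       // back to reg1 + reg2
  assert(!F.isCanonical());
}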
if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { @@ -945,11 +1015,13 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, } // Determine how many (unfolded) adds we'll need inside the loop. - size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + size_t NumBaseParts = F.getNumRegs(); if (NumBaseParts > 1) // Do not count the base and a possible second register if the target // allows to fold 2 registers. - NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + NumBaseAdds += + NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F))); + NumBaseAdds += (F.UnfoldedOffset != 0); // Accumulate non-free scaling amounts. ScaleCost += getScalingFactorCost(TTI, LU, F); @@ -1210,7 +1282,10 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. +/// The formula must be in canonical form. bool LSRUse::InsertFormula(const Formula &F) { + assert(F.isCanonical() && "Invalid canonical representation"); + if (!Formulae.empty() && RigidFormula) return false; @@ -1236,6 +1311,8 @@ bool LSRUse::InsertFormula(const Formula &F) { // Record registers now being used by this use. Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + if (F.ScaledReg) + Regs.insert(F.ScaledReg); return true; } @@ -1302,12 +1379,10 @@ void LSRUse::dump() const { } #endif -/// isLegalUse - Test whether the use described by AM is "legal", meaning it can -/// be completely folded into the user instruction at isel time. This includes -/// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, - Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); @@ -1358,10 +1433,11 @@ static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { // Check for overflow. 
if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0)) @@ -1372,9 +1448,41 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, return false; MaxOffset = (uint64_t)BaseOffset + MaxOffset; - return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, - Scale) && - isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, + HasBaseReg, Scale) && + isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset, + HasBaseReg, Scale); +} + +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + const Formula &F) { + // For the purpose of isAMCompletelyFolded either having a canonical formula + // or a scale not equal to zero is correct. + // Problems may arise from non canonical formulae having a scale == 0. + // Strictly speaking it would best to just rely on canonical formulae. + // However, when we generate the scaled formulae, we first check that the + // scaling factor is profitable before computing the actual ScaledReg for + // compile time sake. + assert((F.isCanonical() || F.Scale != 0)); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); +} + +/// isLegalUse - Test whether we know how to expand the current formula. +static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, + int64_t Scale) { + // We know how to expand completely foldable formulae. + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale) || + // Or formulae that use a base register produced by a sum of base + // registers. + (Scale == 1 && + isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + BaseGV, BaseOffset, true, 0)); } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, @@ -1384,36 +1492,23 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } -static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F) { - // If F is used as an Addressing Mode, it may fold one Base plus one - // scaled register. If the scaled register is nil, do as if another - // element of the base regs is a 1-scaled register. - // This is possible if BaseRegs has at least 2 registers. - - // If this is not an address calculation, this is not an addressing mode - // use. - if (LU.Kind != LSRUse::Address) - return false; - - // F is already scaled. - if (F.Scale != 0) - return false; - - // We need to keep one register for the base and one to scale. 
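The renamed isAMCompletelyFolded and the relaxed isLegalUse above draw a deliberate line: a formula is still considered expandable when it does not fold outright but has Scale == 1 and would fold if its scaled register were treated as one more base register; the extra register is then charged in the cost model instead of the formula being rejected. A rough sketch of that predicate, with a made-up target query in place of TargetTransformInfo:

#include <cassert>
#include <cstdint>

// Hypothetical target query: say the target only folds [base + offset]
// and has no scaled-index addressing at all.
static bool targetFoldsAM(bool HasBaseReg, int64_t Scale, int64_t Offset) {
  return HasBaseReg && Scale == 0 && Offset >= -4096 && Offset < 4096;
}

static bool isAMCompletelyFoldedSketch(bool HasBaseReg, int64_t Scale,
                                       int64_t Offset) {
  return targetFoldsAM(HasBaseReg, Scale, Offset);
}

// Mirrors the new isLegalUse: either the formula folds completely, or it
// uses Scale == 1 and would fold if the scaled register were treated as
// one more (unfolded) base register.
static bool isLegalUseSketch(bool HasBaseReg, int64_t Scale, int64_t Offset) {
  return isAMCompletelyFoldedSketch(HasBaseReg, Scale, Offset) ||
         (Scale == 1 &&
          isAMCompletelyFoldedSketch(/*HasBaseReg=*/true, /*Scale=*/0, Offset));
}

int main() {
  assert(isLegalUseSketch(true, 0, 8));  // folds outright
  assert(isLegalUseSketch(true, 1, 8));  // kept: scale 1 falls back to a base reg
  assert(!isLegalUseSketch(true, 2, 8)); // genuinely unsupported
}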
- if (F.BaseRegs.size() < 2) - return false; - - return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, - F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); - } +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, + F.Scale); +} static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { if (!F.Scale) return 0; - assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, F) && "Illegal formula in use."); + + // If the use is not completely folded in that instruction, we will have to + // pay an extra cost only for scale != 1. + if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F)) + return F.Scale != 1; switch (LU.Kind) { case LSRUse::Address: { @@ -1432,12 +1527,10 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); } case LSRUse::ICmpZero: - // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. - // Therefore, return 0 in case F.Scale == -1. - return F.Scale != -1; - case LSRUse::Basic: case LSRUse::Special: + // The use is completely folded, i.e., everything is folded into the + // instruction. return 0; } @@ -1462,7 +1555,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } - return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, + HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, @@ -1487,8 +1581,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // base and a scale. int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; - return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, - BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale); } namespace { @@ -1644,8 +1738,19 @@ class LSRInstance { void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth = 0); + + void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, unsigned Depth, + size_t Idx, bool IsScaledReg = false); void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg = false); void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + const SmallVectorImpl &Worklist, + size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -2148,23 +2253,25 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // the uses will have all its uses outside the loop, for example. if (LU.Kind != Kind) return false; + + // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. 
+ if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) + NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } - // Check for a mismatched access type, and fall back conservatively as needed. - // TODO: Be less conservative when the type is similar and can use the same - // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); // Update the use. LU.MinOffset = NewMinOffset; @@ -2994,6 +3101,9 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { + // Do not insert formula that we will not be able to expand. + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && + "Formula is illegal"); if (!LU.InsertFormula(F)) return false; @@ -3149,84 +3259,104 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, return S; } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. -void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, - Formula Base, - unsigned Depth) { - // Arbitrarily cap recursion to protect compile time. - if (Depth >= 3) return; - - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *BaseReg = Base.BaseRegs[i]; +/// \brief Helper function for LSRInstance::GenerateReassociations. +void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + unsigned Depth, size_t Idx, + bool IsScaledReg) { + const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + SmallVector AddOps; + const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); + + if (AddOps.size() == 1) + return; - SmallVector AddOps; - const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); - if (Remainder) - AddOps.push_back(Remainder); + for (SmallVectorImpl::const_iterator J = AddOps.begin(), + JE = AddOps.end(); + J != JE; ++J) { - if (AddOps.size() == 1) continue; + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa(*J) && !SE.isLoopInvariant(*J, L)) + continue; - for (SmallVectorImpl::const_iterator J = AddOps.begin(), - JE = AddOps.end(); J != JE; ++J) { + // Don't pull a constant into a register if the constant could be folded + // into an immediate field. + if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, *J, Base.getNumRegs() > 1)) + continue; - // Loop-variant "unknown" values are uninteresting; we won't be able to - // do anything meaningful with them. - if (isa(*J) && !SE.isLoopInvariant(*J, L)) - continue; + // Collect all operands except *J. 
+ SmallVector InnerAddOps( + ((const SmallVector &)AddOps).begin(), J); + InnerAddOps.append(std::next(J), + ((const SmallVector &)AddOps).end()); + + // Don't leave just a constant behind in a register if the constant could + // be folded into an immediate field. + if (InnerAddOps.size() == 1 && + isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) + continue; - // Don't pull a constant into a register if the constant could be folded - // into an immediate field. - if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, *J, Base.getNumRegs() > 1)) - continue; + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; + Formula F = Base; - // Collect all operands except *J. - SmallVector InnerAddOps( - ((const SmallVector &)AddOps).begin(), J); - InnerAddOps.append(std::next(J), - ((const SmallVector &)AddOps).end()); - - // Don't leave just a constant behind in a register if the constant could - // be folded into an immediate field. - if (InnerAddOps.size() == 1 && - isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) - continue; + // Add the remaining pieces of the add back into the new formula. + const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); + if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + if (IsScaledReg) + F.ScaledReg = nullptr; + else + F.BaseRegs.erase(F.BaseRegs.begin() + Idx); + } else if (IsScaledReg) + F.ScaledReg = InnerSum; + else + F.BaseRegs[Idx] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast(*J); + if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + // We may have changed the number of register in base regs, adjust the + // formula accordingly. + F.Canonicalize(); + + if (InsertFormula(LU, LUIdx, F)) + // If that formula hadn't been seen before, recurse to find more like + // it. + GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1); + } +} - const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); - if (InnerSum->isZero()) - continue; - Formula F = Base; +/// GenerateReassociations - Split out subexpressions from adds and the bases of +/// addrecs. +void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, + Formula Base, unsigned Depth) { + assert(Base.isCanonical() && "Input must be in the canonical form"); + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return; - // Add the remaining pieces of the add back into the new formula. - const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); - if (InnerSumSC && - SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue())) { - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue(); - F.BaseRegs.erase(F.BaseRegs.begin() + i); - } else - F.BaseRegs[i] = InnerSum; - - // Add J as its own register, or an unfolded immediate. 
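Concretely, the reassociation above takes a term such as (a + (b + 5)), splits out the constant, and either folds it into the formula's UnfoldedOffset when the target accepts it as an add immediate or keeps it as a register of its own. A small standalone sketch of that choice, with a hypothetical stand-in for TTI.isLegalAddImmediate:

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for TTI.isLegalAddImmediate().
static bool isLegalAddImmediateSketch(int64_t Imm) {
  return Imm >= -4096 && Imm < 4096;
}

struct SplitResult {
  std::vector<std::string> Regs;
  int64_t UnfoldedOffset = 0;
};

// Given an expression written as a sum of symbolic terms plus one constant,
// decide where the constant goes: into the unfolded offset if the target can
// add it as an immediate, otherwise into its own register.
static SplitResult splitConstant(std::vector<std::string> Terms, int64_t C) {
  SplitResult R;
  R.Regs = std::move(Terms);
  if (isLegalAddImmediateSketch(C))
    R.UnfoldedOffset = C;                        // (a + (b + 5)) ==> {a, b} + 5
  else
    R.Regs.push_back("imm" + std::to_string(C)); // keep the constant in a reg
  return R;
}

int main() {
  SplitResult Small = splitConstant({"a", "b"}, 5);
  assert(Small.Regs.size() == 2 && Small.UnfoldedOffset == 5);

  SplitResult Big = splitConstant({"a"}, 1 << 20);
  assert(Big.Regs.size() == 2 && Big.UnfoldedOffset == 0);
}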
- const SCEVConstant *SC = dyn_cast(*J); - if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue())) - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue(); - else - F.BaseRegs.push_back(*J); + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i); - if (InsertFormula(LU, LUIdx, F)) - // If that formula hadn't been seen before, recurse to find more like - // it. - GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); - } - } + if (Base.Scale == 1) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, + /* Idx */ -1, /* IsScaledReg */ true); } /// GenerateCombinations - Generate a formula consisting of all of the @@ -3234,8 +3364,12 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. - if (Base.BaseRegs.size() <= 1) return; + if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) + return; + // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before + // processing the formula. + Base.Unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector Ops; @@ -3255,29 +3389,87 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); + F.Canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } } +/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets. +void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero() || !GV) + return; + Formula F = Base; + F.BaseGV = GV; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); +} + /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. if (Base.BaseGV) return; - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - GlobalValue *GV = ExtractSymbol(G, SE); - if (G->isZero() || !GV) - continue; + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i); + if (Base.Scale == 1) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1, + /* IsScaledReg */ true); +} + +/// \brief Helper function for LSRInstance::GenerateConstantOffsets. +void LSRInstance::GenerateConstantOffsetsImpl( + LSRUse &LU, unsigned LUIdx, const Formula &Base, + const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg) { + const SCEV *G = IsScaledReg ? 
Base.ScaledReg : Base.BaseRegs[Idx]; + for (SmallVectorImpl::const_iterator I = Worklist.begin(), + E = Worklist.end(); + I != E; ++I) { Formula F = Base; - F.BaseGV = GV; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); + F.BaseOffset = (uint64_t)Base.BaseOffset - *I; + if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + LU.AccessTy, F)) { + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. + if (NewG->isZero()) { + if (IsScaledReg) { + F.Scale = 0; + F.ScaledReg = nullptr; + } else + F.DeleteBaseReg(F.BaseRegs[Idx]); + F.Canonicalize(); + } else if (IsScaledReg) + F.ScaledReg = NewG; + else + F.BaseRegs[Idx] = NewG; + + (void)InsertFormula(LU, LUIdx, F); + } } + + int64_t Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm == 0) + return; + Formula F = Base; + F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); } /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. @@ -3290,38 +3482,11 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - - for (SmallVectorImpl::const_iterator I = Worklist.begin(), - E = Worklist.end(); I != E; ++I) { - Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - *I; - if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, - LU.AccessTy, F)) { - // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); - // If it cancelled out, drop the base register, otherwise update it. - if (NewG->isZero()) { - std::swap(F.BaseRegs[i], F.BaseRegs.back()); - F.BaseRegs.pop_back(); - } else - F.BaseRegs[i] = NewG; - - (void)InsertFormula(LU, LUIdx, F); - } - } - - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) - continue; - Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); - } + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i); + if (Base.Scale == 1) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1, + /* IsScaledReg */ true); } /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up @@ -3421,7 +3586,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. - if (Base.Scale != 0) return; + // Try to unscale the formula to generate a better scale. + if (Base.Scale != 0 && !Base.Unscale()) + return; + + assert(Base.Scale == 0 && "Unscale did not did its job!"); // Check each interesting stride. 
for (SmallSetVector::const_iterator @@ -3462,6 +3631,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { Formula F = Base; F.ScaledReg = Quotient; F.DeleteBaseReg(F.BaseRegs[i]); + // The canonical representation of 1*reg is reg, which is already in + // Base. In that case, do not try to insert the formula, it will be + // rejected anyway. + if (F.Scale == 1 && F.BaseRegs.empty()) + continue; (void)InsertFormula(LU, LUIdx, F); } } @@ -3626,7 +3800,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // TODO: Use a more targeted data structure. for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { - const Formula &F = LU.Formulae[L]; + Formula F = LU.Formulae[L]; + // FIXME: The code for the scaled and unscaled registers looks + // very similar but slightly different. Investigate if they + // could be merged. That way, we would not have to unscale the + // Formula. + F.Unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3652,6 +3831,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { continue; // OK, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3685,6 +3865,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { goto skip_formula; // Ok, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3938,7 +4119,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (SmallVectorImpl::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - if (F.BaseOffset == 0 || F.Scale != 0) + if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -4399,25 +4580,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Loops, SE, DT); if (LU.Kind == LSRUse::ICmpZero) { - // An interesting way of "folding" with an icmp is to use a negated - // scale, which we'll implement by inserting it into the other operand - // of the icmp. - assert(F.Scale == -1 && - "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + // Expand ScaleReg as if it was part of the base regs. + if (F.Scale == 1) + Ops.push_back( + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + else { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. + assert(F.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + } } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. // Flush the operand list to suppress SCEVExpander hoisting address modes. - if (!Ops.empty() && LU.Kind == LSRUse::Address) { + // Unless the addressing mode will not be folded. 
+ if (!Ops.empty() && LU.Kind == LSRUse::Address && + isAMCompletelyFolded(TTI, LU, F)) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); - ScaledS = SE.getMulExpr(ScaledS, - SE.getConstant(ScaledS->getType(), F.Scale)); + if (F.Scale != 1) + ScaledS = + SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); Ops.push_back(ScaledS); } } @@ -4495,7 +4685,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } CI->setOperand(1, ICmpScaledV); } else { - assert(F.Scale == 0 && + // A scale of 1 means that the scale has been expanded as part of the + // base regs. + assert((F.Scale == 0 || F.Scale == 1) && "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index fc28fd2bdce2..0af5a71c126d 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -18,8 +18,10 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -36,7 +38,8 @@ UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, static cl::opt UnrollCount("unroll-count", cl::init(0), cl::Hidden, - cl::desc("Use this unroll count for all loops, for testing purposes")); + cl::desc("Use this unroll count for all loops including those with " + "unroll_count pragma values, for testing purposes")); static cl::opt UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, @@ -47,6 +50,11 @@ static cl::opt UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt +PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll(enable) or " + "unroll_count pragma.")); + namespace { class LoopUnroll : public LoopPass { public: @@ -109,6 +117,66 @@ namespace { // For now, recreate dom info, if loop is unrolled. AU.addPreserved(); } + + // Fill in the UnrollingPreferences parameter with values from the + // TargetTransformationInfo. + void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, + TargetTransformInfo::UnrollingPreferences &UP) { + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.PartialThreshold = CurrentThreshold; + UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.MaxCount = UINT_MAX; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + } + + // Select and return an unroll count based on parameters from + // user, unroll preferences, unroll pragmas, or a heuristic. + // SetExplicitly is set to true if the unroll count is is set by + // the user or a pragma rather than selected heuristically. 
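selectUnrollCount, declared above and defined later in this file, applies a fixed precedence: an explicit user count first, then a pragma count, then unroll(enable) meaning "unroll fully", then the target's UnrollingPreferences count, and only then the trip-count/runtime heuristic. A compact standalone sketch of that ordering (not the pass itself; all names here are stand-ins):

#include <cassert>

static unsigned selectUnrollCountSketch(unsigned UserCount, unsigned PragmaCount,
                                        bool HasEnablePragma, unsigned TTICount,
                                        unsigned TripCount,
                                        unsigned RuntimeDefault,
                                        bool &SetExplicitly) {
  SetExplicitly = true;
  unsigned Count = UserCount;                       // user option wins
  if (Count == 0)
    Count = PragmaCount ? PragmaCount                // then pragma count
                        : (HasEnablePragma ? TripCount : 0); // enable => full
  if (Count == 0)
    Count = TTICount;                               // then target preference
  if (Count == 0) {
    SetExplicitly = false;                          // heuristic fallback
    Count = TripCount ? TripCount : RuntimeDefault;
  }
  if (TripCount && Count > TripCount)
    return TripCount;
  return Count;
}

int main() {
  bool Explicit;
  // A pragma count of 4 wins over a target preference of 2.
  assert(selectUnrollCountSketch(0, 4, false, 2, 100, 8, Explicit) == 4);
  assert(Explicit);
  // Nothing explicit: fall back to full unrolling by the known trip count.
  assert(selectUnrollCountSketch(0, 0, false, 0, 10, 8, Explicit) == 10);
  assert(!Explicit);
}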
+ unsigned + selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, + const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly); + + + // Select threshold values used to limit unrolling based on a + // total unrolled size. Parameters Threshold and PartialThreshold + // are set to the maximum unrolled size for fully and partially + // unrolled loops respectively. + void selectThresholds(const Loop *L, bool HasPragma, + const TargetTransformInfo::UnrollingPreferences &UP, + unsigned &Threshold, unsigned &PartialThreshold) { + // Determine the current unrolling threshold. While this is + // normally set from UnrollThreshold, it is overridden to a + // smaller value if the current function is marked as + // optimize-for-size, and the unroll threshold was not user + // specified. + Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; + PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; + if (!UserThreshold && + L->getHeader()->getParent()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) { + Threshold = UP.OptSizeThreshold; + PartialThreshold = UP.PartialOptSizeThreshold; + } + if (HasPragma) { + // If the loop has an unrolling pragma, we want to be more + // aggressive with unrolling limits. Set thresholds to at + // least the PragmaTheshold value which is larger than the + // default limits. + if (Threshold != NoThreshold) + Threshold = std::max(Threshold, PragmaUnrollThreshold); + if (PartialThreshold != NoThreshold) + PartialThreshold = + std::max(PartialThreshold, PragmaUnrollThreshold); + } + } }; } @@ -151,6 +219,105 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, return LoopSize; } +// Returns the value associated with the given metadata node name (for +// example, "llvm.loopunroll.count"). If no such named metadata node +// exists, then nullptr is returned. +static const ConstantInt *GetUnrollMetadataValue(const Loop *L, + StringRef Name) { + MDNode *LoopID = L->getLoopID(); + if (!LoopID) return nullptr; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + const MDNode *MD = dyn_cast(LoopID->getOperand(i)); + if (!MD) continue; + + const MDString *S = dyn_cast(MD->getOperand(0)); + if (!S) continue; + + if (Name.equals(S->getString())) { + assert(MD->getNumOperands() == 2 && + "Unroll hint metadata should have two operands."); + return cast(MD->getOperand(1)); + } + } + return nullptr; +} + +// Returns true if the loop has an unroll(enable) pragma. +static bool HasUnrollEnablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loopunroll.enable"); + return (EnableValue && EnableValue->getZExtValue()); + return false; +} + +// Returns true if the loop has an unroll(disable) pragma. +static bool HasUnrollDisablePragma(const Loop *L) { + const ConstantInt *EnableValue = + GetUnrollMetadataValue(L, "llvm.loopunroll.enable"); + return (EnableValue && !EnableValue->getZExtValue()); + return false; +} + +// If loop has an unroll_count pragma return the (necessarily +// positive) value from the pragma. Otherwise return 0. 
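GetUnrollMetadataValue above walks the loop-ID metadata, whose first operand refers back to the node itself and whose hint operands are expected to be two-element nodes of the form {MDString name, value}, looking for "llvm.loopunroll.enable" and "llvm.loopunroll.count". At the source level these hints are expected to come from loop pragmas; assuming the Clang spelling referred to by the comments above, a loop such as the following would carry an unroll_count hint of 4:

// Hypothetical input source. The "#pragma clang loop" spelling is an
// assumption here; the pass itself only inspects the resulting
// "llvm.loopunroll.count" / "llvm.loopunroll.enable" loop metadata.
void scale(float *a, int n) {
#pragma clang loop unroll_count(4)
  for (int i = 0; i < n; ++i)
    a[i] *= 2.0f;
}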
+static unsigned UnrollCountPragmaValue(const Loop *L) { + const ConstantInt *CountValue = + GetUnrollMetadataValue(L, "llvm.loopunroll.count"); + if (CountValue) { + unsigned Count = CountValue->getZExtValue(); + assert(Count >= 1 && "Unroll count must be positive."); + return Count; + } + return 0; +} + +unsigned LoopUnroll::selectUnrollCount( + const Loop *L, unsigned TripCount, bool HasEnablePragma, + unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, + bool &SetExplicitly) { + SetExplicitly = true; + + // User-specified count (either as a command-line option or + // constructor parameter) has highest precedence. + unsigned Count = UserCount ? CurrentCount : 0; + + // If there is no user-specified count, unroll pragmas have the next + // highest precendence. + if (Count == 0) { + if (PragmaCount) { + Count = PragmaCount; + } else if (HasEnablePragma) { + // unroll(enable) pragma without an unroll_count pragma + // indicates to unroll loop fully. + Count = TripCount; + } + } + + if (Count == 0) + Count = UP.Count; + + if (Count == 0) { + SetExplicitly = false; + if (TripCount == 0) + // Runtime trip count. + Count = UnrollRuntimeCount; + else + // Conservative heuristic: if we know the trip count, see if we can + // completely unroll (subject to the threshold, checked below); otherwise + // try to find greatest modulo of the trip count which is still under + // threshold value. + Count = TripCount; + } + if (TripCount && Count > TripCount) + return TripCount; + return Count; +} + bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; @@ -162,33 +329,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); - (void)Header; - TargetTransformInfo::UnrollingPreferences UP; - UP.Threshold = CurrentThreshold; - UP.OptSizeThreshold = OptSizeUnrollThreshold; - UP.PartialThreshold = CurrentThreshold; - UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; - UP.Count = CurrentCount; - UP.MaxCount = UINT_MAX; - UP.Partial = CurrentAllowPartial; - UP.Runtime = CurrentRuntime; - TTI.getUnrollingPreferences(L, UP); - - // Determine the current unrolling threshold. While this is normally set - // from UnrollThreshold, it is overridden to a smaller value if the current - // function is marked as optimize-for-size, and the unroll threshold was - // not user specified. - unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; - unsigned PartialThreshold = - UserThreshold ? CurrentThreshold : UP.PartialThreshold; - if (!UserThreshold && - Header->getParent()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize)) { - Threshold = UP.OptSizeThreshold; - PartialThreshold = UP.PartialOptSizeThreshold; + if (HasUnrollDisablePragma(L)) { + return false; } + bool HasEnablePragma = HasUnrollEnablePragma(L); + unsigned PragmaCount = UnrollCountPragmaValue(L); + bool HasPragma = HasEnablePragma || PragmaCount > 0; + + TargetTransformInfo::UnrollingPreferences UP; + getUnrollingPreferences(L, TTI, UP); // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -202,79 +352,117 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } - bool Runtime = UserRuntime ? 
CurrentRuntime : UP.Runtime; - - // Use a default unroll-count if the user doesn't specify a value - // and the trip count is a run-time value. The default is different - // for run-time or compile-time trip count loops. - unsigned Count = UserCount ? CurrentCount : UP.Count; - if (Runtime && Count == 0 && TripCount == 0) - Count = UnrollRuntimeCount; + // Select an initial unroll count. This may be reduced later based + // on size thresholds. + bool CountSetExplicitly; + unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount, + UP, CountSetExplicitly); + + unsigned NumInlineCandidates; + bool notDuplicatable; + unsigned LoopSize = + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + uint64_t UnrolledSize = (uint64_t)LoopSize * Count; + if (notDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); + return false; + } + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } - if (Count == 0) { - // Conservative heuristic: if we know the trip count, see if we can - // completely unroll (subject to the threshold, checked below); otherwise - // try to find greatest modulo of the trip count which is still under - // threshold value. - if (TripCount == 0) - return false; - Count = TripCount; + unsigned Threshold, PartialThreshold; + selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold); + + // Given Count, TripCount and thresholds determine the type of + // unrolling which is to be performed. + enum { Full = 0, Partial = 1, Runtime = 2 }; + int Unrolling; + if (TripCount && Count == TripCount) { + if (Threshold != NoThreshold && UnrolledSize > Threshold) { + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << UnrolledSize << ">" << Threshold + << "\n"); + Unrolling = Partial; + } else { + Unrolling = Full; + } + } else if (TripCount && Count < TripCount) { + Unrolling = Partial; + } else { + Unrolling = Runtime; } - // Enforce the threshold. - if (Threshold != NoThreshold && PartialThreshold != NoThreshold) { - unsigned NumInlineCandidates; - bool notDuplicatable; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, - notDuplicatable, TTI); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - if (notDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" - << " instructions.\n"); + // Reduce count based on the type of unrolling and the threshold values. + unsigned OriginalCount = Count; + bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime; + if (Unrolling == Partial) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); return false; } - if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { + // Reduce unroll count to be modulo of TripCount for partial unrolling. 
+ Count = PartialThreshold / LoopSize; + while (Count != 0 && TripCount % Count != 0) + Count--; + } + } else if (Unrolling == Runtime) { + if (!AllowRuntime && !CountSetExplicitly) { + DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); return false; } - uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && - (Size > Threshold || (Count != TripCount && Size > PartialThreshold))) { - if (Size > Threshold) - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << Threshold << "\n"); - - bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; - if (!AllowPartial && !(Runtime && TripCount == 0)) { - DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); - return false; - } - if (TripCount) { - // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = PartialThreshold / LoopSize; - while (Count != 0 && TripCount%Count != 0) - Count--; - } - else if (Runtime) { - // Reduce unroll count to be a lower power-of-two value - while (Count != 0 && Size > PartialThreshold) { - Count >>= 1; - Size = LoopSize*Count; - } - } - if (Count > UP.MaxCount) - Count = UP.MaxCount; - if (Count < 2) { - DEBUG(dbgs() << " could not unroll partially\n"); - return false; + // Reduce unroll count to be the largest power-of-two factor of + // the original count which satisfies the threshold limit. + while (Count != 0 && UnrolledSize > PartialThreshold) { + Count >>= 1; + UnrolledSize = LoopSize * Count; + } + if (Count > UP.MaxCount) + Count = UP.MaxCount; + DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } + + if (HasPragma) { + // Emit optimization remarks if we are unable to unroll the loop + // as directed by a pragma. + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = Header->getParent(); + LLVMContext &Ctx = F->getContext(); + if (HasEnablePragma && PragmaCount == 0) { + if (TripCount && Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); + } else if (!TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(enable) pragma " + "because loop has a runtime trip count."); } - DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } else if (PragmaCount > 0 && Count != OriginalCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop the number of times directed by " + "unroll_count pragma because unrolled size is too large."); } } + if (Unrolling != Full && Count < 2) { + // Partial unrolling by 1 is a nop. For full unrolling, a factor + // of 1 makes sense because loop control can be eliminated. + return false; + } + // Unroll the loop. 
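The two reductions above follow different rules: for partial unrolling the count starts at PartialThreshold / LoopSize and is decremented until it divides the trip count, while for runtime unrolling it is halved (kept a power of two) until the unrolled size fits. A small numeric sketch of both:

#include <cassert>
#include <cstdint>

// Partial unrolling: largest count that divides TripCount while keeping
// LoopSize * Count within the threshold.
static unsigned partialCount(unsigned LoopSize, unsigned TripCount,
                             unsigned PartialThreshold) {
  unsigned Count = PartialThreshold / LoopSize;
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  return Count;
}

// Runtime unrolling: halve the count until the unrolled size fits.
static unsigned runtimeCount(unsigned LoopSize, unsigned Count,
                             unsigned PartialThreshold) {
  uint64_t UnrolledSize = uint64_t(LoopSize) * Count;
  while (Count != 0 && UnrolledSize > PartialThreshold) {
    Count >>= 1;
    UnrolledSize = uint64_t(LoopSize) * Count;
  }
  return Count;
}

int main() {
  // 150-instruction budget, 20-instruction body, trip count 12:
  // 150/20 = 7, then step down to 6, which divides 12.
  assert(partialCount(20, 12, 150) == 6);
  // Runtime trip count, initial count 8: 8*20 = 160 > 150, halve once to 4.
  assert(runtimeCount(20, 8, 150) == 4);
}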
- if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, this, &LPM)) + if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM)) return false; return true; diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 4251ac47ed51..3314e1ed41ab 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -32,7 +32,10 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - CXI->replaceAllUsesWith(Orig); + Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0); + Res = Builder.CreateInsertValue(Res, Equal, 1); + + CXI->replaceAllUsesWith(Res); CXI->eraseFromParent(); return true; } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 986d6a4bae14..ea2cf7cf9b5f 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -1368,11 +1368,10 @@ Value *Reassociate::OptimizeXor(Instruction *I, Value *Reassociate::OptimizeAdd(Instruction *I, SmallVectorImpl &Ops) { // Scan the operand lists looking for X and -X pairs. If we find any, we - // can simplify the expression. X+-X == 0. While we're at it, scan for any + // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it, + // scan for any // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z. - // - // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1". - // + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *TheOp = Ops[i].Op; // Check to see if we've seen this operand before. If so, we factor all @@ -1412,19 +1411,28 @@ Value *Reassociate::OptimizeAdd(Instruction *I, continue; } - // Check for X and -X in the operand list. - if (!BinaryOperator::isNeg(TheOp)) + // Check for X and -X or X and ~X in the operand list. + if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp)) continue; - Value *X = BinaryOperator::getNegArgument(TheOp); + Value *X = nullptr; + if (BinaryOperator::isNeg(TheOp)) + X = BinaryOperator::getNegArgument(TheOp); + else if (BinaryOperator::isNot(TheOp)) + X = BinaryOperator::getNotArgument(TheOp); + unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; // Remove X and -X from the operand list. - if (Ops.size() == 2) + if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp)) return Constant::getNullValue(X->getType()); + // Remove X and ~X from the operand list. + if (Ops.size() == 2 && BinaryOperator::isNot(TheOp)) + return Constant::getAllOnesValue(X->getType()); + Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; @@ -1434,6 +1442,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I, ++NumAnnihil; --i; // Revisit element. e -= 2; // Removed two elements. + + // if X and ~X we append -1 to the operand list. 
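The Reassociate change above extends the existing X + (-X) == 0 folding with X + ~X == -1, which holds because ~x equals -x - 1 in two's complement. A quick sanity check of both identities:

#include <cassert>
#include <cstdint>

int main() {
  // x + ~x == -1 because ~x == -x - 1 in two's complement; the sum never
  // overflows since it is exactly -1 for every x.
  for (int32_t x : {0, 1, -1, 42, INT32_MIN, INT32_MAX})
    assert(x + ~x == -1);

  // The companion identity already handled by OptimizeAdd: x + (-x) == 0.
  // (Unsigned arithmetic avoids signed-overflow UB when negating.)
  for (uint32_t x : {0u, 1u, 0x80000000u})
    assert(x + (0u - x) == 0u);
}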
+ if (BinaryOperator::isNot(TheOp)) { + Value *V = Constant::getAllOnesValue(X->getType()); + Ops.insert(Ops.end(), ValueEntry(getRank(V), V)); + e += 1; + } } // Scan the operand list, checking to see if there are any common factors diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index feeb23136eaa..90c3520c8323 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -494,7 +494,9 @@ class SCCPSolver : public InstVisitor { void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + markAnythingOverdefined(&I); + } void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 04bf4f8dfc3a..6532b7a09b99 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1032,11 +1032,6 @@ static Type *findCommonType(AllocaSlices::const_iterator B, UserTy = SI->getValueOperand()->getType(); } - if (!UserTy || (Ty && Ty != UserTy)) - TyIsCommon = false; // Give up on anything but an iN type. - else - Ty = UserTy; - if (IntegerType *UserITy = dyn_cast_or_null(UserTy)) { // If the type is larger than the partition, skip it. We only encounter // this for split integer operations where we want to use the type of the @@ -1051,6 +1046,13 @@ static Type *findCommonType(AllocaSlices::const_iterator B, if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) ITy = UserITy; } + + // To avoid depending on the order of slices, Ty and TyIsCommon must not + // depend on types skipped above. + if (!UserTy || (Ty && Ty != UserTy)) + TyIsCommon = false; // Give up on anything but an iN type. + else + Ty = UserTy; } return TyIsCommon ? Ty : ITy; diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index 8e557aaa2f26..888e25273839 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -451,7 +451,7 @@ void SampleModuleProfile::dump() { /// \returns true if the file was loaded successfully, false otherwise. 
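Both the LowerAtomic and SCCP hunks above follow from cmpxchg now producing a {original value, success flag} pair: LowerAtomic rebuilds that pair with two insertvalue instructions, and SCCP must mark every element of the aggregate overdefined. As a mental model only (an analogy, not what the pass emits), std::atomic exposes the same two results at the C++ level:

#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> a{5};
  int expected = 5;
  // The boolean result corresponds to the i1 "success" element; on failure,
  // 'expected' is overwritten with the observed (original) value.
  bool ok = a.compare_exchange_strong(expected, 7);
  assert(ok && a.load() == 7);

  expected = 5;                  // stale expectation
  ok = a.compare_exchange_strong(expected, 9);
  assert(!ok && expected == 7);  // original value reported back
}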
bool SampleModuleProfile::loadText() { std::unique_ptr Buffer; - error_code EC = MemoryBuffer::getFile(Filename, Buffer); + std::error_code EC = MemoryBuffer::getFile(Filename, Buffer); if (EC) { std::string Msg(EC.message()); M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index f8f828c84057..edf012d81171 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -65,6 +65,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); + initializeLoadCombinePass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 0ccf29c225b1..62f2026b8d9f 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -121,41 +121,75 @@ class ConstantOffsetExtractor { /// numeric value of the extracted constant offset (0 if failed), and a /// new index representing the remainder (equal to the original index minus /// the constant offset). - /// \p Idx The given GEP index - /// \p NewIdx The new index to replace - /// \p DL The datalayout of the module - /// \p IP Calculating the new index requires new instructions. IP indicates - /// where to insert them (typically right before the GEP). + /// \p Idx The given GEP index + /// \p NewIdx The new index to replace (output) + /// \p DL The datalayout of the module + /// \p GEP The given GEP static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL, - Instruction *IP); + GetElementPtrInst *GEP); /// Looks for a constant offset without extracting it. The meaning of the /// arguments and the return value are the same as Extract. - static int64_t Find(Value *Idx, const DataLayout *DL); + static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP); private: ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt) : DL(Layout), IP(InsertionPt) {} - /// Searches the expression that computes V for a constant offset. If the - /// searching is successful, update UserChain as a path from V to the constant - /// offset. - int64_t find(Value *V); - /// A helper function to look into both operands of a binary operator U. - /// \p IsSub Whether U is a sub operator. If so, we need to negate the - /// constant offset at some point. - int64_t findInEitherOperand(User *U, bool IsSub); - /// After finding the constant offset and how it is reached from the GEP - /// index, we build a new index which is a clone of the old one except the - /// constant offset is removed. For example, given (a + (b + 5)) and knowning - /// the constant offset is 5, this function returns (a + b). + /// Searches the expression that computes V for a non-zero constant C s.t. + /// V can be reassociated into the form V' + C. If the searching is + /// successful, returns C and update UserChain as a def-use chain from C to V; + /// otherwise, UserChain is empty. /// - /// We cannot simply change the constant to zero because the expression that - /// computes the index or its intermediate result may be used by others. - Value *rebuildWithoutConstantOffset(); - // A helper function for rebuildWithoutConstantOffset that rebuilds the direct - // user (U) of the constant offset (C). 
- Value *rebuildLeafWithoutConstantOffset(User *U, Value *C); - /// Returns a clone of U except the first occurrence of From with To. - Value *cloneAndReplace(User *U, Value *From, Value *To); + /// \p V The given expression + /// \p SignExtended Whether V will be sign-extended in the computation of the + /// GEP index + /// \p ZeroExtended Whether V will be zero-extended in the computation of the + /// GEP index + /// \p NonNegative Whether V is guaranteed to be non-negative. For example, + /// an index of an inbounds GEP is guaranteed to be + /// non-negative. Levaraging this, we can better split + /// inbounds GEPs. + APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative); + /// A helper function to look into both operands of a binary operator. + APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, + bool ZeroExtended); + /// After finding the constant offset C from the GEP index I, we build a new + /// index I' s.t. I' + C = I. This function builds and returns the new + /// index I' according to UserChain produced by function "find". + /// + /// The building conceptually takes two steps: + /// 1) iteratively distribute s/zext towards the leaves of the expression tree + /// that computes I + /// 2) reassociate the expression tree to the form I' + C. + /// + /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute + /// sext to a, b and 5 so that we have + /// sext(a) + (sext(b) + 5). + /// Then, we reassociate it to + /// (sext(a) + sext(b)) + 5. + /// Given this form, we know I' is sext(a) + sext(b). + Value *rebuildWithoutConstOffset(); + /// After the first step of rebuilding the GEP index without the constant + /// offset, distribute s/zext to the operands of all operators in UserChain. + /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => + /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). + /// + /// The function also updates UserChain to point to new subexpressions after + /// distributing s/zext. e.g., the old UserChain of the above example is + /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), + /// and the new UserChain is + /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> + /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) + /// + /// \p ChainIndex The index to UserChain. ChainIndex is initially + /// UserChain.size() - 1, and is decremented during + /// the recursion. + Value *distributeExtsAndCloneChain(unsigned ChainIndex); + /// Reassociates the GEP index to the form I' + C and returns I'. + Value *removeConstOffset(unsigned ChainIndex); + /// A helper function to apply ExtInsts, a list of s/zext, to value V. + /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function + /// returns "sext i32 (zext i16 V to i32) to i64". + Value *applyExts(Value *V); /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. bool NoCommonBits(Value *LHS, Value *RHS) const; @@ -163,16 +197,26 @@ class ConstantOffsetExtractor { /// \p KnownOne Mask of all bits that are known to be one. /// \p KnownZero Mask of all bits that are known to be zero. void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; - /// Finds the first use of Used in U. Returns -1 if not found. - static unsigned FindFirstUse(User *U, Value *Used); + /// A helper function that returns whether we can trace into the operands + /// of binary operator BO for a constant offset. 
+ /// + /// \p SignExtended Whether BO is surrounded by sext + /// \p ZeroExtended Whether BO is surrounded by zext + /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound + /// array index. + bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, + bool NonNegative); /// The path from the constant offset to the old GEP index. e.g., if the GEP /// index is "a * b + (c + 5)". After running function find, UserChain[0] will /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and /// UserChain[2] will be the entire expression "a * b + (c + 5)". /// - /// This path helps rebuildWithoutConstantOffset rebuild the new GEP index. + /// This path helps to rebuild the new GEP index. SmallVector UserChain; + /// A data structure used in rebuildWithoutConstOffset. Contains all + /// sext/zext instructions along UserChain. + SmallVector ExtInsts; /// The data layout of the module. Used in ComputeKnownBits. const DataLayout *DL; Instruction *IP; /// Insertion position of cloned instructions. @@ -192,6 +236,15 @@ class SeparateConstOffsetFromGEP : public FunctionPass { AU.addRequired(); AU.addRequired(); } + + bool doInitialization(Module &M) override { + DataLayoutPass *DLP = getAnalysisIfAvailable(); + if (DLP == nullptr) + report_fatal_error("data layout missing"); + DL = &DLP->getDataLayout(); + return false; + } + bool runOnFunction(Function &F) override; private: @@ -202,8 +255,42 @@ class SeparateConstOffsetFromGEP : public FunctionPass { /// function only inspects the GEP without changing it. The output /// NeedsExtraction indicates whether we can extract a non-zero constant /// offset from any index. - int64_t accumulateByteOffset(GetElementPtrInst *GEP, const DataLayout *DL, - bool &NeedsExtraction); + int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); + /// Canonicalize array indices to pointer-size integers. This helps to + /// simplify the logic of splitting a GEP. For example, if a + b is a + /// pointer-size integer, we have + /// gep base, a + b = gep (gep base, a), b + /// However, this equality may not hold if the size of a + b is smaller than + /// the pointer size, because LLVM conceptually sign-extends GEP indices to + /// pointer size before computing the address + /// (http://llvm.org/docs/LangRef.html#id181). + /// + /// This canonicalization is very likely already done in clang and + /// instcombine. Therefore, the program will probably remain the same. + /// + /// Returns true if the module changes. + /// + /// Verified in @i32_add in split-gep.ll + bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// For each array index that is in the form of zext(a), convert it to sext(a) + /// if we can prove zext(a) <= max signed value of typeof(a). We prefer + /// sext(a) to zext(a), because in the special case where x + y >= 0 and + /// (x >= 0 or y >= 0), function CanTraceInto can split sext(x + y), + /// while no such case exists for zext(x + y). + /// + /// Note that + /// zext(x + y) = zext(x) + zext(y) + /// is wrong, e.g., + /// zext i32(UINT_MAX + 1) to i64 != + /// (zext i32 UINT_MAX to i64) + (zext i32 1 to i64) + /// + /// Returns true if the module changes. 
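The zext counterexample cited above is easy to reproduce outside of IR. A standalone C++ sketch, not part of the patch; unsigned wraparound is well defined, so the assertion is portable:

    #include <cassert>
    #include <cstdint>

    int main() {
      // zext i32 (UINT_MAX + 1) to i64 wraps to 0 before widening, whereas
      // zext(UINT_MAX) + zext(1) is 2^32; zext does not distribute over add,
      // which is why the pass prefers sext'ed indices.
      uint32_t X = UINT32_MAX, Y = 1;
      uint64_t ZextOfSum = static_cast<uint64_t>(static_cast<uint32_t>(X + Y));
      uint64_t SumOfZext = static_cast<uint64_t>(X) + static_cast<uint64_t>(Y);
      assert(ZextOfSum == 0 && SumOfZext == (1ULL << 32));
      return 0;
    }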
+ /// + /// Verified in @inbounds_zext_add in split-gep.ll and @sum_of_array3 in + /// split-gep-and-gvn.ll + bool convertInBoundsZExtToSExt(GetElementPtrInst *GEP); + + const DataLayout *DL; }; } // anonymous namespace @@ -223,169 +310,272 @@ FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() { return new SeparateConstOffsetFromGEP(); } -int64_t ConstantOffsetExtractor::findInEitherOperand(User *U, bool IsSub) { - assert(U->getNumOperands() == 2); - int64_t ConstantOffset = find(U->getOperand(0)); +bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, + bool ZeroExtended, + BinaryOperator *BO, + bool NonNegative) { + // We only consider ADD, SUB and OR, because a non-zero constant found in + // expressions composed of these operations can be easily hoisted as a + // constant offset by reassociation. + if (BO->getOpcode() != Instruction::Add && + BO->getOpcode() != Instruction::Sub && + BO->getOpcode() != Instruction::Or) { + return false; + } + + Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); + // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS + // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). + if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS)) + return false; + + // In addition, tracing into BO requires that its surrounding s/zext (if + // any) is distributable to both operands. + // + // Suppose BO = A op B. + // SignExtended | ZeroExtended | Distributable? + // --------------+--------------+---------------------------------- + // 0 | 0 | true because no s/zext exists + // 0 | 1 | zext(BO) == zext(A) op zext(B) + // 1 | 0 | sext(BO) == sext(A) op sext(B) + // 1 | 1 | zext(sext(BO)) == + // | | zext(sext(A)) op zext(sext(B)) + if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) { + // If a + b >= 0 and (a >= 0 or b >= 0), then + // sext(a + b) = sext(a) + sext(b) + // even if the addition is not marked nsw. + // + // Leveraging this invarient, we can trace into an sext'ed inbound GEP + // index if the constant offset is non-negative. + // + // Verified in @sext_add in split-gep.ll. + if (ConstantInt *ConstLHS = dyn_cast(LHS)) { + if (!ConstLHS->isNegative()) + return true; + } + if (ConstantInt *ConstRHS = dyn_cast(RHS)) { + if (!ConstRHS->isNegative()) + return true; + } + } + + // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B) + // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) { + if (SignExtended && !BO->hasNoSignedWrap()) + return false; + if (ZeroExtended && !BO->hasNoUnsignedWrap()) + return false; + } + + return true; +} + +APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, + bool SignExtended, + bool ZeroExtended) { + // BO being non-negative does not shed light on whether its operands are + // non-negative. Clear the NonNegative flag here. + APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended, + /* NonNegative */ false); // If we found a constant offset in the left operand, stop and return that. // This shortcut might cause us to miss opportunities of combining the // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. // However, such cases are probably already handled by -instcombine, // given this pass runs after the standard optimizations. 
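The reassociation mentioned in the comment above, (a + 4) + (b + 5) => (a + b) + 9, is ordinary pointer arithmetic once lifted to an address computation. A standalone C++ sketch, assuming the indices stay in bounds and do not overflow; the array and values are illustrative:

    #include <cassert>

    int main() {
      // &A[(a + 4) + (b + 5)] is the same address as &A[a + b] offset by 9
      // elements, so the constant 9 can be folded into the addressing mode
      // while the non-constant part &A[a + b] is shared between accesses.
      int A[64] = {};
      int a = 3, b = 7;
      assert(&A[(a + 4) + (b + 5)] == &A[a + b] + 9);
      return 0;
    }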
if (ConstantOffset != 0) return ConstantOffset; - ConstantOffset = find(U->getOperand(1)); + ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, + /* NonNegative */ false); // If U is a sub operator, negate the constant offset found in the right // operand. - return IsSub ? -ConstantOffset : ConstantOffset; + if (BO->getOpcode() == Instruction::Sub) + ConstantOffset = -ConstantOffset; + return ConstantOffset; } -int64_t ConstantOffsetExtractor::find(Value *V) { - // TODO(jingyue): We can even trace into integer/pointer casts, such as +APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, + bool ZeroExtended, bool NonNegative) { + // TODO(jingyue): We could trace into integer/pointer casts, such as // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only // integers because it gives good enough results for our benchmarks. - assert(V->getType()->isIntegerTy()); + unsigned BitWidth = cast(V->getType())->getBitWidth(); + // We cannot do much with Values that are not a User, such as an Argument. User *U = dyn_cast(V); - // We cannot do much with Values that are not a User, such as BasicBlock and - // MDNode. - if (U == nullptr) return 0; + if (U == nullptr) return APInt(BitWidth, 0); - int64_t ConstantOffset = 0; - if (ConstantInt *CI = dyn_cast(U)) { + APInt ConstantOffset(BitWidth, 0); + if (ConstantInt *CI = dyn_cast(V)) { // Hooray, we found it! - ConstantOffset = CI->getSExtValue(); - } else if (Operator *O = dyn_cast(U)) { - // The GEP index may be more complicated than a simple addition of a - // varaible and a constant. Therefore, we trace into subexpressions for more - // hoisting opportunities. - switch (O->getOpcode()) { - case Instruction::Add: { - ConstantOffset = findInEitherOperand(U, false); - break; - } - case Instruction::Sub: { - ConstantOffset = findInEitherOperand(U, true); - break; - } - case Instruction::Or: { - // If LHS and RHS don't have common bits, (LHS | RHS) is equivalent to - // (LHS + RHS). - if (NoCommonBits(U->getOperand(0), U->getOperand(1))) - ConstantOffset = findInEitherOperand(U, false); - break; - } - case Instruction::SExt: { - // For safety, we trace into sext only when its operand is marked - // "nsw" because xxx.nsw guarantees no signed wrap. e.g., we can safely - // transform "sext (add nsw a, 5)" into "add nsw (sext a), 5". - if (BinaryOperator *BO = dyn_cast(U->getOperand(0))) { - if (BO->hasNoSignedWrap()) - ConstantOffset = find(U->getOperand(0)); - } - break; - } - case Instruction::ZExt: { - // Similarly, we trace into zext only when its operand is marked with - // "nuw" because zext (add nuw a, b) == add nuw (zext a), (zext b). - if (BinaryOperator *BO = dyn_cast(U->getOperand(0))) { - if (BO->hasNoUnsignedWrap()) - ConstantOffset = find(U->getOperand(0)); - } - break; - } + ConstantOffset = CI->getValue(); + } else if (BinaryOperator *BO = dyn_cast(V)) { + // Trace into subexpressions for more hoisting opportunities. + if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { + ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); } + } else if (isa(V)) { + ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, + ZeroExtended, NonNegative).sext(BitWidth); + } else if (isa(V)) { + // As an optimization, we can clear the SignExtended flag because + // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. + // + // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0. 
+ ConstantOffset = + find(U->getOperand(0), /* SignExtended */ false, + /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth); } - // If we found a non-zero constant offset, adds it to the path for future - // transformation (rebuildWithoutConstantOffset). Zero is a valid constant - // offset, but doesn't help this optimization. + + // If we found a non-zero constant offset, add it to the path for + // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't + // help this optimization. if (ConstantOffset != 0) UserChain.push_back(U); return ConstantOffset; } -unsigned ConstantOffsetExtractor::FindFirstUse(User *U, Value *Used) { - for (unsigned I = 0, E = U->getNumOperands(); I < E; ++I) { - if (U->getOperand(I) == Used) - return I; +Value *ConstantOffsetExtractor::applyExts(Value *V) { + Value *Current = V; + // ExtInsts is built in the use-def order. Therefore, we apply them to V + // in the reversed order. + for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { + if (Constant *C = dyn_cast(Current)) { + // If Current is a constant, apply s/zext using ConstantExpr::getCast. + // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. + Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); + } else { + Instruction *Ext = (*I)->clone(); + Ext->setOperand(0, Current); + Ext->insertBefore(IP); + Current = Ext; + } } - return -1; + return Current; } -Value *ConstantOffsetExtractor::cloneAndReplace(User *U, Value *From, - Value *To) { - // Finds in U the first use of From. It is safe to ignore future occurrences - // of From, because findInEitherOperand similarly stops searching the right - // operand when the first operand has a non-zero constant offset. - unsigned OpNo = FindFirstUse(U, From); - assert(OpNo != (unsigned)-1 && "UserChain wasn't built correctly"); - - // ConstantOffsetExtractor::find only follows Operators (i.e., Instructions - // and ConstantExprs). Therefore, U is either an Instruction or a - // ConstantExpr. - if (Instruction *I = dyn_cast(U)) { - Instruction *Clone = I->clone(); - Clone->setOperand(OpNo, To); - Clone->insertBefore(IP); - return Clone; +Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { + distributeExtsAndCloneChain(UserChain.size() - 1); + // Remove all nullptrs (used to be s/zext) from UserChain. + unsigned NewSize = 0; + for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) { + if (*I != nullptr) { + UserChain[NewSize] = *I; + NewSize++; + } } - // cast(To) is safe because a ConstantExpr only uses Constants. - return cast(U) - ->getWithOperandReplaced(OpNo, cast(To)); + UserChain.resize(NewSize); + return removeConstOffset(UserChain.size() - 1); } -Value *ConstantOffsetExtractor::rebuildLeafWithoutConstantOffset(User *U, - Value *C) { - assert(U->getNumOperands() <= 2 && - "We didn't trace into any operator with more than 2 operands"); - // If U has only one operand which is the constant offset, removing the - // constant offset leaves U as a null value. - if (U->getNumOperands() == 1) - return Constant::getNullValue(U->getType()); - - // U->getNumOperands() == 2 - unsigned OpNo = FindFirstUse(U, C); // U->getOperand(OpNo) == C - assert(OpNo < 2 && "UserChain wasn't built correctly"); - Value *TheOther = U->getOperand(1 - OpNo); // The other operand of U - // If U = C - X, removing C makes U = -X; otherwise U will simply be X. 
- if (!isa(U) || OpNo == 1) - return TheOther; - if (isa(U)) - return ConstantExpr::getNeg(cast(TheOther)); - return BinaryOperator::CreateNeg(TheOther, "", IP); +Value * +ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { + User *U = UserChain[ChainIndex]; + if (ChainIndex == 0) { + assert(isa(U)); + // If U is a ConstantInt, applyExts will return a ConstantInt as well. + return UserChain[ChainIndex] = cast(applyExts(U)); + } + + if (CastInst *Cast = dyn_cast(U)) { + assert((isa(Cast) || isa(Cast)) && + "We only traced into two types of CastInst: sext and zext"); + ExtInsts.push_back(Cast); + UserChain[ChainIndex] = nullptr; + return distributeExtsAndCloneChain(ChainIndex - 1); + } + + // Function find only trace into BinaryOperator and CastInst. + BinaryOperator *BO = cast(U); + // OpNo = which operand of BO is UserChain[ChainIndex - 1] + unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); + Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); + Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); + + BinaryOperator *NewBO = nullptr; + if (OpNo == 0) { + NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther, + BO->getName(), IP); + } else { + NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain, + BO->getName(), IP); + } + return UserChain[ChainIndex] = NewBO; } -Value *ConstantOffsetExtractor::rebuildWithoutConstantOffset() { - assert(UserChain.size() > 0 && "you at least found a constant, right?"); - // Start with the constant and go up through UserChain, each time building a - // clone of the subexpression but with the constant removed. - // e.g., to build a clone of (a + (b + (c + 5)) but with the 5 removed, we - // first c, then (b + c), and finally (a + (b + c)). - // - // Fast path: if the GEP index is a constant, simply returns 0. - if (UserChain.size() == 1) - return ConstantInt::get(UserChain[0]->getType(), 0); - - Value *Remainder = - rebuildLeafWithoutConstantOffset(UserChain[1], UserChain[0]); - for (size_t I = 2; I < UserChain.size(); ++I) - Remainder = cloneAndReplace(UserChain[I], UserChain[I - 1], Remainder); - return Remainder; +Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { + if (ChainIndex == 0) { + assert(isa(UserChain[ChainIndex])); + return ConstantInt::getNullValue(UserChain[ChainIndex]->getType()); + } + + BinaryOperator *BO = cast(UserChain[ChainIndex]); + unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); + assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); + Value *NextInChain = removeConstOffset(ChainIndex - 1); + Value *TheOther = BO->getOperand(1 - OpNo); + + // If NextInChain is 0 and not the LHS of a sub, we can simplify the + // sub-expression to be just TheOther. + if (ConstantInt *CI = dyn_cast(NextInChain)) { + if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) + return TheOther; + } + + if (BO->getOpcode() == Instruction::Or) { + // Rebuild "or" as "add", because "or" may be invalid for the new + // epxression. + // + // For instance, given + // a | (b + 5) where a and b + 5 have no common bits, + // we can extract 5 as the constant offset. + // + // However, reusing the "or" in the new index would give us + // (a | b) + 5 + // which does not equal a | (b + 5). 
+ // + // Replacing the "or" with "add" is fine, because + // a | (b + 5) = a + (b + 5) = (a + b) + 5 + return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1), + BO->getName(), IP); + } + + // We can reuse BO in this case, because the new expression shares the same + // instruction type and BO is used at most once. + assert(BO->getNumUses() <= 1 && + "distributeExtsAndCloneChain clones each BinaryOperator in " + "UserChain, so no one should be used more than " + "once"); + BO->setOperand(OpNo, NextInChain); + BO->setHasNoSignedWrap(false); + BO->setHasNoUnsignedWrap(false); + // Make sure it appears after all instructions we've inserted so far. + BO->moveBefore(IP); + return BO; } int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL, - Instruction *IP) { - ConstantOffsetExtractor Extractor(DL, IP); + GetElementPtrInst *GEP) { + ConstantOffsetExtractor Extractor(DL, GEP); // Find a non-zero constant offset first. - int64_t ConstantOffset = Extractor.find(Idx); - if (ConstantOffset == 0) - return 0; - // Then rebuild a new index with the constant removed. - NewIdx = Extractor.rebuildWithoutConstantOffset(); - return ConstantOffset; + APInt ConstantOffset = + Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, + GEP->isInBounds()); + if (ConstantOffset != 0) { + // Separates the constant offset from the GEP index. + NewIdx = Extractor.rebuildWithoutConstOffset(); + } + return ConstantOffset.getSExtValue(); } -int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL) { - return ConstantOffsetExtractor(DL, nullptr).find(Idx); +int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL, + GetElementPtrInst *GEP) { + // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. + return ConstantOffsetExtractor(DL, GEP) + .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, + GEP->isInBounds()) + .getSExtValue(); } void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, @@ -405,8 +595,64 @@ bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); } -int64_t SeparateConstOffsetFromGEP::accumulateByteOffset( - GetElementPtrInst *GEP, const DataLayout *DL, bool &NeedsExtraction) { +bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( + GetElementPtrInst *GEP) { + bool Changed = false; + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + gep_type_iterator GTI = gep_type_begin(*GEP); + for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); + I != E; ++I, ++GTI) { + // Skip struct member indices which must be i32. 
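A quick standalone check of the arithmetic behind the "or"-to-"add" rewrite in removeConstOffset above (plain C++; the values are chosen so that a and b + 5 share no bits while a and b do, which is exactly the case where reusing the "or" would go wrong):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t a = 1, b = 3;                   // a & (b + 5) == 0, but a & b != 0
      assert((a & (b + 5)) == 0);              // no common bits with the old RHS
      assert((a | (b + 5)) == (a + b) + 5);    // rebuilding with "add" is exact
      assert(((a | b) + 5) != (a | (b + 5)));  // reusing the "or" is not
      return 0;
    }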
+ if (isa(*GTI)) { + if ((*I)->getType() != IntPtrTy) { + *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP); + Changed = true; + } + } + } + return Changed; +} + +bool +SeparateConstOffsetFromGEP::convertInBoundsZExtToSExt(GetElementPtrInst *GEP) { + if (!GEP->isInBounds()) + return false; + + // TODO: consider alloca + GlobalVariable *UnderlyingObject = + dyn_cast(GEP->getPointerOperand()); + if (UnderlyingObject == nullptr) + return false; + + uint64_t ObjectSize = + DL->getTypeAllocSize(UnderlyingObject->getType()->getElementType()); + gep_type_iterator GTI = gep_type_begin(*GEP); + bool Changed = false; + for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E; + ++I, ++GTI) { + if (isa(*GTI)) { + if (ZExtInst *Extended = dyn_cast(*I)) { + unsigned SrcBitWidth = + cast(Extended->getSrcTy())->getBitWidth(); + // For GEP operand zext(a), if a <= max signed value of typeof(a), then + // the sign bit of a is zero and sext(a) = zext(a). Because the GEP is + // in bounds, we know a <= ObjectSize, so the condition can be reduced + // to ObjectSize <= max signed value of typeof(a). + if (ObjectSize <= + APInt::getSignedMaxValue(SrcBitWidth).getZExtValue()) { + *I = new SExtInst(Extended->getOperand(0), Extended->getType(), + Extended->getName(), GEP); + Changed = true; + } + } + } + } + return Changed; +} + +int64_t +SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, + bool &NeedsExtraction) { NeedsExtraction = false; int64_t AccumulativeByteOffset = 0; gep_type_iterator GTI = gep_type_begin(*GEP); @@ -414,7 +660,7 @@ int64_t SeparateConstOffsetFromGEP::accumulateByteOffset( if (isa(*GTI)) { // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), DL); + ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP); if (ConstantOffset != 0) { NeedsExtraction = true; // A GEP may have multiple indices. We accumulate the extracted @@ -439,31 +685,11 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { return false; bool Changed = false; + Changed |= canonicalizeArrayIndicesToPointerSize(GEP); + Changed |= convertInBoundsZExtToSExt(GEP); - // Shortcuts integer casts. Eliminating these explicit casts can make - // subsequent optimizations more obvious: ConstantOffsetExtractor needn't - // trace into these casts. - if (GEP->isInBounds()) { - // Doing this to inbounds GEPs is safe because their indices are guaranteed - // to be non-negative and in bounds. 
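The size reasoning in convertInBoundsZExtToSExt above reduces to a small fact about narrow in-bounds indices; a standalone C++ sketch in which the i8 index and the assumed object size are illustrative:

    #include <cassert>
    #include <cstdint>

    int main() {
      // If the object has at most INT8_MAX elements, any in-bounds 8-bit index
      // has its sign bit clear, so zero- and sign-extending it give the same
      // value and the zext can be rewritten as the sext the splitter handles.
      uint8_t Index = 100;                     // in bounds for a small object
      int64_t Zext = static_cast<int64_t>(Index);
      int64_t Sext = static_cast<int64_t>(static_cast<int8_t>(Index));
      assert(Index <= INT8_MAX && Zext == Sext);
      return 0;
    }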
- gep_type_iterator GTI = gep_type_begin(*GEP); - for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - if (Operator *O = dyn_cast(GEP->getOperand(I))) { - if (O->getOpcode() == Instruction::SExt || - O->getOpcode() == Instruction::ZExt) { - GEP->setOperand(I, O->getOperand(0)); - Changed = true; - } - } - } - } - } - - const DataLayout *DL = &getAnalysis().getDataLayout(); bool NeedsExtraction; - int64_t AccumulativeByteOffset = - accumulateByteOffset(GEP, DL, NeedsExtraction); + int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction); if (!NeedsExtraction) return Changed; @@ -487,32 +713,32 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { int64_t ConstantOffset = ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP); if (ConstantOffset != 0) { - assert(NewIdx && "ConstantOffset != 0 implies NewIdx is set"); + assert(NewIdx != nullptr && + "ConstantOffset != 0 implies NewIdx is set"); GEP->setOperand(I, NewIdx); - // Clear the inbounds attribute because the new index may be off-bound. - // e.g., - // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b - // - // is transformed to: - // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 - // - // If a is -4, although the old index b is in bounds, the new index a is - // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the - // inbounds keyword is not present, the offsets are added to the base - // address with silently-wrapping two's complement arithmetic". - // Therefore, the final code will be a semantically equivalent. - // - // TODO(jingyue): do some range analysis to keep as many inbounds as - // possible. GEPs with inbounds are more friendly to alias analysis. - GEP->setIsInBounds(false); - Changed = true; } } } + // Clear the inbounds attribute because the new index may be off-bound. + // e.g., + // + // b = add i64 a, 5 + // addr = gep inbounds float* p, i64 b + // + // is transformed to: + // + // addr2 = gep float* p, i64 a + // addr = gep float* addr2, i64 5 + // + // If a is -4, although the old index b is in bounds, the new index a is + // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the + // inbounds keyword is not present, the offsets are added to the base + // address with silently-wrapping two's complement arithmetic". + // Therefore, the final code will be a semantically equivalent. + // + // TODO(jingyue): do some range analysis to keep as many inbounds as + // possible. GEPs with inbounds are more friendly to alias analysis. + GEP->setIsInBounds(false); // Offsets the base with the accumulative byte offset. // @@ -522,44 +748,67 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // => add the offset // // %gep2 ; clone of %gep - // %0 = ptrtoint %gep2 - // %1 = add %0, - // %new.gep = inttoptr %1 + // %new.gep = gep %gep2, // %gep ; will be removed // ... %gep ... // // => replace all uses of %gep with %new.gep and remove %gep // // %gep2 ; clone of %gep - // %0 = ptrtoint %gep2 - // %1 = add %0, - // %new.gep = inttoptr %1 + // %new.gep = gep %gep2, // ... %new.gep ... // - // TODO(jingyue): Emit a GEP instead of an "uglygep" - // (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep) to make the IR - // prettier and more alias analysis friendly. One caveat: if the original GEP - // ends with a StructType, we need to split the GEP at the last - // SequentialType. 
For instance, consider the following IR: + // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an + // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep): + // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the + // type of %gep. // - // %struct.S = type { float, double } - // @array = global [1024 x %struct.S] - // %p = getelementptr %array, 0, %i + 5, 1 - // - // To separate the constant 5 from %p, we would need to split %p at the last - // array index so that we have: - // - // %addr = gep %array, 0, %i - // %p = gep %addr, 5, 1 + // %gep2 ; clone of %gep + // %0 = bitcast %gep2 to i8* + // %uglygep = gep %0, + // %new.gep = bitcast %uglygep to + // ... %new.gep ... Instruction *NewGEP = GEP->clone(); NewGEP->insertBefore(GEP); + + uint64_t ElementTypeSizeOfGEP = + DL->getTypeAllocSize(GEP->getType()->getElementType()); Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); - Value *Addr = new PtrToIntInst(NewGEP, IntPtrTy, "", GEP); - Addr = BinaryOperator::CreateAdd( - Addr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "", GEP); - Addr = new IntToPtrInst(Addr, GEP->getType(), "", GEP); + if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { + // Very likely. As long as %gep is natually aligned, the byte offset we + // extracted should be a multiple of sizeof(*%gep). + // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we + // cast ElementTypeSizeOfGEP to signed. + int64_t Index = + AccumulativeByteOffset / static_cast(ElementTypeSizeOfGEP); + NewGEP = GetElementPtrInst::Create( + NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + } else { + // Unlikely but possible. For example, + // #pragma pack(1) + // struct S { + // int a[3]; + // int64 b[8]; + // }; + // #pragma pack() + // + // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After + // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is + // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of + // sizeof(int64). + // + // Emit an uglygep in this case. 
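The packed-struct scenario in the comment above can be checked directly; a standalone C++ sketch whose struct layout and indices mirror the example but are otherwise arbitrary:

    #include <cassert>
    #include <cstdint>

    #pragma pack(push, 1)
    struct S {
      int32_t a[3];
      int64_t b[8];
    };
    #pragma pack(pop)

    int main() {
      // &Arr[i + 1].b[j + 3] sits sizeof(S) + 3 * sizeof(int64_t) = 100 bytes
      // past &Arr[i].b[j]; 100 is not a multiple of sizeof(int64_t), so the
      // splitter must fall back to byte-based (i8) addressing, the "uglygep".
      static S Arr[4];
      int i = 1, j = 2;
      uintptr_t Split = reinterpret_cast<uintptr_t>(&Arr[i].b[j]);
      uintptr_t Orig = reinterpret_cast<uintptr_t>(&Arr[i + 1].b[j + 3]);
      assert(Orig - Split == sizeof(S) + 3 * sizeof(int64_t));
      assert((Orig - Split) % sizeof(int64_t) != 0);
      return 0;
    }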
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), + GEP->getPointerAddressSpace()); + NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); + NewGEP = GetElementPtrInst::Create( + NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), + "uglygep", GEP); + if (GEP->getType() != I8PtrTy) + NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); + } - GEP->replaceAllUsesWith(Addr); + GEP->replaceAllUsesWith(NewGEP); GEP->eraseFromParent(); return true; diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 946af800776f..05b9892470bb 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -64,6 +64,7 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -318,8 +319,8 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { break; } if (SafeToTail) { - F.getContext().emitOptimizationRemark( - "tailcallelim", F, CI->getDebugLoc(), + emitOptimizationRemark( + F.getContext(), "tailcallelim", F, CI->getDebugLoc(), "marked this readnone call a tail call candidate"); CI->setTailCall(); Modified = true; @@ -365,9 +366,9 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { if (Visited[CI->getParent()] != ESCAPED) { // If the escape point was part way through the block, calls after the // escape point wouldn't have been put into DeferredTails. - F.getContext().emitOptimizationRemark( - "tailcallelim", F, CI->getDebugLoc(), - "marked this call a tail call candidate"); + emitOptimizationRemark(F.getContext(), "tailcallelim", F, + CI->getDebugLoc(), + "marked this call a tail call candidate"); CI->setTailCall(); Modified = true; } else { @@ -678,9 +679,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *BB = Ret->getParent(); Function *F = BB->getParent(); - F->getContext().emitOptimizationRemark( - "tailcallelim", *F, CI->getDebugLoc(), - "transforming tail recursion to loop"); + emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(), + "transforming tail recursion to loop"); // OK! We can transform this tail call. If this is the first one found, // create the new entry block, allowing us to branch back to the old entry. 
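The "transforming tail recursion to loop" remark emitted above corresponds to the classic rewrite sketched below; a standalone C++ analogy rather than the pass itself, with GcdRec and GcdLoop as invented names:

    #include <cassert>

    // Tail-recursive form: the recursive call is the last action taken.
    static int GcdRec(int A, int B) { return B == 0 ? A : GcdRec(B, A % B); }

    // After tail-recursion elimination the call becomes a branch back to the
    // (old) entry, i.e. a loop that just updates the would-be arguments.
    static int GcdLoop(int A, int B) {
      while (B != 0) {
        int R = A % B;
        A = B;
        B = R;
      }
      return A;
    }

    int main() {
      assert(GcdRec(54, 24) == GcdLoop(54, 24));
      assert(GcdRec(7, 13) == GcdLoop(7, 13));
      return 0;
    }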
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index eb67db1f85db..3f75b3e677ee 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -107,7 +107,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { GlobalAlias *GA = cast(VMap[I]); - if (const GlobalObject *C = I->getAliasee()) + if (const Constant *C = I->getAliasee()) GA->setAliasee(cast(MapValue(C, VMap))); } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index e01d0c38ec40..d93db5dc1ef9 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -466,7 +466,13 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { DebugLoc DL = BI->getDebugLoc(); - if (!DL.isUnknown()) { + if (DL.isUnknown()) { + // If the inlined instruction has no line number, make it look as if it + // originates from the call location. This is important for + // ((__always_inline__, __nodebug__)) functions which must use caller + // location for all instructions in their function body. + BI->setDebugLoc(TheCallDL); + } else { BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext())); if (DbgValueInst *DVI = dyn_cast(BI)) { LLVMContext &Ctx = BI->getContext(); diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index faaab5c12e4c..16975b9e6374 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -237,9 +238,9 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (CompletelyUnroll) { DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() << " with trip count " << TripCount << "!\n"); - Ctx.emitOptimizationRemark(DEBUG_TYPE, *F, LoopLoc, - Twine("completely unrolled loop with ") + - Twine(TripCount) + " iterations"); + emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, + Twine("completely unrolled loop with ") + + Twine(TripCount) + " iterations"); } else { DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " << Count); @@ -255,7 +256,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, DiagMsg.concat(" with run-time trip count"); } DEBUG(dbgs() << "!\n"); - Ctx.emitOptimizationRemark(DEBUG_TYPE, *F, LoopLoc, DiagMsg); + emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg); } bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); @@ -486,6 +487,15 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (OuterL) { ScalarEvolution *SE = PP->getAnalysisIfAvailable(); simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE); + + // LCSSA must be performed on the outermost affected loop. The unrolled + // loop's last loop latch is guaranteed to be in the outermost loop after + // deleteLoopFromQueue updates LoopInfo. 
+ Loop *LatchLoop = LI->getLoopFor(Latches.back()); + if (!OuterL->contains(LatchLoop)) + while (OuterL->getParentLoop() != LatchLoop) + OuterL = OuterL->getParentLoop(); + formLCSSARecursively(*OuterL, *DT, SE); } } diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 9ef694c4cea3..eac693bdf8a4 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -14,11 +14,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/CFG.h" #include "llvm/Pass.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -58,16 +60,18 @@ namespace { Low(low), High(high), BB(bb) { } }; - typedef std::vector CaseVector; + typedef std::vector CaseVector; typedef std::vector::iterator CaseItr; private: void processSwitchInst(SwitchInst *SI); - BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val, - BasicBlock* OrigBlock, BasicBlock* Default); - BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val, - BasicBlock* OrigBlock, BasicBlock* Default); - unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); + BasicBlock *switchConvert(CaseItr Begin, CaseItr End, + ConstantInt *LowerBound, ConstantInt *UpperBound, + Value *Val, BasicBlock *OrigBlock, + BasicBlock *Default); + BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock, + BasicBlock *Default); + unsigned Clusterify(CaseVector &Cases, SwitchInst *SI); }; /// The comparison function for sorting the switch case values in the vector. @@ -129,15 +133,26 @@ static raw_ostream& operator<<(raw_ostream &O, // switchConvert - Convert the switch statement into a binary lookup of // the case values. The function recursively builds this tree. -// -BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, - Value* Val, BasicBlock* OrigBlock, - BasicBlock* Default) -{ +// LowerBound and UpperBound are used to keep track of the bounds for Val +// that have already been checked by a block emitted by one of the previous +// calls to switchConvert in the call stack. +BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, + ConstantInt *LowerBound, + ConstantInt *UpperBound, Value *Val, + BasicBlock *OrigBlock, + BasicBlock *Default) { unsigned Size = End - Begin; - if (Size == 1) + if (Size == 1) { + // Check if the Case Range is perfectly squeezed in between + // already checked Upper and Lower bounds. If it is then we can avoid + // emitting the code that checks if the value actually falls in the range + // because the bounds already tell us so. 
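switchConvert's recursive binary lookup above can be modeled as ordinary code; the leaf's range test is the part that the new LowerBound/UpperBound bookkeeping lets the pass omit when the already-checked bounds imply it. A hypothetical standalone sketch; CaseRange, Lower and the sample targets are invented for the illustration and are not LLVM's types:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct CaseRange { int Low, High, Target; };

    // Binary lookup over sorted, disjoint case ranges: pivot on the middle
    // range's lower bound and recurse into one half, as switchConvert does.
    static int Lower(const std::vector<CaseRange> &Cases, size_t Begin,
                     size_t End, int Val, int DefaultTarget) {
      size_t Size = End - Begin;
      if (Size == 1) {
        const CaseRange &C = Cases[Begin];
        return (Val >= C.Low && Val <= C.High) ? C.Target : DefaultTarget;
      }
      size_t Mid = Size / 2;
      if (Val < Cases[Begin + Mid].Low)
        return Lower(Cases, Begin, Begin + Mid, Val, DefaultTarget);
      return Lower(Cases, Begin + Mid, End, Val, DefaultTarget);
    }

    int main() {
      std::vector<CaseRange> Cases = {{0, 0, 10}, {2, 4, 20}, {7, 9, 30}};
      assert(Lower(Cases, 0, Cases.size(), 3, /*Default=*/-1) == 20);
      assert(Lower(Cases, 0, Cases.size(), 5, /*Default=*/-1) == -1);
      assert(Lower(Cases, 0, Cases.size(), 8, /*Default=*/-1) == 30);
      return 0;
    }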
+ if (Begin->Low == LowerBound && Begin->High == UpperBound) { + return Begin->BB; + } return newLeafBlock(*Begin, Val, OrigBlock, Default); + } unsigned Mid = Size / 2; std::vector LHS(Begin, Begin + Mid); @@ -145,15 +160,50 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, std::vector RHS(Begin + Mid, End); DEBUG(dbgs() << "RHS: " << RHS << "\n"); - CaseRange& Pivot = *(Begin + Mid); - DEBUG(dbgs() << "Pivot ==> " - << cast(Pivot.Low)->getValue() << " -" - << cast(Pivot.High)->getValue() << "\n"); + CaseRange &Pivot = *(Begin + Mid); + DEBUG(dbgs() << "Pivot ==> " + << cast(Pivot.Low)->getValue() + << " -" << cast(Pivot.High)->getValue() << "\n"); + + // NewLowerBound here should never be the integer minimal value. + // This is because it is computed from a case range that is never + // the smallest, so there is always a case range that has at least + // a smaller value. + ConstantInt *NewLowerBound = cast(Pivot.Low); + ConstantInt *NewUpperBound; + + // If we don't have a Default block then it means that we can never + // have a value outside of a case range, so set the UpperBound to the highest + // value in the LHS part of the case ranges. + if (Default != nullptr) { + // Because NewLowerBound is never the smallest representable integer + // it is safe here to subtract one. + NewUpperBound = ConstantInt::get(NewLowerBound->getContext(), + NewLowerBound->getValue() - 1); + } else { + CaseItr LastLHS = LHS.begin() + LHS.size() - 1; + NewUpperBound = cast(LastLHS->High); + } - BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val, - OrigBlock, Default); - BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val, - OrigBlock, Default); + DEBUG(dbgs() << "LHS Bounds ==> "; + if (LowerBound) { + dbgs() << cast(LowerBound)->getSExtValue(); + } else { + dbgs() << "NONE"; + } + dbgs() << " - " << NewUpperBound->getSExtValue() << "\n"; + dbgs() << "RHS Bounds ==> "; + dbgs() << NewLowerBound->getSExtValue() << " - "; + if (UpperBound) { + dbgs() << cast(UpperBound)->getSExtValue() << "\n"; + } else { + dbgs() << "NONE\n"; + }); + + BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound, + NewUpperBound, Val, OrigBlock, Default); + BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound, + UpperBound, Val, OrigBlock, Default); // Create a new node that checks if the value is < pivot. Go to the // left branch if it is and right branch if not. @@ -291,13 +341,19 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { return; } + const bool DefaultIsUnreachable = + Default->size() == 1 && isa(Default->getTerminator()); // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. - BasicBlock* NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); - - BranchInst::Create(Default, NewDefault); - + // if the default block is set as an unreachable we avoid creating one + // because will never be a valid target. + BasicBlock *NewDefault = nullptr; + if (!DefaultIsUnreachable) { + NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); + F->getBasicBlockList().insert(Default, NewDefault); + + BranchInst::Create(Default, NewDefault); + } // If there is an entry in any PHI nodes for the default edge, make sure // to update them as well. 
for (BasicBlock::iterator I = Default->begin(); isa(I); ++I) { @@ -316,12 +372,31 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { DEBUG(dbgs() << "Cases: " << Cases << "\n"); (void)numCmps; - BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val, - OrigBlock, NewDefault); + ConstantInt *UpperBound = nullptr; + ConstantInt *LowerBound = nullptr; + + // Optimize the condition where Default is an unreachable block. In this case + // we can make the bounds tightly fitted around the case value ranges, + // because we know that the value passed to the switch should always be + // exactly one of the case values. + if (DefaultIsUnreachable) { + CaseItr LastCase = Cases.begin() + Cases.size() - 1; + UpperBound = cast(LastCase->High); + LowerBound = cast(Cases.begin()->Low); + } + BasicBlock *SwitchBlock = + switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val, + OrigBlock, NewDefault); // Branch to our shiny new if-then stuff... BranchInst::Create(SwitchBlock, OrigBlock); // We are now done with the switch instruction, delete it. CurBlock->getInstList().erase(SI); + + pred_iterator PI = pred_begin(Default), E = pred_end(Default); + // If the Default block has no more predecessors just remove it + if (PI == E) { + DeleteDeadBlock(Default); + } } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 150dbdd4ec40..e155daf6fcce 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -227,6 +227,9 @@ static unsigned ComputeSpeculationCost(const User *I) { case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: + case Instruction::BitCast: + case Instruction::ExtractElement: + case Instruction::InsertElement: return 1; // These are all cheap. 
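Marking truncations, bitcasts and vector element moves as cost 1 above allows SimplifyCFG to speculate them so that a conditional block collapses into a select; roughly the source-level rewrite below, a standalone C++ sketch assuming the speculated operation is side-effect free (Before and After are illustrative names):

    #include <cassert>
    #include <cstdint>

    static int32_t Before(bool C, int64_t X, int32_t Y) {
      int32_t R;
      if (C)
        R = static_cast<int32_t>(X);   // a cheap truncation, safe to hoist
      else
        R = Y;
      return R;
    }

    static int32_t After(bool C, int64_t X, int32_t Y) {
      int32_t T = static_cast<int32_t>(X);  // speculated unconditionally
      return C ? T : Y;                     // the branch becomes a select
    }

    int main() {
      assert(Before(true, 70000, 5) == After(true, 70000, 5));
      assert(Before(false, 70000, 5) == After(false, 70000, 5));
      return 0;
    }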
case Instruction::Call: diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 0fd185816fd5..3b61bb575a8d 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -789,9 +790,9 @@ struct StrLenOpt : public LibCallOptimization { uint64_t LenTrue = GetStringLength(SI->getTrueValue()); uint64_t LenFalse = GetStringLength(SI->getFalseValue()); if (LenTrue && LenFalse) { - Context->emitOptimizationRemark( - "simplify-libcalls", *Caller, SI->getDebugLoc(), - "folded strlen(select) to select of constants"); + emitOptimizationRemark(*Context, "simplify-libcalls", *Caller, + SI->getDebugLoc(), + "folded strlen(select) to select of constants"); return B.CreateSelect(SI->getCondition(), ConstantInt::get(CI->getType(), LenTrue-1), ConstantInt::get(CI->getType(), LenFalse-1)); diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Transforms/Utils/SpecialCaseList.cpp index 2c6fcd14b9e8..45a2b618a710 100644 --- a/lib/Transforms/Utils/SpecialCaseList.cpp +++ b/lib/Transforms/Utils/SpecialCaseList.cpp @@ -26,8 +26,8 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" #include +#include #include namespace llvm { @@ -55,7 +55,7 @@ SpecialCaseList *SpecialCaseList::create( if (Path.empty()) return new SpecialCaseList(); std::unique_ptr File; - if (error_code EC = MemoryBuffer::getFile(Path, File)) { + if (std::error_code EC = MemoryBuffer::getFile(Path, File)) { Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str(); return nullptr; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index d57fae34dd61..15d4c1c79d2b 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -1213,10 +1214,10 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); // Report the unrolling decision. - F->getContext().emitOptimizationRemark( - DEBUG_TYPE, *F, L->getStartLoc(), - Twine("unrolled with interleaving factor " + Twine(UF) + - " (vectorization not beneficial)")); + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("unrolled with interleaving factor " + + Twine(UF) + + " (vectorization not beneficial)")); // We decided not to vectorize, but we may want to unroll. InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); @@ -1228,8 +1229,8 @@ struct LoopVectorize : public FunctionPass { ++LoopsVectorized; // Report the vectorization decision. 
- F->getContext().emitOptimizationRemark( - DEBUG_TYPE, *F, L->getStartLoc(), + emitOptimizationRemark( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + ", unrolling interleave factor: " + Twine(UF) + ")"); } @@ -1908,20 +1909,23 @@ void InnerLoopVectorizer::createEmptyLoop() { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass (may consist of multiple blocks). - / | - / v - | [ ] <-- vector pre header. - | | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - \ v - >[ ] <--- middle-block. - / | - / v - | [ ] <--- new preheader. + [ ] <-- Back-edge taken count overflow check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + || | + || v + || [ ] \ + || [ ]_| <-- vector loop. + || | + | \ v + | >[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. | | | v | [ ] \ @@ -1935,6 +1939,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(BypassBlock && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops @@ -1957,15 +1962,31 @@ void InnerLoopVectorizer::createEmptyLoop() { IdxTy->getPrimitiveSizeInBits()) ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); + const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(ExitCount->getType(), 1)); + ExitCount = SE->getAddExpr(BackedgeTakeCount, + SE->getConstant(BackedgeTakeCount->getType(), 1)); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, "induction"); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + Value *BackedgeCount = + Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), + BypassBlock->getTerminator()); + if (BackedgeCount->getType()->isPointerTy()) + BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, + "backedge.ptrcnt.to.int", + BypassBlock->getTerminator()); + Instruction *CheckBCOverflow = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, + Constant::getAllOnesValue(BackedgeCount->getType()), + "backedge.overflow", BypassBlock->getTerminator()); + // Count holds the overall loop count (N). Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), BypassBlock->getTerminator()); @@ -1979,7 +2000,6 @@ void InnerLoopVectorizer::createEmptyLoop() { IdxTy): ConstantInt::get(IdxTy, 0); - assert(BypassBlock && "Invalid loop structure"); LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. @@ -2053,24 +2073,39 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *LastBypassBlock = BypassBlock; + // Generate code to check that the loops trip count that we computed by adding + // one to the backedge-taken count will not overflow. 
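The overflow guarded against above is plain unsigned wraparound when turning a backedge-taken count into a trip count. A standalone C++ sketch with an i32 count; the width is illustrative:

    #include <cassert>
    #include <cstdint>

    int main() {
      // When the backedge-taken count is the maximal i32 value, the trip count
      // BackedgeTakenCount + 1 wraps to 0, so the new check branches straight
      // to the scalar remainder loop instead of entering the vector body.
      uint32_t BackedgeTakenCount = UINT32_MAX;
      uint32_t TripCount = BackedgeTakenCount + 1;                // wraps to 0
      bool BackedgeOverflow = (BackedgeTakenCount == UINT32_MAX); // the check
      assert(TripCount == 0 && BackedgeOverflow);
      return 0;
    }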
+ { + auto PastOverflowCheck = std::next(BasicBlock::iterator(CheckBCOverflow)); + BasicBlock *CheckBlock = + LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + LoopBypassBlocks.push_back(CheckBlock); + Instruction *OldTerm = LastBypassBlock->getTerminator(); + BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); + OldTerm->eraseFromParent(); + LastBypassBlock = CheckBlock; + } + // Generate the code to check that the strides we assumed to be one are really // one. We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. Instruction *StrideCheck; Instruction *FirstCheckInst; std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(BypassBlock->getTerminator()); + addStrideCheck(LastBypassBlock->getTerminator()); if (StrideCheck) { // Create a new block containing the stride check. BasicBlock *CheckBlock = - BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); + LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); if (ParentLoop) ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch // for the "few elements case". - Instruction *OldTerm = BypassBlock->getTerminator(); + Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); OldTerm->eraseFromParent(); @@ -2133,6 +2168,19 @@ void InnerLoopVectorizer::createEmptyLoop() { PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", MiddleBlock->getTerminator()) : nullptr; + // Create phi nodes to merge from the backedge-taken check block. + PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + ScalarPH->getTerminator()); + BCResumeVal->addIncoming(ResumeVal, MiddleBlock); + + PHINode *BCTruncResumeVal = nullptr; + if (OrigPhi == OldInduction) { + BCTruncResumeVal = + PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", + ScalarPH->getTerminator()); + BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); + } + Value *EndValue = nullptr; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: @@ -2149,10 +2197,12 @@ void InnerLoopVectorizer::createEmptyLoop() { BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); TruncResumeVal->addIncoming(EndValue, VecBody); + BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); + // We know what the end value is. EndValue = IdxEndRoundDown; // We also know which PHI node holds it. @@ -2198,7 +2248,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { if (OrigPhi == OldInduction) ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); else @@ -2208,11 +2258,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // Fix the scalar body counter (PHI node). 
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - // The old inductions phi node in the scalar body needs the truncated value. - if (OrigPhi == OldInduction) - OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); - else - OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + + // The old induction's phi node in the scalar body needs the truncated + // value. + if (OrigPhi == OldInduction) { + BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); + OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); + } else { + BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); + } } // If we are generating a new induction variable then we also need to @@ -2223,7 +2278,7 @@ void InnerLoopVectorizer::createEmptyLoop() { assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } @@ -2493,7 +2548,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator()); + Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); @@ -2567,7 +2622,7 @@ void InnerLoopVectorizer::vectorizeLoop() { VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody.back()); @@ -2625,6 +2680,13 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.getInt32(0)); } + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. + PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); + BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the @@ -2654,7 +2716,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. 
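The bc.merge.rdx PHI introduced above merges the reduction's start value (taken when the vector loop is bypassed) with the partial result coming out of the middle block. A rough standalone C++ picture of the resulting control flow; VF, the array and SumVectorized are illustrative, not the generated IR:

    #include <cassert>
    #include <cstddef>

    static int SumVectorized(const int *A, size_t N) {
      const size_t VF = 4;                      // pretend vectorization factor
      size_t VecEnd = N - N % VF;
      bool Bypassed = (VecEnd == 0);            // vector body never entered
      int Rdx = 0;
      for (size_t I = 0; I < VecEnd; I += VF)   // the "vector" body
        Rdx += A[I] + A[I + 1] + A[I + 2] + A[I + 3];
      int Merged = Bypassed ? 0 : Rdx;          // plays the role of bc.merge.rdx
      for (size_t I = VecEnd; I < N; ++I)       // the scalar remainder loop
        Merged += A[I];
      return Merged;
    }

    int main() {
      int A[7] = {1, 2, 3, 4, 5, 6, 7};
      assert(SumVectorized(A, 7) == 28);
      assert(SumVectorized(A, 3) == 6);         // remainder-only execution
      return 0;
    }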
@@ -3061,9 +3123,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { scalarizeInstruction(it); break; default: + bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (HasScalarOpd && i == 1) { + Args.push_back(CI->getArgOperand(i)); + continue; + } VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } @@ -3111,8 +3178,8 @@ void InnerLoopVectorizer::updateAnalysis() { } } - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); - DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -3412,6 +3479,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { return false; } + // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the + // second argument is the same (i.e. loop invariant) + if (CI && + hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { + if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { + DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); + return false; + } + } + // Check that the instruction return type is vectorizable. // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index e13ba956c398..f18202be167e 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -914,8 +914,20 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right); - buildTree_rec(Left, Depth + 1); - buildTree_rec(Right, Depth + 1); + BasicBlock *LeftBB = getSameBlock(Left); + BasicBlock *RightBB = getSameBlock(Right); + // If we have common uses on separate paths in the tree make sure we + // process the one with greater common depth first. + // We can use block numbering to determine the subtree traversal as + // earler user has to come in between the common use and the later user. + if (LeftBB && RightBB && LeftBB == RightBB && + getLastIndex(Right) > getLastIndex(Left)) { + buildTree_rec(Right, Depth + 1); + buildTree_rec(Left, Depth + 1); + } else { + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + } return; } @@ -929,6 +941,51 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } return; } + case Instruction::GetElementPtr: { + // We don't combine GEPs with complicated (nested) indexing. + for (unsigned j = 0; j < VL.size(); ++j) { + if (cast(VL[j])->getNumOperands() != 2) { + DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); + newTreeEntry(VL, false); + return; + } + } + + // We can't combine several GEPs into one vector if they operate on + // different types. + Type *Ty0 = cast(VL0)->getOperand(0)->getType(); + for (unsigned j = 0; j < VL.size(); ++j) { + Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + if (Ty0 != CurTy) { + DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); + newTreeEntry(VL, false); + return; + } + } + + // We don't combine GEPs with non-constant indexes. 
+ for (unsigned j = 0; j < VL.size(); ++j) { + auto Op = cast(VL[j])->getOperand(1); + if (!isa(Op)) { + DEBUG( + dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); + newTreeEntry(VL, false); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + for (unsigned i = 0, e = 2; i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) @@ -961,9 +1018,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } - Function *Int = CI->getCalledFunction(); - + Value *A1I = nullptr; + if (hasVectorInstrinsicScalarOpd(ID, 1)) + A1I = CI->getArgOperand(1); for (unsigned i = 1, e = VL.size(); i != e; ++i) { CallInst *CI2 = dyn_cast(VL[i]); if (!CI2 || CI2->getCalledFunction() != Int || @@ -973,6 +1031,18 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { << "\n"); return; } + // ctlz,cttz and powi are special intrinsics whose second argument + // should be same in order for them to be vectorized. + if (hasVectorInstrinsicScalarOpd(ID, 1)) { + Value *A1J = CI2->getArgOperand(1); + if (A1I != A1J) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument "<< A1I<<"!=" << A1J + << "\n"); + return; + } + } } newTreeEntry(VL, true); @@ -1121,6 +1191,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return VecCost - ScalarCost; } + case Instruction::GetElementPtr: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_UniformConstantValue; + + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); + int VecCost = + TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); + + return VecCost - ScalarCost; + } case Instruction::Load: { // Cost of wide load - cost of scalar loads. 
int ScalarLdCost = VecTy->getNumElements() * @@ -1649,12 +1733,52 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = S; return propagateMetadata(S, E->Scalars); } + case Instruction::GetElementPtr: { + setInsertPointAfterBundle(E->Scalars); + + ValueList Op0VL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + Op0VL.push_back(cast(E->Scalars[i])->getOperand(0)); + + Value *Op0 = vectorizeTree(Op0VL); + + std::vector OpVecs; + for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; + ++j) { + ValueList OpVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + OpVL.push_back(cast(E->Scalars[i])->getOperand(j)); + + Value *OpVec = vectorizeTree(OpVL); + OpVecs.push_back(OpVec); + } + + Value *V = Builder.CreateGEP(Op0, OpVecs); + E->VectorizedValue = V; + + if (Instruction *I = dyn_cast(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } case Instruction::Call: { CallInst *CI = cast(VL0); setInsertPointAfterBundle(E->Scalars); + Function *FI; + Intrinsic::ID IID = Intrinsic::not_intrinsic; + if (CI && (FI = CI->getCalledFunction())) { + IID = (Intrinsic::ID) FI->getIntrinsicID(); + } std::vector OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; + // ctlz,cttz and powi are special intrinsics whose second argument is + // a scalar. This argument should not be vectorized. + if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { + CallInst *CEI = cast(E->Scalars[0]); + OpVecs.push_back(CEI->getArgOperand(j)); + continue; + } for (int i = 0, e = E->Scalars.size(); i < e; ++i) { CallInst *CEI = cast(E->Scalars[i]); OpVL.push_back(CEI->getArgOperand(j)); diff --git a/test/Analysis/CostModel/AArch64/lit.local.cfg b/test/Analysis/CostModel/AArch64/lit.local.cfg new file mode 100644 index 000000000000..7184443994b6 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AArch64' in config.root.targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/ARM64/select.ll b/test/Analysis/CostModel/AArch64/select.ll similarity index 100% rename from test/Analysis/CostModel/ARM64/select.ll rename to test/Analysis/CostModel/AArch64/select.ll diff --git a/test/Analysis/CostModel/ARM64/store.ll b/test/Analysis/CostModel/AArch64/store.ll similarity index 100% rename from test/Analysis/CostModel/ARM64/store.ll rename to test/Analysis/CostModel/AArch64/store.ll diff --git a/test/Analysis/CostModel/ARM/lit.local.cfg b/test/Analysis/CostModel/ARM/lit.local.cfg index 8a3ba96497e7..98c6700c209d 100644 --- a/test/Analysis/CostModel/ARM/lit.local.cfg +++ b/test/Analysis/CostModel/ARM/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'ARM' in targets: +if not 'ARM' in config.root.targets: config.unsupported = True diff --git a/test/Analysis/CostModel/ARM64/lit.local.cfg b/test/Analysis/CostModel/ARM64/lit.local.cfg deleted file mode 100644 index 84ac9811f012..000000000000 --- a/test/Analysis/CostModel/ARM64/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -targets = set(config.root.targets_to_build.split()) -if not 'ARM64' in targets: - config.unsupported = True diff --git a/test/Analysis/CostModel/PowerPC/lit.local.cfg b/test/Analysis/CostModel/PowerPC/lit.local.cfg index 2e463005586f..5d33887ff0a4 100644 --- a/test/Analysis/CostModel/PowerPC/lit.local.cfg +++ b/test/Analysis/CostModel/PowerPC/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'PowerPC' in targets: +if not 'PowerPC' in config.root.targets: config.unsupported = 
True diff --git a/test/Analysis/CostModel/X86/lit.local.cfg b/test/Analysis/CostModel/X86/lit.local.cfg index ba763cf03ffc..e71f3cc4c41e 100644 --- a/test/Analysis/CostModel/X86/lit.local.cfg +++ b/test/Analysis/CostModel/X86/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'X86' in targets: +if not 'X86' in config.root.targets: config.unsupported = True diff --git a/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll b/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll new file mode 100644 index 000000000000..01a4b96b11a0 --- /dev/null +++ b/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -analyze -delinearize | FileCheck %s + +; Derived from the following code: +; +; void foo(long n, long m, long b, double A[n][m]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; A[2i+b][2j] = 1.0; +; } + +; AddRec: {{((%m * %b * sizeof(double)) + %A),+,(2 * %m * sizeof(double))}<%for.i>,+,(2 * sizeof(double))}<%for.j> +; CHECK: Base offset: %A +; CHECK: ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes. +; CHECK: ArrayRef[{%b,+,2}<%for.i>][{0,+,2}<%for.j>] + + +define void @foo(i64 %n, i64 %m, i64 %b, double* %A) { +entry: + br label %for.i + +for.i: + %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ] + %outerdim = mul nsw i64 %i, 2 + %outerdim2 = add nsw i64 %outerdim, %b + %tmp = mul nsw i64 %outerdim2, %m + br label %for.j + +for.j: + %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j ] + %prodj = mul i64 %j, 2 + %vlaarrayidx.sum = add i64 %prodj, %tmp + %arrayidx = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum + store double 1.0, double* %arrayidx + %j.inc = add nsw i64 %j, 1 + %j.exitcond = icmp eq i64 %j.inc, %m + br i1 %j.exitcond, label %for.i.inc, label %for.j + +for.i.inc: + %i.inc = add nsw i64 %i, 1 + %i.exitcond = icmp eq i64 %i.inc, %n + br i1 %i.exitcond, label %end, label %for.i + +end: + ret void +} diff --git a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll index 48bec08190d9..5a88c4ce4eb1 100644 --- a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll +++ b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll @@ -8,6 +8,15 @@ ; A[i][j] = 1.0; ; } +; Inst: %val = load double* %arrayidx +; In Loop with Header: for.j +; AddRec: {{0,+,(%m * sizeof(double))}<%for.i>,+,sizeof(double)}<%for.j> +; Base offset: %A +; ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes. +; ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>] + +; Inst: store double %val, double* %arrayidx +; In Loop with Header: for.j ; AddRec: {{%A,+,(8 * %m)}<%for.i>,+,8}<%for.j> ; CHECK: Base offset: %A ; CHECK: ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes. @@ -26,7 +35,8 @@ for.j: %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j ] %vlaarrayidx.sum = add i64 %j, %tmp %arrayidx = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum - store double 1.0, double* %arrayidx + %val = load double* %arrayidx + store double %val, double* %arrayidx %j.inc = add nsw i64 %j, 1 %j.exitcond = icmp eq i64 %j.inc, %m br i1 %j.exitcond, label %for.i.inc, label %for.j diff --git a/test/Analysis/DependenceAnalysis/GCD.ll b/test/Analysis/DependenceAnalysis/GCD.ll index fd9173b924c2..7eca18ed262c 100644 --- a/test/Analysis/DependenceAnalysis/GCD.ll +++ b/test/Analysis/DependenceAnalysis/GCD.ll @@ -269,10 +269,10 @@ entry: ; CHECK: da analyze - none! 
; DELIN: 'Dependence Analysis' for function 'gcd4' -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! @@ -339,10 +339,10 @@ entry: ; CHECK: da analyze - none! ; DELIN: 'Dependence Analysis' for function 'gcd5' -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - flow [<> *]! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! @@ -410,10 +410,10 @@ entry: ; CHECK: da analyze - output [* *]! ; DELIN: 'Dependence Analysis' for function 'gcd6' +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - none! -; DELIN: da analyze - flow [=> =>|<]! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - output [* *]! diff --git a/test/Analysis/ScalarEvolution/max-trip-count.ll b/test/Analysis/ScalarEvolution/max-trip-count.ll index 0cdbdf57a64c..31f06a46ad00 100644 --- a/test/Analysis/ScalarEvolution/max-trip-count.ll +++ b/test/Analysis/ScalarEvolution/max-trip-count.ll @@ -98,3 +98,112 @@ for.end: ; preds = %for.cond.for.end_cr ; CHECK: Determining loop execution counts for: @test ; CHECK-NEXT: backedge-taken count is ; CHECK-NEXT: max backedge-taken count is -1 + +; PR19799: Indvars miscompile due to an incorrect max backedge taken count from SCEV. +; CHECK-LABEL: @pr19799 +; CHECK: Loop %for.body.i: Unpredictable backedge-taken count. +; CHECK: Loop %for.body.i: max backedge-taken count is 1 +@a = common global i32 0, align 4 + +define i32 @pr19799() { +entry: + store i32 -1, i32* @a, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.cond.i, %entry + %storemerge1.i = phi i32 [ -1, %entry ], [ %add.i.i, %for.cond.i ] + %tobool.i = icmp eq i32 %storemerge1.i, 0 + %add.i.i = add nsw i32 %storemerge1.i, 2 + br i1 %tobool.i, label %bar.exit, label %for.cond.i + +for.cond.i: ; preds = %for.body.i + store i32 %add.i.i, i32* @a, align 4 + %cmp.i = icmp slt i32 %storemerge1.i, 0 + br i1 %cmp.i, label %for.body.i, label %bar.exit + +bar.exit: ; preds = %for.cond.i, %for.body.i + ret i32 0 +} + +; PR18886: Indvars miscompile due to an incorrect max backedge taken count from SCEV. +; CHECK-LABEL: @pr18886 +; CHECK: Loop %for.body: Unpredictable backedge-taken count. +; CHECK: Loop %for.body: max backedge-taken count is 3 +@aa = global i64 0, align 8 + +define i32 @pr18886() { +entry: + store i64 -21, i64* @aa, align 8 + br label %for.body + +for.body: + %storemerge1 = phi i64 [ -21, %entry ], [ %add, %for.cond ] + %tobool = icmp eq i64 %storemerge1, 0 + %add = add nsw i64 %storemerge1, 8 + br i1 %tobool, label %return, label %for.cond + +for.cond: + store i64 %add, i64* @aa, align 8 + %cmp = icmp slt i64 %add, 9 + br i1 %cmp, label %for.body, label %return + +return: + %retval.0 = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + ret i32 %retval.0 +} + +; Here we have a must-exit loop latch that is not computable and a +; may-exit early exit that can only have one non-exiting iteration +; before the check is forever skipped. +; +; CHECK-LABEL: @cannot_compute_mustexit +; CHECK: Loop %for.body.i: Unpredictable backedge-taken count. +; CHECK: Loop %for.body.i: Unpredictable max backedge-taken count. 
+@b = common global i32 0, align 4 + +define i32 @cannot_compute_mustexit() { +entry: + store i32 -1, i32* @a, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.cond.i, %entry + %storemerge1.i = phi i32 [ -1, %entry ], [ %add.i.i, %for.cond.i ] + %tobool.i = icmp eq i32 %storemerge1.i, 0 + %add.i.i = add nsw i32 %storemerge1.i, 2 + br i1 %tobool.i, label %bar.exit, label %for.cond.i + +for.cond.i: ; preds = %for.body.i + store i32 %add.i.i, i32* @a, align 4 + %ld = load volatile i32* @b + %cmp.i = icmp ne i32 %ld, 0 + br i1 %cmp.i, label %for.body.i, label %bar.exit + +bar.exit: ; preds = %for.cond.i, %for.body.i + ret i32 0 +} + +; This loop has two must-exits, both of which dominate the latch. The +; MaxBECount should be the minimum of them. +; +; CHECK-LABEL: @two_mustexit +; CHECK: Loop %for.body.i: Unpredictable backedge-taken count. +; CHECK: Loop %for.body.i: max backedge-taken count is 1 +define i32 @two_mustexit() { +entry: + store i32 -1, i32* @a, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.cond.i, %entry + %storemerge1.i = phi i32 [ -1, %entry ], [ %add.i.i, %for.cond.i ] + %tobool.i = icmp sgt i32 %storemerge1.i, 0 + %add.i.i = add nsw i32 %storemerge1.i, 2 + br i1 %tobool.i, label %bar.exit, label %for.cond.i + +for.cond.i: ; preds = %for.body.i + store i32 %add.i.i, i32* @a, align 4 + %cmp.i = icmp slt i32 %storemerge1.i, 3 + br i1 %cmp.i, label %for.body.i, label %bar.exit + +bar.exit: ; preds = %for.cond.i, %for.body.i + ret i32 0 +} diff --git a/test/Assembler/addrspacecast-alias.ll b/test/Assembler/addrspacecast-alias.ll index 052a1414eabc..d7516599dfe2 100644 --- a/test/Assembler/addrspacecast-alias.ll +++ b/test/Assembler/addrspacecast-alias.ll @@ -3,5 +3,5 @@ ; Test that global aliases are allowed to be constant addrspacecast @i = internal addrspace(1) global i8 42 -@ia = alias internal addrspace(2) i8 addrspace(3)*, i8 addrspace(1)* @i -; CHECK: @ia = alias internal addrspace(2) i8 addrspace(3)*, i8 addrspace(1)* @i +@ia = alias internal addrspacecast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(3)*) +; CHECK: @ia = alias internal addrspacecast (i8 addrspace(2)* addrspace(1)* bitcast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(1)*) to i8 addrspace(2)* addrspace(3)*) diff --git a/test/Assembler/alias-addrspace.ll b/test/Assembler/alias-addrspace.ll deleted file mode 100644 index 6d378e45fa2b..000000000000 --- a/test/Assembler/alias-addrspace.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as %s 2>&1 | FileCheck %s - -@foo = global i32 42 -@bar = alias internal addrspace(1) i32* @foo - -CHECK: error: A type is required if addrspace is given diff --git a/test/Assembler/alias-to-alias.ll b/test/Assembler/alias-to-alias.ll deleted file mode 100644 index 1ea99bbb6946..000000000000 --- a/test/Assembler/alias-to-alias.ll +++ /dev/null @@ -1,5 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s -; CHECK: Alias must point to function or variable - -@b1 = alias i32* @c1 -@c1 = alias i32* @b1 diff --git a/test/Assembler/alias-to-alias2.ll b/test/Assembler/alias-to-alias2.ll deleted file mode 100644 index a8a0196f4364..000000000000 --- a/test/Assembler/alias-to-alias2.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s -; CHECK: error: Alias is pointed by alias b1 - -@g = global i32 42 - -@b1 = alias i32* @c1 -@c1 = alias i32* @g diff --git a/test/Assembler/alias-type.ll b/test/Assembler/alias-type.ll deleted file mode 100644 index ead3e95243ff..000000000000 --- 
a/test/Assembler/alias-type.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as %s 2>&1 | FileCheck %s - -@foo = global i32 42 -@bar = alias i32 @foo - -CHECK: error: An alias must have pointer type diff --git a/test/Assembler/atomic.ll b/test/Assembler/atomic.ll index a2ae58e296ef..d7ccd9900bd8 100644 --- a/test/Assembler/atomic.ll +++ b/test/Assembler/atomic.ll @@ -16,6 +16,8 @@ define void @f(i32* %x) { cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire ; CHECK: cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic + ; CHECK: cmpxchg weak i32* %x, i32 13, i32 0 seq_cst monotonic + cmpxchg weak i32* %x, i32 13, i32 0 seq_cst monotonic ; CHECK: atomicrmw add i32* %x, i32 10 seq_cst atomicrmw add i32* %x, i32 10 seq_cst ; CHECK: atomicrmw volatile xchg i32* %x, i32 10 monotonic diff --git a/test/Assembler/half-constprop.ll b/test/Assembler/half-constprop.ll index 03ccdda97e0a..9e24f7242ba9 100644 --- a/test/Assembler/half-constprop.ll +++ b/test/Assembler/half-constprop.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -O3 | llvm-dis | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s ; Testing half constant propagation. define half @abc() nounwind { diff --git a/test/Assembler/half-conv.ll b/test/Assembler/half-conv.ll index bf9ae5713979..70a6b86c393f 100644 --- a/test/Assembler/half-conv.ll +++ b/test/Assembler/half-conv.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -O3 | llvm-dis | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s ; Testing half to float conversion. define float @abc() nounwind { diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml index 26cd12939d1e..0a2283aa3ed5 100644 --- a/test/Bindings/Ocaml/target.ml +++ b/test/Bindings/Ocaml/target.ml @@ -46,7 +46,7 @@ let test_target_data () = let layout = "e-p:32:32-f64:32:64-v64:32:64-v128:32:128-n32-S32" in let dl = DL.of_string layout in let sty = struct_type context [| i32_type; i64_type |] in - + assert_equal (DL.as_string dl) layout; assert_equal (DL.byte_order dl) Endian.Little; assert_equal (DL.pointer_size dl) 4; @@ -86,7 +86,8 @@ let test_target_machine () = assert_equal (TM.triple machine) (Target.default_triple ()); assert_equal (TM.cpu machine) ""; assert_equal (TM.features machine) ""; - ignore (TM.data_layout machine) + ignore (TM.data_layout machine); + TM.set_verbose_asm true machine (*===-- Code Emission -----------------------------------------------------===*) diff --git a/test/Bindings/llvm-c/lit.local.cfg b/test/Bindings/llvm-c/lit.local.cfg index d83ebeed8e1c..75b22c06fb2f 100644 --- a/test/Bindings/llvm-c/lit.local.cfg +++ b/test/Bindings/llvm-c/lit.local.cfg @@ -1,5 +1,4 @@ -targets = set(config.root.targets_to_build.split()) -if not "X86" in targets: +if not "X86" in config.root.targets: config.unsupported = True -if not "ARM" in targets: +if not "ARM" in config.root.targets: config.unsupported = True diff --git a/test/Bitcode/atomic.ll b/test/Bitcode/atomic.ll new file mode 100644 index 000000000000..37815a749b55 --- /dev/null +++ b/test/Bitcode/atomic.ll @@ -0,0 +1,17 @@ +; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s + +define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) { + cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + ; CHECK: cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + + cmpxchg volatile i32* %addr, i32 %desired, i32 %new seq_cst monotonic + ; CHECK: cmpxchg volatile i32* %addr, i32 %desired, i32 %new seq_cst monotonic + + cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel 
acquire + ; CHECK: cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire + + cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic + ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic + + ret void +} \ No newline at end of file diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll index 02e1bb1c4e28..49366de9836d 100644 --- a/test/Bitcode/attributes.ll +++ b/test/Bitcode/attributes.ll @@ -203,7 +203,7 @@ define void @f34() ; CHECK: define void @f34() { call void @nobuiltin() nobuiltin -; CHECK: call void @nobuiltin() #24 +; CHECK: call void @nobuiltin() #25 ret void; } @@ -223,6 +223,12 @@ define nonnull i8* @f37(i8* nonnull %a) { ret i8* %a } +define void @f38() unnamed_addr jumptable { +; CHECK: define void @f38() unnamed_addr #24 + call void bitcast (void (i8*)* @f36 to void ()*)() + unreachable +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { readnone } @@ -247,5 +253,5 @@ define nonnull i8* @f37(i8* nonnull %a) { ; CHECK: attributes #21 = { sspstrong } ; CHECK: attributes #22 = { minsize } ; CHECK: attributes #23 = { noinline optnone } -; CHECK: attributes #24 = { nobuiltin } - +; CHECK: attributes #24 = { jumptable } +; CHECK: attributes #25 = { nobuiltin } diff --git a/test/Bitcode/memInstructions.3.2.ll b/test/Bitcode/memInstructions.3.2.ll index 21c3deb8a5ad..e4cb6bdbe96b 100644 --- a/test/Bitcode/memInstructions.3.2.ll +++ b/test/Bitcode/memInstructions.3.2.ll @@ -223,68 +223,88 @@ define void @cmpxchg(i32* %ptr,i32 %cmp,i32 %new){ entry: ;cmpxchg [volatile] * , , [singlethread] -; CHECK: %res1 = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic +; CHECK: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic +; CHECK-NEXT: %res1 = extractvalue { i32, i1 } [[TMP]], 0 %res1 = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic -; CHECK-NEXT: %res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic +; CHECK-NEXT: %res2 = extractvalue { i32, i1 } [[TMP]], 0 %res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic -; CHECK-NEXT: %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic +; CHECK-NEXT: %res3 = extractvalue { i32, i1 } [[TMP]], 0 %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic -; CHECK-NEXT: %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic +; CHECK-NEXT: %res4 = extractvalue { i32, i1 } [[TMP]], 0 %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic -; CHECK-NEXT: %res5 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire +; CHECK-NEXT: %res5 = extractvalue { i32, i1 } [[TMP]], 0 %res5 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire -; CHECK-NEXT: %res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire +; CHECK-NEXT: %res6 = extractvalue { i32, i1 } [[TMP]], 0 %res6 = cmpxchg 
volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire -; CHECK-NEXT: %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire +; CHECK-NEXT: %res7 = extractvalue { i32, i1 } [[TMP]], 0 %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire -; CHECK-NEXT: %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire +; CHECK-NEXT: %res8 = extractvalue { i32, i1 } [[TMP]], 0 %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire -; CHECK-NEXT: %res9 = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic +; CHECK-NEXT: %res9 = extractvalue { i32, i1 } [[TMP]], 0 %res9 = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic -; CHECK-NEXT: %res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic +; CHECK-NEXT: %res10 = extractvalue { i32, i1 } [[TMP]], 0 %res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic -; CHECK-NEXT: %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic +; CHECK-NEXT: %res11 = extractvalue { i32, i1 } [[TMP]], 0 %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic -; CHECK-NEXT: %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic +; CHECK-NEXT: %res12 = extractvalue { i32, i1 } [[TMP]], 0 %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic -; CHECK-NEXT: %res13 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire +; CHECK-NEXT: %res13 = extractvalue { i32, i1 } [[TMP]], 0 %res13 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire -; CHECK-NEXT: %res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire +; CHECK-NEXT: %res14 = extractvalue { i32, i1 } [[TMP]], 0 %res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire -; CHECK-NEXT: %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire +; CHECK-NEXT: %res15 = extractvalue { i32, i1 } [[TMP]], 0 %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire -; CHECK-NEXT: %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire +; CHECK-NEXT: %res16 = extractvalue { i32, i1 } [[TMP]], 0 %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire -; CHECK-NEXT: %res17 = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +; CHECK-NEXT: %res17 = extractvalue 
{ i32, i1 } [[TMP]], 0 %res17 = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst -; CHECK-NEXT: %res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst +; CHECK-NEXT: %res18 = extractvalue { i32, i1 } [[TMP]], 0 %res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst -; CHECK-NEXT: %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst +; CHECK-NEXT: %res19 = extractvalue { i32, i1 } [[TMP]], 0 %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst -; CHECK-NEXT: %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst +; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst +; CHECK-NEXT: %res20 = extractvalue { i32, i1 } [[TMP]], 0 %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst ret void diff --git a/test/Bitcode/old-aliases.ll b/test/Bitcode/old-aliases.ll index 4ef47c03dd72..7a0eea2f3f24 100644 --- a/test/Bitcode/old-aliases.ll +++ b/test/Bitcode/old-aliases.ll @@ -10,13 +10,13 @@ ; CHECK: @v2 = global [1 x i32] zeroinitializer @v3 = alias bitcast (i32* @v1 to i16*) -; CHECK: @v3 = alias i16, i32* @v1 +; CHECK: @v3 = alias bitcast (i32* @v1 to i16*) @v4 = alias getelementptr ([1 x i32]* @v2, i32 0, i32 0) -; CHECK: @v4 = alias i32, [1 x i32]* @v2 +; CHECK: @v4 = alias getelementptr inbounds ([1 x i32]* @v2, i32 0, i32 0) @v5 = alias i32 addrspace(2)* addrspacecast (i32 addrspace(0)* @v1 to i32 addrspace(2)*) -; CHECK: @v5 = alias addrspace(2) i32, i32* @v1 +; CHECK: @v5 = alias addrspacecast (i32* @v1 to i32 addrspace(2)*) @v6 = alias i16* @v3 -; CHECK: @v6 = alias i16, i32* @v1 +; CHECK: @v6 = alias i16* @v3 diff --git a/test/Bitcode/weak-cmpxchg-upgrade.ll b/test/Bitcode/weak-cmpxchg-upgrade.ll new file mode 100644 index 000000000000..dbcd150633ed --- /dev/null +++ b/test/Bitcode/weak-cmpxchg-upgrade.ll @@ -0,0 +1,15 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; cmpxchg-upgrade.ll.bc was produced by running a version of llvm-as from just +; before the IR change on this file. 
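As the updated assembler and bitcode tests above show, cmpxchg now produces a value/success pair (here { i32, i1 }) and gains a weak form, so existing users are rewritten to take element 0 of the pair via extractvalue. The following is an editorial sketch, not part of the patch, of a compare-and-swap retry loop written against the new form; the function and value names are illustrative, and the IR deliberately uses the same pre-3.7 pointer and load syntax as the surrounding tests.

define i32 @cas_add(i32* %ptr, i32 %delta) {
entry:
  %init = load i32* %ptr
  br label %retry

retry:
  %old = phi i32 [ %init, %entry ], [ %seen, %retry ]
  %new = add i32 %old, %delta
  ; cmpxchg now returns both the value that was in memory and an i1 success
  ; flag; a 'cmpxchg weak' would also be legal here, since the loop already
  ; tolerates spurious failure.
  %pair = cmpxchg i32* %ptr, i32 %old, i32 %new seq_cst seq_cst
  %seen = extractvalue { i32, i1 } %pair, 0
  %ok   = extractvalue { i32, i1 } %pair, 1
  br i1 %ok, label %done, label %retry

done:
  ret i32 %new
}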
+ +define i32 @test(i32* %addr, i32 %old, i32 %new) { +; CHECK: [[TMP:%.*]] = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst monotonic +; CHECK: %val = extractvalue { i32, i1 } [[TMP]], 0 + %val = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst monotonic + ret i32 %val +} + +define i32 @test(i32* %addr, i32 %old, i32 %new) { + ret i1 %val +} diff --git a/test/Bitcode/weak-cmpxchg-upgrade.ll.bc b/test/Bitcode/weak-cmpxchg-upgrade.ll.bc new file mode 100644 index 000000000000..f713c317d46f Binary files /dev/null and b/test/Bitcode/weak-cmpxchg-upgrade.ll.bc differ diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll index 2360e858b574..a6f077698e40 100644 --- a/test/CodeGen/AArch64/128bit_load_store.ll +++ b/test/CodeGen/AArch64/128bit_load_store.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon| FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { ; CHECK-LABEL: test_store_f128 @@ -18,12 +17,9 @@ entry: } define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { -; CHECK-ARM64-LABEL: test_vstrq_p128 -; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vstrq_p128 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] -; CHECK-AARCH64-LABEL: test_vstrq_p128 -; CHECK-AARCH64: str {{x[0-9]+}}, [{{x[0-9]+}}, #8] -; CHECK-AARCH64: str {{x[0-9]+}}, [{{x[0-9]+}}] entry: %0 = bitcast i128* %ptr to fp128* %1 = bitcast i128 %val to fp128 @@ -32,12 +28,9 @@ entry: } define i128 @test_vldrq_p128(i128* readonly %ptr) #2 { -; CHECK-ARM64-LABEL: test_vldrq_p128 -; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vldrq_p128 +; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] -; CHECK-AARCH64-LABEL: test_vldrq_p128 -; CHECK-AARCH64: ldr {{x[0-9]+}}, [{{x[0-9]+}}] -; CHECK-AARCH64: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8] entry: %0 = bitcast i128* %ptr to fp128* %1 = load fp128* %0, align 16 diff --git a/test/CodeGen/ARM64/aarch64-neon-v1i1-setcc.ll b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll similarity index 100% rename from test/CodeGen/ARM64/aarch64-neon-v1i1-setcc.ll rename to test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll index 8742e450897c..892573ba06b1 100644 --- a/test/CodeGen/AArch64/adc.ll +++ b/test/CodeGen/AArch64/adc.ll @@ -1,5 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll index f3fdbefb47ae..0a93edd8290a 100644 --- a/test/CodeGen/AArch64/addsub-shifted.ll +++ b/test/CodeGen/AArch64/addsub-shifted.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s 
-mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-apple-ios7.0 | FileCheck %s @var32 = global i32 0 diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll index b64ad2a83d87..b85fdbb14ce2 100644 --- a/test/CodeGen/AArch64/addsub.ll +++ b/test/CodeGen/AArch64/addsub.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-linux-gnu | FileCheck %s ; Note that this should be refactored (for efficiency if nothing else) ; when the PCS is implemented so we don't have to worry about the diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll index d33933e92232..ceea8a08ecee 100644 --- a/test/CodeGen/AArch64/addsub_ext.ll +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s @var8 = global i8 0 @var16 = global i16 0 diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index f73365b20c24..f93efbc42e65 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-AARCH64 %s -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s declare void @use_addr(i8*) @@ -54,10 +52,8 @@ define i64 @test_alloca_with_local(i64 %n) { ; CHECK: bl use_addr %val = load i64* %loc -; CHECK-AARCH64: sub x[[TMP:[0-9]+]], x29, #[[LOC_FROM_FP]] -; CHECK-AARCH64: ldr x0, [x[[TMP]]] -; CHECK-ARM64: ldur x0, [x29, #-[[LOC_FROM_FP]]] +; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]] ret i64 %val ; Make sure epilogue restores sp from fp @@ -68,13 +64,7 @@ define i64 @test_alloca_with_local(i64 %n) { define void @test_variadic_alloca(i64 %n, ...) { ; CHECK-LABEL: test_variadic_alloca: -; CHECK-AARCH64: sub sp, sp, #{{[0-9]+}} -; CHECK-AARCH64: add x29, sp, #192 -; CHECK-AARCH64: sub [[TMP:x[0-9]+]], x29, #192 -; CHECK-AARCH64: add x8, [[TMP]], #0 -; CHECK-AARCH64-FP: str q7, [x8, #112] ; [...] -; CHECK-AARCH64-FP: str q1, [x8, #16] ; CHECK-NOFP-AARCH64: sub sp, sp, #80 @@ -84,16 +74,16 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK-NOFP-AARCH64: add x8, [[TMP]], #0 -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64: mov x29, sp -; CHECK-ARM64: sub sp, sp, #192 -; CHECK-ARM64: stp q6, q7, [x29, #-96] +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #192 +; CHECK: stp q6, q7, [x29, #-96] ; [...] -; CHECK-ARM64: stp q0, q1, [x29, #-192] +; CHECK: stp q0, q1, [x29, #-192] -; CHECK-ARM64: stp x6, x7, [x29, #-16] +; CHECK: stp x6, x7, [x29, #-16] ; [...] 
-; CHECK-ARM64: stp x2, x3, [x29, #-48] +; CHECK: stp x2, x3, [x29, #-48] ; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]! ; CHECK-NOFP-ARM64: mov x29, sp @@ -112,9 +102,6 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK: bl use_addr ret void -; CHECK-AARCH64: sub sp, x29, #192 -; CHECK-AARCH64: ldp x29, x30, [sp, #192] -; CHECK-AARCH64: add sp, sp, #208 ; CHECK-NOFP-AARCH64: sub sp, x29, #64 ; CHECK-NOFP-AARCH64: ldp x29, x30, [sp, #64] @@ -127,17 +114,12 @@ define void @test_variadic_alloca(i64 %n, ...) { define void @test_alloca_large_frame(i64 %n) { ; CHECK-LABEL: test_alloca_large_frame: -; CHECK-AARCH64: sub sp, sp, #496 -; CHECK-AARCH64: stp x29, x30, [sp, #480] -; CHECK-AARCH64: add x29, sp, #480 -; CHECK-AARCH64: sub sp, sp, #48 -; CHECK-AARCH64: sub sp, sp, #1953, lsl #12 -; CHECK-ARM64: stp x20, x19, [sp, #-32]! -; CHECK-ARM64: stp x29, x30, [sp, #16] -; CHECK-ARM64: add x29, sp, #16 -; CHECK-ARM64: sub sp, sp, #1953, lsl #12 -; CHECK-ARM64: sub sp, sp, #512 +; CHECK: stp x20, x19, [sp, #-32]! +; CHECK: stp x29, x30, [sp, #16] +; CHECK: add x29, sp, #16 +; CHECK: sub sp, sp, #1953, lsl #12 +; CHECK: sub sp, sp, #512 %addr1 = alloca i8, i64 %n %addr2 = alloca i64, i64 1000000 @@ -145,13 +127,10 @@ define void @test_alloca_large_frame(i64 %n) { call void @use_addr_loc(i8* %addr1, i64* %addr2) ret void -; CHECK-AARCH64: sub sp, x29, #480 -; CHECK-AARCH64: ldp x29, x30, [sp, #480] -; CHECK-AARCH64: add sp, sp, #496 -; CHECK-ARM64: sub sp, x29, #16 -; CHECK-ARM64: ldp x29, x30, [sp, #16] -; CHECK-ARM64: ldp x20, x19, [sp], #32 +; CHECK: sub sp, x29, #16 +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldp x20, x19, [sp], #32 } declare i8* @llvm.stacksave() diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll index b4fbf2edc483..6616b27c45b7 100644 --- a/test/CodeGen/AArch64/analyze-branch.ll +++ b/test/CodeGen/AArch64/analyze-branch.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s ; This test checks that LLVM can do basic stripping and reapplying of branches ; to basic blocks. 
diff --git a/test/CodeGen/AArch64/andCmpBrToTBZ.ll b/test/CodeGen/AArch64/andCmpBrToTBZ.ll deleted file mode 100644 index f564a5587f61..000000000000 --- a/test/CodeGen/AArch64/andCmpBrToTBZ.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -O1 -march=aarch64 -enable-andcmp-sinking=true < %s | FileCheck %s -; arm64 has separate copy of this test - -; ModuleID = 'and-cbz-extr-mr.bc' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" -target triple = "aarch64-none-linux-gnu" - -define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 { -; CHECK: foo: -entry: - %tobool = icmp eq i8* %str14, null - br i1 %tobool, label %return, label %if.end - -; CHECK: %if.end -; CHECK: tbz -if.end: ; preds = %entry - %and.i.i.i = and i32 %int1, 4 - %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0 - br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i - -land.rhs.i: ; preds = %if.end - %cmp.i.i.i = icmp eq i8* %str12, %str13 - br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i - -lor.rhs.i.i.i: ; preds = %land.rhs.i - %cmp.i13.i.i.i = icmp eq i8* %str10, %str11 - br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5 - -_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i - %cmp.i.i.i.i = icmp eq i8* %str8, %str9 - br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5 - -if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i - %tmp11 = load i8* %str14, align 8 - %tmp12 = and i8 %tmp11, 2 - %tmp13 = icmp ne i8 %tmp12, 0 - br label %return - -if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i -; CHECK: %if.end5 -; CHECK: tbz - br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19 - -land.rhs.i19: ; preds = %if.end5 - %cmp.i.i.i18 = icmp eq i8* %str6, %str7 - br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23 - -lor.rhs.i.i.i23: ; preds = %land.rhs.i19 - %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4 - br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12 - -_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23 - %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2 - br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12 - -if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19 - br i1 %isTextField, label %if.then9, label %if.end12 - -if.then9: ; preds = %if.then7 - %tmp23 = load i8* %str5, align 8 - %tmp24 = and i8 %tmp23, 2 - %tmp25 = icmp ne i8 %tmp24, 0 - br label %return - -if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end - %lnot = xor i1 %IsEditable, true - br label %return - -return: ; preds = %if.end12, %if.then9, %if.then3, %entry - %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ] - ret i1 %retval.0 -} - -attributes #0 = { nounwind ssp } diff --git a/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll similarity index 100% rename from test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll rename to test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll diff --git 
a/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll similarity index 100% rename from test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll rename to test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll diff --git a/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll similarity index 100% rename from test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll rename to test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll diff --git a/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll similarity index 100% rename from test/CodeGen/ARM64/2011-04-21-CPSRBug.ll rename to test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll diff --git a/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll similarity index 100% rename from test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll rename to test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll diff --git a/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll similarity index 100% rename from test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll rename to test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll diff --git a/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll similarity index 100% rename from test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll rename to test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll diff --git a/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll similarity index 100% rename from test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll rename to test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll diff --git a/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll similarity index 91% rename from test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll rename to test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll index d1840d359420..7da2d2ca513e 100644 --- a/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll +++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll @@ -2,14 +2,14 @@ ; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX ; -define hidden void @t() optsize ssp { +define hidden void @t(i64* %addr) optsize ssp { entry: - store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8 + store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* %addr, align 8 ; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE ; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF] ; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff ; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}] - unreachable + ret void } declare i64 @x(i32) optsize diff --git a/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll similarity index 100% rename from test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll rename to test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll diff --git a/test/CodeGen/ARM64/2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll similarity index 100% rename from test/CodeGen/ARM64/2012-06-06-FPToUI.ll rename to test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll diff --git a/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll similarity 
index 100% rename from test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll rename to test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll diff --git a/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll similarity index 72% rename from test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll rename to test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll index b40a581d6118..e2c43d953bb9 100644 --- a/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll +++ b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" target triple = "arm64-apple-ios7.0.0" diff --git a/test/CodeGen/ARM64/2013-01-23-frem-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll similarity index 100% rename from test/CodeGen/ARM64/2013-01-23-frem-crash.ll rename to test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll diff --git a/test/CodeGen/ARM64/2013-01-23-sext-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll similarity index 100% rename from test/CodeGen/ARM64/2013-01-23-sext-crash.ll rename to test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll diff --git a/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll similarity index 83% rename from test/CodeGen/ARM64/2013-02-12-shufv8i8.ll rename to test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll index 70e745fc5775..a350ba1472c9 100644 --- a/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll +++ b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple ;CHECK-LABEL: Shuff: ;CHECK: tbl.8b diff --git a/test/CodeGen/ARM64/AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll similarity index 89% rename from test/CodeGen/ARM64/AdvSIMD-Scalar.ll rename to test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll index 3e75eed4cd5d..c4597d5a4815 100644 --- a/test/CodeGen/ARM64/AdvSIMD-Scalar.ll +++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=generic -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone { ; CHECK-LABEL: bar: diff --git a/test/CodeGen/ARM64/2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll similarity index 100% rename from test/CodeGen/ARM64/2014-04-16-AnInfiniteLoopInDAGCombine.ll rename to test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll diff --git a/test/CodeGen/ARM64/2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll similarity index 94% rename from 
test/CodeGen/ARM64/2014-04-29-EXT-undef-mask.ll rename to test/CodeGen/AArch64/arm64-EXT-undef-mask.ll index b0ab9fda22d5..1b2d54317c23 100644 --- a/test/CodeGen/ARM64/2014-04-29-EXT-undef-mask.ll +++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -O0 -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s ; The following 2 test cases test shufflevector with beginning UNDEF mask. define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) { diff --git a/test/CodeGen/ARM64/aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll similarity index 93% rename from test/CodeGen/ARM64/aapcs.ll rename to test/CodeGen/AArch64/arm64-aapcs.ll index b713f0d5a531..ccf1371bb5ff 100644 --- a/test/CodeGen/ARM64/aapcs.ll +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -101,3 +101,11 @@ define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) { ; CHECK: ldr {{q[0-9]+}}, [sp] ret fp128 %arg1 } + +; Check if VPR can be correctly pass by stack. +define <2 x double> @test_vreg_stack([8 x <2 x double>], <2 x double> %varg_stack) { +entry: +; CHECK-LABEL: test_vreg_stack: +; CHECK: ldr {{q[0-9]+}}, [sp] + ret <2 x double> %varg_stack; +} diff --git a/test/CodeGen/ARM64/abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll similarity index 100% rename from test/CodeGen/ARM64/abi-varargs.ll rename to test/CodeGen/AArch64/arm64-abi-varargs.ll diff --git a/test/CodeGen/ARM64/abi.ll b/test/CodeGen/AArch64/arm64-abi.ll similarity index 93% rename from test/CodeGen/ARM64/abi.ll rename to test/CodeGen/AArch64/arm64-abi.ll index e2de434c7b07..a955029b3725 100644 --- a/test/CodeGen/ARM64/abi.ll +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -debug -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s ; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +; REQUIRES: asserts target triple = "arm64-apple-darwin" ; rdar://9932559 @@ -8,15 +9,15 @@ entry: ; CHECK-LABEL: i8i16callee: ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. ; They are i8, i16, i8 and i8. -; CHECK: ldrsb {{w[0-9]+}}, [sp, #5] -; CHECK: ldrsh {{w[0-9]+}}, [sp, #2] -; CHECK: ldrsb {{w[0-9]+}}, [sp] -; CHECK: ldrsb {{w[0-9]+}}, [sp, #4] +; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #5] +; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #4] +; CHECK-DAG: ldrsh {{w[0-9]+}}, [sp, #2] +; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp] ; FAST-LABEL: i8i16callee: -; FAST: ldrb {{w[0-9]+}}, [sp, #5] -; FAST: ldrb {{w[0-9]+}}, [sp, #4] -; FAST: ldrh {{w[0-9]+}}, [sp, #2] -; FAST: ldrb {{w[0-9]+}}, [sp] +; FAST-DAG: ldrsb {{w[0-9]+}}, [sp, #5] +; FAST-DAG: ldrsb {{w[0-9]+}}, [sp, #4] +; FAST-DAG: ldrsh {{w[0-9]+}}, [sp, #2] +; FAST-DAG: ldrsb {{w[0-9]+}}, [sp] %conv = sext i8 %a4 to i64 %conv3 = sext i16 %a5 to i64 %conv8 = sext i8 %b1 to i64 @@ -44,10 +45,10 @@ entry: ; CHECK: i8i16caller ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. ; They are i8, i16, i8 and i8. 
-; CHECK: strb {{w[0-9]+}}, [sp, #5] -; CHECK: strb {{w[0-9]+}}, [sp, #4] -; CHECK: strh {{w[0-9]+}}, [sp, #2] -; CHECK: strb {{w[0-9]+}}, [sp] +; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5] +; CHECK-DAG: strb {{w[0-9]+}}, [sp, #4] +; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2] +; CHECK-DAG: strb {{w[0-9]+}}, [sp] ; CHECK: bl ; FAST: i8i16caller ; FAST: strb {{w[0-9]+}}, [sp] diff --git a/test/CodeGen/ARM64/abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll similarity index 100% rename from test/CodeGen/ARM64/abi_align.ll rename to test/CodeGen/AArch64/arm64-abi_align.ll diff --git a/test/CodeGen/ARM64/addp.ll b/test/CodeGen/AArch64/arm64-addp.ll similarity index 90% rename from test/CodeGen/ARM64/addp.ll rename to test/CodeGen/AArch64/arm64-addp.ll index 428f6d4f28a5..3f1e5c5d44e3 100644 --- a/test/CodeGen/ARM64/addp.ll +++ b/test/CodeGen/AArch64/arm64-addp.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -mcpu=cyclone | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s define double @foo(<2 x double> %a) nounwind { ; CHECK-LABEL: foo: diff --git a/test/CodeGen/ARM64/addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll similarity index 100% rename from test/CodeGen/ARM64/addr-mode-folding.ll rename to test/CodeGen/AArch64/arm64-addr-mode-folding.ll diff --git a/test/CodeGen/ARM64/addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll similarity index 100% rename from test/CodeGen/ARM64/addr-type-promotion.ll rename to test/CodeGen/AArch64/arm64-addr-type-promotion.ll diff --git a/test/CodeGen/ARM64/addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll similarity index 100% rename from test/CodeGen/ARM64/addrmode.ll rename to test/CodeGen/AArch64/arm64-addrmode.ll diff --git a/test/CodeGen/ARM64/alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll similarity index 100% rename from test/CodeGen/ARM64/alloc-no-stack-realign.ll rename to test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll diff --git a/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll similarity index 100% rename from test/CodeGen/ARM64/alloca-frame-pointer-offset.ll rename to test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll diff --git a/test/CodeGen/ARM64/andCmpBrToTBZ.ll b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll similarity index 100% rename from test/CodeGen/ARM64/andCmpBrToTBZ.ll rename to test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll diff --git a/test/CodeGen/ARM64/ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll similarity index 78% rename from test/CodeGen/ARM64/ands-bad-peephole.ll rename to test/CodeGen/AArch64/arm64-ands-bad-peephole.ll index 34d6287b8b43..38661a5f38f3 100644 --- a/test/CodeGen/ARM64/ands-bad-peephole.ll +++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -o - | FileCheck %s +; RUN: llc %s -o - -aarch64-atomic-cfg-tidy=0 | FileCheck %s ; Check that ANDS (tst) is not merged with ADD when the immediate ; is not 0. 
; @@ -8,18 +8,18 @@ target triple = "arm64-apple-ios" ; CHECK-LABEL: tst1: ; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1 ; CHECK: tst [[REG]], #0x1 -define void @tst1() { +define void @tst1(i1 %tst, i32 %true) { entry: - br i1 undef, label %for.end, label %for.body + br i1 %tst, label %for.end, label %for.body for.body: ; preds = %for.body, %entry %result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ] %i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ] %and = and i32 %i.08, 1 %cmp1 = icmp eq i32 %and, 0 - %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09 + %add2.result.0 = select i1 %cmp1, i32 %true, i32 %result.09 %inc = add nsw i32 %i.08, 1 - %cmp = icmp slt i32 %i.08, undef + %cmp = icmp slt i32 %i.08, %true br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge for.cond.for.end_crit_edge: ; preds = %for.body diff --git a/test/CodeGen/ARM64/anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll similarity index 100% rename from test/CodeGen/ARM64/anyregcc-crash.ll rename to test/CodeGen/AArch64/arm64-anyregcc-crash.ll diff --git a/test/CodeGen/ARM64/anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll similarity index 100% rename from test/CodeGen/ARM64/anyregcc.ll rename to test/CodeGen/AArch64/arm64-anyregcc.ll diff --git a/test/CodeGen/ARM64/arith-saturating.ll b/test/CodeGen/AArch64/arm64-arith-saturating.ll similarity index 58% rename from test/CodeGen/ARM64/arith-saturating.ll rename to test/CodeGen/AArch64/arm64-arith-saturating.ll index 2d188ff9cc7e..78cd1fcb1a21 100644 --- a/test/CodeGen/ARM64/arith-saturating.ll +++ b/test/CodeGen/AArch64/arm64-arith-saturating.ll @@ -5,7 +5,7 @@ define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { ; CHECK: sqadd s0, s0, s1 %vecext = extractelement <4 x i32> %b, i32 0 %vecext1 = extractelement <4 x i32> %c, i32 0 - %vqadd.i = tail call i32 @llvm.arm64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind + %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind ret i32 %vqadd.i } @@ -14,7 +14,7 @@ define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK: sqadd d0, d0, d1 %vecext = extractelement <2 x i64> %b, i32 0 %vecext1 = extractelement <2 x i64> %c, i32 0 - %vqadd.i = tail call i64 @llvm.arm64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind + %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind ret i64 %vqadd.i } @@ -23,7 +23,7 @@ define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { ; CHECK: uqadd s0, s0, s1 %vecext = extractelement <4 x i32> %b, i32 0 %vecext1 = extractelement <4 x i32> %c, i32 0 - %vqadd.i = tail call i32 @llvm.arm64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind + %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind ret i32 %vqadd.i } @@ -32,21 +32,21 @@ define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK: uqadd d0, d0, d1 %vecext = extractelement <2 x i64> %b, i32 0 %vecext1 = extractelement <2 x i64> %c, i32 0 - %vqadd.i = tail call i64 @llvm.arm64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind + %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind ret i64 %vqadd.i } -declare i64 @llvm.arm64.neon.uqadd.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.neon.uqadd.i32(i32, i32) nounwind readnone -declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) 
nounwind readnone +declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: qsubs: ; CHECK: sqsub s0, s0, s1 %vecext = extractelement <4 x i32> %b, i32 0 %vecext1 = extractelement <4 x i32> %c, i32 0 - %vqsub.i = tail call i32 @llvm.arm64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind + %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind ret i32 %vqsub.i } @@ -55,7 +55,7 @@ define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK: sqsub d0, d0, d1 %vecext = extractelement <2 x i64> %b, i32 0 %vecext1 = extractelement <2 x i64> %c, i32 0 - %vqsub.i = tail call i64 @llvm.arm64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind + %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind ret i64 %vqsub.i } @@ -64,7 +64,7 @@ define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { ; CHECK: uqsub s0, s0, s1 %vecext = extractelement <4 x i32> %b, i32 0 %vecext1 = extractelement <4 x i32> %c, i32 0 - %vqsub.i = tail call i32 @llvm.arm64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind + %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind ret i32 %vqsub.i } @@ -73,21 +73,21 @@ define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK: uqsub d0, d0, d1 %vecext = extractelement <2 x i64> %b, i32 0 %vecext1 = extractelement <2 x i64> %c, i32 0 - %vqsub.i = tail call i64 @llvm.arm64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind + %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind ret i64 %vqsub.i } -declare i64 @llvm.arm64.neon.uqsub.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.neon.uqsub.i32(i32, i32) nounwind readnone -declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone { ; CHECK-LABEL: qabss: ; CHECK: sqabs s0, s0 ; CHECK: ret %vecext = extractelement <4 x i32> %b, i32 0 - %vqabs.i = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %vecext) nounwind + %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind ret i32 %vqabs.i } @@ -96,7 +96,7 @@ define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { ; CHECK: sqabs d0, d0 ; CHECK: ret %vecext = extractelement <2 x i64> %b, i32 0 - %vqabs.i = tail call i64 @llvm.arm64.neon.sqabs.i64(i64 %vecext) nounwind + %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind ret i64 %vqabs.i } @@ -105,7 +105,7 @@ define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone { ; CHECK: sqneg s0, s0 ; CHECK: ret %vecext = extractelement <4 x i32> %b, i32 0 - %vqneg.i = tail call i32 @llvm.arm64.neon.sqneg.i32(i32 %vecext) nounwind + %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind ret i32 %vqneg.i } @@ -114,21 +114,21 @@ 
define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { ; CHECK: sqneg d0, d0 ; CHECK: ret %vecext = extractelement <2 x i64> %b, i32 0 - %vqneg.i = tail call i64 @llvm.arm64.neon.sqneg.i64(i64 %vecext) nounwind + %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind ret i64 %vqneg.i } -declare i64 @llvm.arm64.neon.sqneg.i64(i64) nounwind readnone -declare i32 @llvm.arm64.neon.sqneg.i32(i32) nounwind readnone -declare i64 @llvm.arm64.neon.sqabs.i64(i64) nounwind readnone -declare i32 @llvm.arm64.neon.sqabs.i32(i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone define i32 @vqmovund(<2 x i64> %b) nounwind readnone { ; CHECK-LABEL: vqmovund: ; CHECK: sqxtun s0, d0 %vecext = extractelement <2 x i64> %b, i32 0 - %vqmovun.i = tail call i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind + %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind ret i32 %vqmovun.i } @@ -136,7 +136,7 @@ define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone { ; CHECK-LABEL: vqmovnd_s: ; CHECK: sqxtn s0, d0 %vecext = extractelement <2 x i64> %b, i32 0 - %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind ret i32 %vqmovn.i } @@ -144,10 +144,10 @@ define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone { ; CHECK-LABEL: vqmovnd_u: ; CHECK: uqxtn s0, d0 %vecext = extractelement <2 x i64> %b, i32 0 - %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind ret i32 %vqmovn.i } -declare i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone -declare i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone -declare i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone diff --git a/test/CodeGen/ARM64/arith.ll b/test/CodeGen/AArch64/arm64-arith.ll similarity index 88% rename from test/CodeGen/ARM64/arith.ll rename to test/CodeGen/AArch64/arm64-arith.ll index e4be8e98a780..f36e706b15dd 100644 --- a/test/CodeGen/ARM64/arith.ll +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -168,7 +168,7 @@ entry: ; CHECK-LABEL: t18: ; CHECK: sdiv w0, w0, w1 ; CHECK: ret - %sdiv = call i32 @llvm.arm64.sdiv.i32(i32 %a, i32 %b) + %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b) ret i32 %sdiv } @@ -177,7 +177,7 @@ entry: ; CHECK-LABEL: t19: ; CHECK: sdiv x0, x0, x1 ; CHECK: ret - %sdiv = call i64 @llvm.arm64.sdiv.i64(i64 %a, i64 %b) + %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b) ret i64 %sdiv } @@ -186,7 +186,7 @@ entry: ; CHECK-LABEL: t20: ; CHECK: udiv w0, w0, w1 ; CHECK: ret - %udiv = call i32 @llvm.arm64.udiv.i32(i32 %a, i32 %b) + %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b) ret i32 %udiv } @@ -195,14 +195,14 @@ entry: ; CHECK-LABEL: t21: ; CHECK: udiv x0, x0, x1 ; CHECK: ret - %udiv = call i64 @llvm.arm64.udiv.i64(i64 %a, i64 %b) + %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b) ret i64 %udiv } -declare i32 
@llvm.arm64.sdiv.i32(i32, i32) nounwind readnone -declare i64 @llvm.arm64.sdiv.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.udiv.i32(i32, i32) nounwind readnone -declare i64 @llvm.arm64.udiv.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone ; 32-bit not. define i32 @inv_32(i32 %x) nounwind ssp { @@ -260,3 +260,11 @@ define i64 @f3(i64 %a) nounwind readnone ssp { %res = mul nsw i64 %a, 17 ret i64 %res } + +define i32 @f4(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f4: +; CHECK-NEXT: add w0, w0, w0, lsl #1 +; CHECK-NEXT: ret + %res = mul i32 %a, 3 + ret i32 %res +} diff --git a/test/CodeGen/ARM64/arm64-dead-def-elimination-flag.ll b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll similarity index 80% rename from test/CodeGen/ARM64/arm64-dead-def-elimination-flag.ll rename to test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll index babf4827693b..0904b62c4032 100644 --- a/test/CodeGen/ARM64/arm64-dead-def-elimination-flag.ll +++ b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-dead-def-elimination=false < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios7.0.0" diff --git a/test/CodeGen/ARM64/atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll similarity index 98% rename from test/CodeGen/ARM64/atomic-128.ll rename to test/CodeGen/AArch64/arm64-atomic-128.ll index 3b43aa16d2bb..0f5b23998ee8 100644 --- a/test/CodeGen/ARM64/atomic-128.ll +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -13,7 +13,8 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { ; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]] ; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] ; CHECK: [[DONE]]: - %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + %val = extractvalue { i128, i1 } %pair, 0 ret i128 %val } diff --git a/test/CodeGen/ARM64/atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll similarity index 98% rename from test/CodeGen/ARM64/atomic.ll rename to test/CodeGen/AArch64/arm64-atomic.ll index aa9b284410b1..aef79cb386b3 100644 --- a/test/CodeGen/ARM64/atomic.ll +++ b/test/CodeGen/AArch64/arm64-atomic.ll @@ -10,7 +10,8 @@ define i32 @val_compare_and_swap(i32* %p) { ; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0] ; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] ; CHECK: [[LABEL2]]: - %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire + %pair = cmpxchg i32* %p, i32 7, i32 4 acquire acquire + %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val } @@ -25,7 +26,8 @@ define i64 @val_compare_and_swap_64(i64* %p) { ; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0] ; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] ; CHECK: [[LABEL2]]: - %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic + %pair = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic + %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val } diff --git a/test/CodeGen/ARM64/basic-pic.ll b/test/CodeGen/AArch64/arm64-basic-pic.ll similarity index 100% rename from test/CodeGen/ARM64/basic-pic.ll rename to test/CodeGen/AArch64/arm64-basic-pic.ll diff --git 
a/test/CodeGen/ARM64/big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll similarity index 99% rename from test/CodeGen/ARM64/big-endian-bitconverts.ll rename to test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll index cb8708b9267c..f0e968b2c177 100644 --- a/test/CodeGen/ARM64/big-endian-bitconverts.ll +++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -O1 -o - | FileCheck %s -; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s ; CHECK-LABEL: test_i64_f64: define void @test_i64_f64(double* %p, i64* %q) { diff --git a/test/CodeGen/ARM64/big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll similarity index 100% rename from test/CodeGen/ARM64/big-endian-eh.ll rename to test/CodeGen/AArch64/arm64-big-endian-eh.ll diff --git a/test/CodeGen/ARM64/big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll similarity index 100% rename from test/CodeGen/ARM64/big-endian-varargs.ll rename to test/CodeGen/AArch64/arm64-big-endian-varargs.ll diff --git a/test/CodeGen/ARM64/big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll similarity index 99% rename from test/CodeGen/ARM64/big-endian-vector-callee.ll rename to test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll index 5b9ccace8821..1dcccf106a29 100644 --- a/test/CodeGen/ARM64/big-endian-vector-callee.ll +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -o - | FileCheck %s -; RUN: llc -mtriple arm64_be < %s -fast-isel=true -arm64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s ; CHECK-LABEL: test_i64_f64: define i64 @test_i64_f64(double %p) { diff --git a/test/CodeGen/ARM64/big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll similarity index 99% rename from test/CodeGen/ARM64/big-endian-vector-caller.ll rename to test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll index 194a32139253..9a12b7a01153 100644 --- a/test/CodeGen/ARM64/big-endian-vector-caller.ll +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -o - | FileCheck %s -; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s ; CHECK-LABEL: test_i64_f64: declare i64 @test_i64_f64_helper(double %p) diff --git a/test/CodeGen/ARM64/big-imm-offsets.ll b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll similarity index 100% rename from test/CodeGen/ARM64/big-imm-offsets.ll rename to test/CodeGen/AArch64/arm64-big-imm-offsets.ll diff --git a/test/CodeGen/ARM64/big-stack.ll b/test/CodeGen/AArch64/arm64-big-stack.ll similarity index 100% rename from test/CodeGen/ARM64/big-stack.ll rename to test/CodeGen/AArch64/arm64-big-stack.ll diff --git 
a/test/CodeGen/ARM64/bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll similarity index 100% rename from test/CodeGen/ARM64/bitfield-extract.ll rename to test/CodeGen/AArch64/arm64-bitfield-extract.ll diff --git a/test/CodeGen/ARM64/blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll similarity index 100% rename from test/CodeGen/ARM64/blockaddress.ll rename to test/CodeGen/AArch64/arm64-blockaddress.ll diff --git a/test/CodeGen/ARM64/build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll similarity index 94% rename from test/CodeGen/ARM64/build-vector.ll rename to test/CodeGen/AArch64/arm64-build-vector.ll index 143d6894383f..c109263cedb4 100644 --- a/test/CodeGen/ARM64/build-vector.ll +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s ; Check that building up a vector w/ only one non-zero lane initializes ; intelligently. diff --git a/test/CodeGen/ARM64/call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll similarity index 100% rename from test/CodeGen/ARM64/call-tailcalls.ll rename to test/CodeGen/AArch64/arm64-call-tailcalls.ll diff --git a/test/CodeGen/ARM64/cast-opt.ll b/test/CodeGen/AArch64/arm64-cast-opt.ll similarity index 100% rename from test/CodeGen/ARM64/cast-opt.ll rename to test/CodeGen/AArch64/arm64-cast-opt.ll diff --git a/test/CodeGen/ARM64/ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll similarity index 99% rename from test/CodeGen/ARM64/ccmp-heuristics.ll rename to test/CodeGen/AArch64/arm64-ccmp-heuristics.ll index 5575997e53ed..664a26cafe4d 100644 --- a/test/CodeGen/ARM64/ccmp-heuristics.ll +++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp | FileCheck %s +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s target triple = "arm64-apple-ios7.0.0" @channelColumns = external global i64 diff --git a/test/CodeGen/ARM64/ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll similarity index 98% rename from test/CodeGen/ARM64/ccmp.ll rename to test/CodeGen/AArch64/arm64-ccmp.ll index 79e6f94e3f6d..63965f9538b5 100644 --- a/test/CodeGen/ARM64/ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp -arm64-stress-ccmp | FileCheck %s +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s target triple = "arm64-apple-ios" ; CHECK: single_same diff --git a/test/CodeGen/ARM64/clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll similarity index 100% rename from test/CodeGen/ARM64/clrsb.ll rename to test/CodeGen/AArch64/arm64-clrsb.ll diff --git a/test/CodeGen/ARM64/coalesce-ext.ll b/test/CodeGen/AArch64/arm64-coalesce-ext.ll similarity index 100% rename from test/CodeGen/ARM64/coalesce-ext.ll rename to test/CodeGen/AArch64/arm64-coalesce-ext.ll diff --git a/test/CodeGen/ARM64/code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll similarity index 100% rename from test/CodeGen/ARM64/code-model-large-abs.ll rename to test/CodeGen/AArch64/arm64-code-model-large-abs.ll diff --git a/test/CodeGen/ARM64/collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll similarity index 91% rename from test/CodeGen/ARM64/collect-loh-garbage-crash.ll rename to test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll index 
98cb625d2d51..81cee38420aa 100644 --- a/test/CodeGen/ARM64/collect-loh-garbage-crash.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios -O3 -arm64-collect-loh -arm64-collect-loh-bb-only=true -arm64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh -aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s ; Check that the LOH analysis does not crash when the analysed chained ; contains instructions that are filtered out. ; diff --git a/test/CodeGen/ARM64/collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll similarity index 86% rename from test/CodeGen/ARM64/collect-loh-str.ll rename to test/CodeGen/AArch64/arm64-collect-loh-str.ll index fc63f8bcc2e6..d7bc00e318f7 100644 --- a/test/CodeGen/ARM64/collect-loh-str.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s ; Test case for . ; AdrpAddStr cannot be used when the store uses same ; register as address and value. Indeed, the related diff --git a/test/CodeGen/ARM64/collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll similarity index 85% rename from test/CodeGen/ARM64/collect-loh.ll rename to test/CodeGen/AArch64/arm64-collect-loh.ll index e92690b04f8c..6d73daac6209 100644 --- a/test/CodeGen/ARM64/collect-loh.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF +; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF ; CHECK-ELF-NOT: .loh ; CHECK-ELF-NOT: AdrpAdrp diff --git a/test/CodeGen/ARM64/complex-copy-noneon.ll b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll similarity index 100% rename from test/CodeGen/ARM64/complex-copy-noneon.ll rename to test/CodeGen/AArch64/arm64-complex-copy-noneon.ll diff --git a/test/CodeGen/ARM64/complex-ret.ll b/test/CodeGen/AArch64/arm64-complex-ret.ll similarity index 100% rename from test/CodeGen/ARM64/complex-ret.ll rename to test/CodeGen/AArch64/arm64-complex-ret.ll diff --git a/test/CodeGen/ARM64/const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll similarity index 100% rename from test/CodeGen/ARM64/const-addr.ll rename to test/CodeGen/AArch64/arm64-const-addr.ll diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll new file mode 100644 index 000000000000..7123e5e0b235 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -march=arm64 | FileCheck %s + + +define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) { +; CHECK: fptosi_v4f64_to_v4i16 +; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v1.2d +; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v0.2d +; CHECK-DAG: xtn v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d +; CHECK-DAG: xtn v[[RHS_NA:[0-9]+]].2s, 
v[[RHS]].2d +; CHECK: uzp1 v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h + %tmp1 = load <4 x double>* %ptr + %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16> + ret <4 x i16> %tmp2 +} + +define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) { +; CHECK: fptosi_v4f64_to_v4i8 +; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d +; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d +; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d +; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d +; CHECK-DAG: xtn v[[NA3:[0-9]+]].2s, v[[CONV3]].2d +; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d +; CHECK-DAG: xtn v[[NA1:[0-9]+]].2s, v[[CONV1]].2d +; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d +; CHECK-DAG: uzp1 v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h +; CHECK-DAG: uzp1 v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h +; CHECK: uzp1 v0.8b, v[[TMP2]].8b, v[[TMP1]].8b + %tmp1 = load <8 x double>* %ptr + %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8> + ret <8 x i8> %tmp2 +} + diff --git a/test/CodeGen/ARM64/copy-tuple.ll b/test/CodeGen/AArch64/arm64-copy-tuple.ll similarity index 69% rename from test/CodeGen/ARM64/copy-tuple.ll rename to test/CodeGen/AArch64/arm64-copy-tuple.ll index f99819312467..1803787d729f 100644 --- a/test/CodeGen/ARM64/copy-tuple.ll +++ b/test/CodeGen/AArch64/arm64-copy-tuple.ll @@ -13,14 +13,14 @@ define void @test_D1D2_from_D0D1(i8* %addr) #0 { ; CHECK: mov.8b v1, v0 entry: %addr_v8i8 = bitcast i8* %addr to <8 x i8>* - %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) ret void } @@ -30,14 +30,14 @@ define void @test_D0D1_from_D1D2(i8* %addr) #0 { ; CHECK: mov.8b v1, v2 entry: %addr_v8i8 = bitcast i8* %addr to <8 x i8>* - %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) tail call void asm sideeffect "", 
"~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) ret void } @@ -47,14 +47,14 @@ define void @test_D0D1_from_D31D0(i8* %addr) #0 { ; CHECK: mov.8b v0, v31 entry: %addr_v8i8 = bitcast i8* %addr to <8 x i8>* - %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) ret void } @@ -64,14 +64,14 @@ define void @test_D31D0_from_D0D1(i8* %addr) #0 { ; CHECK: mov.8b v0, v1 entry: %addr_v8i8 = bitcast i8* %addr to <8 x i8>* - %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() - tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) ret void } @@ -82,16 +82,16 @@ define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 { ; CHECK: mov.8b v2, v0 entry: %addr_v8i8 = bitcast i8* %addr to <8 x i8>* - %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0 %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1 %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2 
tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) ret void } @@ -102,15 +102,15 @@ define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 { ; CHECK: mov.16b v2, v3 entry: %addr_v16i8 = bitcast i8* %addr to <16 x i8>* - %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) ret void } @@ -121,26 +121,26 @@ define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 { ; CHECK: mov.16b v2, v31 ; CHECK: mov.16b v1, v30 %addr_v16i8 = bitcast i8* %addr to <16 x i8>* - %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3 tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"() - tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + tail call void 
@llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() - tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) ret void } -declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>*) -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>*) -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>*) -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>*) +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*) -declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) -declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) -declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) -declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) diff --git a/test/CodeGen/ARM64/crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll similarity index 58% rename from test/CodeGen/ARM64/crc32.ll rename to test/CodeGen/AArch64/arm64-crc32.ll index 5d759dcce71c..d3099e6bb132 100644 --- a/test/CodeGen/ARM64/crc32.ll +++ b/test/CodeGen/AArch64/arm64-crc32.ll @@ -4,7 +4,7 @@ define i32 @test_crc32b(i32 %cur, i8 %next) { ; CHECK-LABEL: test_crc32b: ; CHECK: crc32b w0, w0, w1 %bits = zext i8 %next to i32 - %val = call i32 @llvm.arm64.crc32b(i32 %cur, i32 %bits) + %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits) ret i32 %val } @@ -12,21 +12,21 @@ define i32 @test_crc32h(i32 %cur, i16 %next) { ; CHECK-LABEL: test_crc32h: ; CHECK: crc32h w0, w0, w1 %bits = zext i16 %next to i32 - %val = call i32 @llvm.arm64.crc32h(i32 %cur, i32 %bits) + %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits) ret i32 %val } define i32 @test_crc32w(i32 %cur, i32 %next) { ; CHECK-LABEL: test_crc32w: ; CHECK: crc32w w0, w0, w1 - %val = call i32 @llvm.arm64.crc32w(i32 %cur, i32 %next) + %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next) ret i32 %val } define i32 @test_crc32x(i32 %cur, i64 %next) { ; CHECK-LABEL: test_crc32x: ; CHECK: crc32x w0, w0, x1 - %val = call i32 @llvm.arm64.crc32x(i32 %cur, i64 %next) + %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next) ret i32 %val } @@ -34,7 +34,7 @@ define i32 @test_crc32cb(i32 %cur, i8 %next) { ; CHECK-LABEL: test_crc32cb: ; CHECK: crc32cb w0, w0, w1 %bits = zext i8 %next to i32 - %val = call i32 @llvm.arm64.crc32cb(i32 %cur, i32 %bits) + %val = 
call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits) ret i32 %val } @@ -42,30 +42,30 @@ define i32 @test_crc32ch(i32 %cur, i16 %next) { ; CHECK-LABEL: test_crc32ch: ; CHECK: crc32ch w0, w0, w1 %bits = zext i16 %next to i32 - %val = call i32 @llvm.arm64.crc32ch(i32 %cur, i32 %bits) + %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits) ret i32 %val } define i32 @test_crc32cw(i32 %cur, i32 %next) { ; CHECK-LABEL: test_crc32cw: ; CHECK: crc32cw w0, w0, w1 - %val = call i32 @llvm.arm64.crc32cw(i32 %cur, i32 %next) + %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next) ret i32 %val } define i32 @test_crc32cx(i32 %cur, i64 %next) { ; CHECK-LABEL: test_crc32cx: ; CHECK: crc32cx w0, w0, x1 - %val = call i32 @llvm.arm64.crc32cx(i32 %cur, i64 %next) + %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next) ret i32 %val } -declare i32 @llvm.arm64.crc32b(i32, i32) -declare i32 @llvm.arm64.crc32h(i32, i32) -declare i32 @llvm.arm64.crc32w(i32, i32) -declare i32 @llvm.arm64.crc32x(i32, i64) +declare i32 @llvm.aarch64.crc32b(i32, i32) +declare i32 @llvm.aarch64.crc32h(i32, i32) +declare i32 @llvm.aarch64.crc32w(i32, i32) +declare i32 @llvm.aarch64.crc32x(i32, i64) -declare i32 @llvm.arm64.crc32cb(i32, i32) -declare i32 @llvm.arm64.crc32ch(i32, i32) -declare i32 @llvm.arm64.crc32cw(i32, i32) -declare i32 @llvm.arm64.crc32cx(i32, i64) +declare i32 @llvm.aarch64.crc32cb(i32, i32) +declare i32 @llvm.aarch64.crc32ch(i32, i32) +declare i32 @llvm.aarch64.crc32cw(i32, i32) +declare i32 @llvm.aarch64.crc32cx(i32, i64) diff --git a/test/CodeGen/ARM64/crypto.ll b/test/CodeGen/AArch64/arm64-crypto.ll similarity index 50% rename from test/CodeGen/ARM64/crypto.ll rename to test/CodeGen/AArch64/arm64-crypto.ll index 0020865bcd5f..2908b336b1bd 100644 --- a/test/CodeGen/ARM64/crypto.ll +++ b/test/CodeGen/AArch64/arm64-crypto.ll @@ -1,50 +1,50 @@ -; RUN: llc -march=arm64 -mattr=crypto -arm64-neon-syntax=apple -o - %s | FileCheck %s +; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s -declare <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key) -declare <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) -declare <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data) -declare <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data) +declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) +declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) { ; CHECK-LABEL: test_aese: ; CHECK: aese.16b v0, v1 - %res = call <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key) + %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) ret <16 x i8> %res } define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) { ; CHECK-LABEL: test_aesd: ; CHECK: aesd.16b v0, v1 - %res = call <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) + %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) ret <16 x i8> %res } define <16 x i8> @test_aesmc(<16 x i8> %data) { ; CHECK-LABEL: test_aesmc: ; CHECK: aesmc.16b v0, v0 - %res = call <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data) + %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) ret <16 x i8> %res } define <16 x i8> @test_aesimc(<16 x i8> %data) { ; CHECK-LABEL: test_aesimc: ; CHECK: aesimc.16b v0, v0 - 
%res = call <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data) + %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) ret <16 x i8> %res } -declare <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) -declare <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) -declare <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) -declare i32 @llvm.arm64.crypto.sha1h(i32 %hash_e) -declare <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) -declare <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) +declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) +declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) +declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1c: ; CHECK: fmov [[HASH_E:s[0-9]+]], w0 ; CHECK: sha1c.4s q0, [[HASH_E]], v1 - %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %res } @@ -55,9 +55,9 @@ define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i3 ; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1 ; CHECK-NOT: fmov ; CHECK: sha1c.4s q0, s[[SHA1RES]], v1 - %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) %extract = extractelement <4 x i32> %res, i32 0 - %res2 = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) + %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) ret <4 x i32> %res2 } @@ -65,7 +65,7 @@ define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1p: ; CHECK: fmov [[HASH_E:s[0-9]+]], w0 ; CHECK: sha1p.4s q0, [[HASH_E]], v1 - %res = call <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %res } @@ -73,7 +73,7 @@ define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1m: ; CHECK: fmov [[HASH_E:s[0-9]+]], w0 ; CHECK: sha1m.4s q0, [[HASH_E]], v1 - %res = call <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %res } @@ -82,33 +82,33 @@ define i32 @test_sha1h(i32 %hash_e) { ; CHECK: fmov [[HASH_E:s[0-9]+]], w0 ; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]] ; CHECK: fmov w0, [[RES]] - %res = call i32 @llvm.arm64.crypto.sha1h(i32 %hash_e) + %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) ret i32 %res } define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) { ; CHECK-LABEL: test_sha1su0: ; CHECK: 
sha1su0.4s v0, v1, v2 - %res = call <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) ret <4 x i32> %res } define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) { ; CHECK-LABEL: test_sha1su1: ; CHECK: sha1su1.4s v0, v1 - %res = call <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) ret <4 x i32> %res } -declare <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) -declare <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) -declare <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) -declare <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) +declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) +declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { ; CHECK-LABEL: test_sha256h: ; CHECK: sha256h.4s q0, q1, v2 - %res = call <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) ret <4 x i32> %res } @@ -116,20 +116,20 @@ define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x ; CHECK-LABEL: test_sha256h2: ; CHECK: sha256h2.4s q0, q1, v2 - %res = call <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) ret <4 x i32> %res } define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) { ; CHECK-LABEL: test_sha256su0: ; CHECK: sha256su0.4s v0, v1 - %res = call <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) ret <4 x i32> %res } define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { ; CHECK-LABEL: test_sha256su1: ; CHECK: sha256su1.4s v0, v1, v2 - %res = call <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) ret <4 x i32> %res } diff --git a/test/CodeGen/ARM64/cse.ll b/test/CodeGen/AArch64/arm64-cse.ll similarity index 94% rename from test/CodeGen/ARM64/cse.ll rename to test/CodeGen/AArch64/arm64-cse.ll index d98bfd605390..5d62cfe76a84 100644 --- a/test/CodeGen/ARM64/cse.ll +++ b/test/CodeGen/AArch64/arm64-cse.ll @@ -1,4 +1,4 @@ -; RUN: llc -O3 < %s | FileCheck %s +; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s target triple = "arm64-apple-ios" ; rdar://12462006 @@ -13,7 +13,7 @@ entry: ; CHECK: b.ge ; CHECK: sub ; CHECK: sub -; CHECK_NOT: sub +; CHECK-NOT: sub ; CHECK: ret %0 = load i32* %offset, align 4 %cmp = icmp slt i32 %0, %size diff --git 
a/test/CodeGen/ARM64/csel.ll b/test/CodeGen/AArch64/arm64-csel.ll similarity index 100% rename from test/CodeGen/ARM64/csel.ll rename to test/CodeGen/AArch64/arm64-csel.ll diff --git a/test/CodeGen/ARM64/cvt.ll b/test/CodeGen/AArch64/arm64-cvt.ll similarity index 52% rename from test/CodeGen/ARM64/cvt.ll rename to test/CodeGen/AArch64/arm64-cvt.ll index b55a42fdf853..420a8bc04833 100644 --- a/test/CodeGen/ARM64/cvt.ll +++ b/test/CodeGen/AArch64/arm64-cvt.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s ; ; Floating-point scalar convert to signed integer (to nearest with ties to away) @@ -7,7 +7,7 @@ define i32 @fcvtas_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtas_1w1s: ;CHECK: fcvtas w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A) ret i32 %tmp3 } @@ -15,7 +15,7 @@ define i64 @fcvtas_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtas_1x1s: ;CHECK: fcvtas x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A) ret i64 %tmp3 } @@ -23,7 +23,7 @@ define i32 @fcvtas_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtas_1w1d: ;CHECK: fcvtas w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A) ret i32 %tmp3 } @@ -31,14 +31,14 @@ define i64 @fcvtas_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtas_1x1d: ;CHECK: fcvtas x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtas.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtas.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtas.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtas.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to unsigned integer @@ -47,7 +47,7 @@ define i32 @fcvtau_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtau_1w1s: ;CHECK: fcvtau w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A) ret i32 %tmp3 } @@ -55,7 +55,7 @@ define i64 @fcvtau_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtau_1x1s: ;CHECK: fcvtau x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A) ret i64 %tmp3 } @@ -63,7 +63,7 @@ define i32 @fcvtau_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtau_1w1d: ;CHECK: fcvtau w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A) ret i32 %tmp3 } @@ -71,14 +71,14 @@ define i64 @fcvtau_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtau_1x1d: ;CHECK: fcvtau x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtau.i32.f32(float) 
nounwind readnone -declare i64 @llvm.arm64.neon.fcvtau.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtau.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtau.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to signed integer (toward -Inf) @@ -87,7 +87,7 @@ define i32 @fcvtms_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtms_1w1s: ;CHECK: fcvtms w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A) ret i32 %tmp3 } @@ -95,7 +95,7 @@ define i64 @fcvtms_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtms_1x1s: ;CHECK: fcvtms x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A) ret i64 %tmp3 } @@ -103,7 +103,7 @@ define i32 @fcvtms_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtms_1w1d: ;CHECK: fcvtms w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A) ret i32 %tmp3 } @@ -111,14 +111,14 @@ define i64 @fcvtms_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtms_1x1d: ;CHECK: fcvtms x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtms.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtms.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtms.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtms.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to unsigned integer (toward -Inf) @@ -127,7 +127,7 @@ define i32 @fcvtmu_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtmu_1w1s: ;CHECK: fcvtmu w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A) ret i32 %tmp3 } @@ -135,7 +135,7 @@ define i64 @fcvtmu_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtmu_1x1s: ;CHECK: fcvtmu x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A) ret i64 %tmp3 } @@ -143,7 +143,7 @@ define i32 @fcvtmu_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtmu_1w1d: ;CHECK: fcvtmu w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A) ret i32 %tmp3 } @@ -151,14 +151,14 @@ define i64 @fcvtmu_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtmu_1x1d: ;CHECK: fcvtmu x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtmu.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtmu.i64.f32(float) nounwind readnone -declare 
i32 @llvm.arm64.neon.fcvtmu.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtmu.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to signed integer (to nearest with ties to even) @@ -167,7 +167,7 @@ define i32 @fcvtns_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtns_1w1s: ;CHECK: fcvtns w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A) ret i32 %tmp3 } @@ -175,7 +175,7 @@ define i64 @fcvtns_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtns_1x1s: ;CHECK: fcvtns x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A) ret i64 %tmp3 } @@ -183,7 +183,7 @@ define i32 @fcvtns_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtns_1w1d: ;CHECK: fcvtns w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A) ret i32 %tmp3 } @@ -191,14 +191,14 @@ define i64 @fcvtns_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtns_1x1d: ;CHECK: fcvtns x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtns.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtns.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtns.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtns.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to unsigned integer (to nearest with ties to even) @@ -207,7 +207,7 @@ define i32 @fcvtnu_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtnu_1w1s: ;CHECK: fcvtnu w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A) ret i32 %tmp3 } @@ -215,7 +215,7 @@ define i64 @fcvtnu_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtnu_1x1s: ;CHECK: fcvtnu x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A) ret i64 %tmp3 } @@ -223,7 +223,7 @@ define i32 @fcvtnu_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtnu_1w1d: ;CHECK: fcvtnu w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A) ret i32 %tmp3 } @@ -231,14 +231,14 @@ define i64 @fcvtnu_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtnu_1x1d: ;CHECK: fcvtnu x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtnu.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtnu.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtnu.i32.f64(double) nounwind 
readnone -declare i64 @llvm.arm64.neon.fcvtnu.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to signed integer (toward +Inf) @@ -247,7 +247,7 @@ define i32 @fcvtps_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtps_1w1s: ;CHECK: fcvtps w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A) ret i32 %tmp3 } @@ -255,7 +255,7 @@ define i64 @fcvtps_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtps_1x1s: ;CHECK: fcvtps x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A) ret i64 %tmp3 } @@ -263,7 +263,7 @@ define i32 @fcvtps_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtps_1w1d: ;CHECK: fcvtps w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A) ret i32 %tmp3 } @@ -271,14 +271,14 @@ define i64 @fcvtps_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtps_1x1d: ;CHECK: fcvtps x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtps.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtps.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtps.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtps.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to unsigned integer (toward +Inf) @@ -287,7 +287,7 @@ define i32 @fcvtpu_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtpu_1w1s: ;CHECK: fcvtpu w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A) ret i32 %tmp3 } @@ -295,7 +295,7 @@ define i64 @fcvtpu_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtpu_1x1s: ;CHECK: fcvtpu x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A) ret i64 %tmp3 } @@ -303,7 +303,7 @@ define i32 @fcvtpu_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtpu_1w1d: ;CHECK: fcvtpu w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A) ret i32 %tmp3 } @@ -311,14 +311,14 @@ define i64 @fcvtpu_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtpu_1x1d: ;CHECK: fcvtpu x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtpu.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtpu.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtpu.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtpu.i64.f64(double) nounwind readnone +declare i32 
@llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to signed integer (toward zero) @@ -327,7 +327,7 @@ define i32 @fcvtzs_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtzs_1w1s: ;CHECK: fcvtzs w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A) ret i32 %tmp3 } @@ -335,7 +335,7 @@ define i64 @fcvtzs_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtzs_1x1s: ;CHECK: fcvtzs x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A) ret i64 %tmp3 } @@ -343,7 +343,7 @@ define i32 @fcvtzs_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtzs_1w1d: ;CHECK: fcvtzs w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A) ret i32 %tmp3 } @@ -351,14 +351,14 @@ define i64 @fcvtzs_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtzs_1x1d: ;CHECK: fcvtzs x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtzs.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtzs.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtzs.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtzs.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone ; ; Floating-point scalar convert to unsigned integer (toward zero) @@ -367,7 +367,7 @@ define i32 @fcvtzu_1w1s(float %A) nounwind { ;CHECK-LABEL: fcvtzu_1w1s: ;CHECK: fcvtzu w0, s0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f32(float %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A) ret i32 %tmp3 } @@ -375,7 +375,7 @@ define i64 @fcvtzu_1x1s(float %A) nounwind { ;CHECK-LABEL: fcvtzu_1x1s: ;CHECK: fcvtzu x0, s0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f32(float %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A) ret i64 %tmp3 } @@ -383,7 +383,7 @@ define i32 @fcvtzu_1w1d(double %A) nounwind { ;CHECK-LABEL: fcvtzu_1w1d: ;CHECK: fcvtzu w0, d0 ;CHECK-NEXT: ret - %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f64(double %A) + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A) ret i32 %tmp3 } @@ -391,11 +391,11 @@ define i64 @fcvtzu_1x1d(double %A) nounwind { ;CHECK-LABEL: fcvtzu_1x1d: ;CHECK: fcvtzu x0, d0 ;CHECK-NEXT: ret - %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f64(double %A) + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A) ret i64 %tmp3 } -declare i32 @llvm.arm64.neon.fcvtzu.i32.f32(float) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtzu.i64.f32(float) nounwind readnone -declare i32 @llvm.arm64.neon.fcvtzu.i32.f64(double) nounwind readnone -declare i64 @llvm.arm64.neon.fcvtzu.i64.f64(double) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone +declare i64 
@llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone diff --git a/test/CodeGen/ARM64/dagcombiner-convergence.ll b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll similarity index 100% rename from test/CodeGen/ARM64/dagcombiner-convergence.ll rename to test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll diff --git a/test/CodeGen/ARM64/dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll similarity index 90% rename from test/CodeGen/ARM64/dagcombiner-dead-indexed-load.ll rename to test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll index 2cf01357324b..6eed48bf62e3 100644 --- a/test/CodeGen/ARM64/dagcombiner-dead-indexed-load.ll +++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll @@ -1,5 +1,8 @@ ; RUN: llc -mcpu=cyclone < %s | FileCheck %s +; r208640 broke ppc64/Linux self-hosting; xfailing while this is worked on. +; XFAIL: * + target datalayout = "e-i64:64-n32:64-S128" target triple = "arm64-apple-ios" diff --git a/test/CodeGen/ARM64/dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll similarity index 81% rename from test/CodeGen/ARM64/dagcombiner-indexed-load.ll rename to test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll index 2e4b658f1c9c..ce132c6afa46 100644 --- a/test/CodeGen/ARM64/dagcombiner-indexed-load.ll +++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll @@ -13,12 +13,12 @@ target triple = "arm64-apple-ios" ; CHECK-LABEL: XX: ; CHECK: ldr -define void @XX(%class.A* %K) { +define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) { entry: - br i1 undef, label %if.then, label %lor.rhs.i + br i1 %tst, label %if.then, label %lor.rhs.i lor.rhs.i: ; preds = %entry - %tmp = load i32* undef, align 4 + %tmp = load i32* %addr, align 4 %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1 %tmp1 = load i64* %y.i.i.i, align 8 %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32 @@ -30,17 +30,17 @@ lor.rhs.i: ; preds = %entry %add16.i = add nsw i32 %add12.i, %div15.i %rem.i.i = srem i32 %add16.i, %tmp %idxprom = sext i32 %rem.i.i to i64 - %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom - %tobool533 = icmp eq %class.C* undef, null + %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom + %tobool533 = icmp eq %class.C* %pC, null br i1 %tobool533, label %while.end, label %while.body if.then: ; preds = %entry - unreachable + ret i32 42 while.body: ; preds = %lor.rhs.i - unreachable + ret i32 5 while.end: ; preds = %lor.rhs.i %tmp3 = load %class.C** %arrayidx, align 8 - unreachable + ret i32 50 } diff --git a/test/CodeGen/ARM64/dagcombiner-load-slicing.ll b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll similarity index 100% rename from test/CodeGen/ARM64/dagcombiner-load-slicing.ll rename to test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll diff --git a/test/CodeGen/ARM64/dead-def-frame-index.ll b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll similarity index 100% rename from test/CodeGen/ARM64/dead-def-frame-index.ll rename to test/CodeGen/AArch64/arm64-dead-def-frame-index.ll diff --git a/test/CodeGen/ARM64/dead-register-def-bug.ll b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll similarity index 100% rename from test/CodeGen/ARM64/dead-register-def-bug.ll rename to test/CodeGen/AArch64/arm64-dead-register-def-bug.ll diff --git 
a/test/CodeGen/ARM64/dup.ll b/test/CodeGen/AArch64/arm64-dup.ll similarity index 99% rename from test/CodeGen/ARM64/dup.ll rename to test/CodeGen/AArch64/arm64-dup.ll index 97774a7d18f4..0c56b46c4176 100644 --- a/test/CodeGen/ARM64/dup.ll +++ b/test/CodeGen/AArch64/arm64-dup.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s define <8 x i8> @v_dup8(i8 %A) nounwind { ;CHECK-LABEL: v_dup8: diff --git a/test/CodeGen/ARM64/early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll similarity index 99% rename from test/CodeGen/ARM64/early-ifcvt.ll rename to test/CodeGen/AArch64/arm64-early-ifcvt.ll index 17d783a488f3..44150c29aeb0 100644 --- a/test/CodeGen/ARM64/early-ifcvt.ll +++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -stress-early-ifcvt | FileCheck %s +; RUN: llc < %s -stress-early-ifcvt -aarch64-atomic-cfg-tidy=0 | FileCheck %s target triple = "arm64-apple-macosx" ; CHECK: mm2 diff --git a/test/CodeGen/ARM64/elf-calls.ll b/test/CodeGen/AArch64/arm64-elf-calls.ll similarity index 100% rename from test/CodeGen/ARM64/elf-calls.ll rename to test/CodeGen/AArch64/arm64-elf-calls.ll diff --git a/test/CodeGen/ARM64/elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll similarity index 100% rename from test/CodeGen/ARM64/elf-constpool.ll rename to test/CodeGen/AArch64/arm64-elf-constpool.ll diff --git a/test/CodeGen/ARM64/elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll similarity index 100% rename from test/CodeGen/ARM64/elf-globals.ll rename to test/CodeGen/AArch64/arm64-elf-globals.ll diff --git a/test/CodeGen/ARM64/ext.ll b/test/CodeGen/AArch64/arm64-ext.ll similarity index 98% rename from test/CodeGen/ARM64/ext.ll rename to test/CodeGen/AArch64/arm64-ext.ll index d368eef172e8..67860de51b0f 100644 --- a/test/CodeGen/ARM64/ext.ll +++ b/test/CodeGen/AArch64/arm64-ext.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: test_vextd: diff --git a/test/CodeGen/ARM64/extend-int-to-fp.ll b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll similarity index 86% rename from test/CodeGen/ARM64/extend-int-to-fp.ll rename to test/CodeGen/AArch64/arm64-extend-int-to-fp.ll index 599a697a3103..048fdb083a41 100644 --- a/test/CodeGen/ARM64/extend-int-to-fp.ll +++ b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <4 x float> @foo(<4 x i16> %a) nounwind { ; CHECK-LABEL: foo: diff --git a/test/CodeGen/ARM64/extend.ll b/test/CodeGen/AArch64/arm64-extend.ll similarity index 100% rename from test/CodeGen/ARM64/extend.ll rename to test/CodeGen/AArch64/arm64-extend.ll diff --git a/test/CodeGen/ARM64/extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll similarity index 100% rename from test/CodeGen/ARM64/extern-weak.ll rename to test/CodeGen/AArch64/arm64-extern-weak.ll diff --git a/test/CodeGen/ARM64/extload-knownzero.ll b/test/CodeGen/AArch64/arm64-extload-knownzero.ll similarity index 100% rename from test/CodeGen/ARM64/extload-knownzero.ll rename to test/CodeGen/AArch64/arm64-extload-knownzero.ll diff --git a/test/CodeGen/ARM64/extract.ll 
b/test/CodeGen/AArch64/arm64-extract.ll similarity index 95% rename from test/CodeGen/ARM64/extract.ll rename to test/CodeGen/AArch64/arm64-extract.ll index 02e42189bb4a..01984662d23a 100644 --- a/test/CodeGen/ARM64/extract.ll +++ b/test/CodeGen/AArch64/arm64-extract.ll @@ -1,4 +1,4 @@ -; RUN: llc -arm64-extr-generation=true -verify-machineinstrs < %s \ +; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \ ; RUN: -march=arm64 | FileCheck %s define i64 @ror_i64(i64 %in) { diff --git a/test/CodeGen/ARM64/extract_subvector.ll b/test/CodeGen/AArch64/arm64-extract_subvector.ll similarity index 95% rename from test/CodeGen/ARM64/extract_subvector.ll rename to test/CodeGen/AArch64/arm64-extract_subvector.ll index 20c05fb23265..8b15a6453b2b 100644 --- a/test/CodeGen/ARM64/extract_subvector.ll +++ b/test/CodeGen/AArch64/arm64-extract_subvector.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s ; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn. diff --git a/test/CodeGen/ARM64/fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-addr-offset.ll rename to test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll diff --git a/test/CodeGen/ARM64/fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-alloca.ll rename to test/CodeGen/AArch64/arm64-fast-isel-alloca.ll diff --git a/test/CodeGen/ARM64/fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-br.ll rename to test/CodeGen/AArch64/arm64-fast-isel-br.ll diff --git a/test/CodeGen/ARM64/fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-call.ll rename to test/CodeGen/AArch64/arm64-fast-isel-call.ll diff --git a/test/CodeGen/ARM64/fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-conversion.ll rename to test/CodeGen/AArch64/arm64-fast-isel-conversion.ll diff --git a/test/CodeGen/ARM64/fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-fcmp.ll rename to test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll diff --git a/test/CodeGen/ARM64/fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-gv.ll rename to test/CodeGen/AArch64/arm64-fast-isel-gv.ll diff --git a/test/CodeGen/ARM64/fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-icmp.ll rename to test/CodeGen/AArch64/arm64-fast-isel-icmp.ll diff --git a/test/CodeGen/ARM64/fast-isel-indirectbr.ll b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-indirectbr.ll rename to test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll diff --git a/test/CodeGen/ARM64/fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll similarity index 91% rename from test/CodeGen/ARM64/fast-isel-intrinsic.ll rename to test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll index a3d5f6c3c5a1..115298805ac1 100644 --- a/test/CodeGen/ARM64/fast-isel-intrinsic.ll +++ 
b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll @@ -133,3 +133,16 @@ define void @t8() { call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 4, i32 1, i1 false) ret void } + +define void @test_distant_memcpy(i8* %dst) { +; ARM64-LABEL: test_distant_memcpy: +; ARM64: mov [[ARRAY:x[0-9]+]], sp +; ARM64: movz [[OFFSET:x[0-9]+]], #0x1f40 +; ARM64: add x[[ADDR:[0-9]+]], [[ARRAY]], [[OFFSET]] +; ARM64: ldrb [[BYTE:w[0-9]+]], [x[[ADDR]]] +; ARM64: strb [[BYTE]], [x0] + %array = alloca i8, i32 8192 + %elem = getelementptr i8* %array, i32 8000 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %elem, i64 1, i32 1, i1 false) + ret void +} diff --git a/test/CodeGen/ARM64/fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-materialize.ll rename to test/CodeGen/AArch64/arm64-fast-isel-materialize.ll diff --git a/test/CodeGen/ARM64/fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-noconvert.ll rename to test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll diff --git a/test/CodeGen/ARM64/fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-rem.ll rename to test/CodeGen/AArch64/arm64-fast-isel-rem.ll diff --git a/test/CodeGen/ARM64/fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-ret.ll rename to test/CodeGen/AArch64/arm64-fast-isel-ret.ll diff --git a/test/CodeGen/ARM64/fast-isel-select.ll b/test/CodeGen/AArch64/arm64-fast-isel-select.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel-select.ll rename to test/CodeGen/AArch64/arm64-fast-isel-select.ll diff --git a/test/CodeGen/ARM64/fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll similarity index 100% rename from test/CodeGen/ARM64/fast-isel.ll rename to test/CodeGen/AArch64/arm64-fast-isel.ll diff --git a/test/CodeGen/ARM64/fastcc-tailcall.ll b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll similarity index 100% rename from test/CodeGen/ARM64/fastcc-tailcall.ll rename to test/CodeGen/AArch64/arm64-fastcc-tailcall.ll diff --git a/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll b/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll similarity index 100% rename from test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll rename to test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll diff --git a/test/CodeGen/ARM64/fcmp-opt.ll b/test/CodeGen/AArch64/arm64-fcmp-opt.ll similarity index 98% rename from test/CodeGen/ARM64/fcmp-opt.ll rename to test/CodeGen/AArch64/arm64-fcmp-opt.ll index e79eb9c6fb1f..41027d4b5c74 100644 --- a/test/CodeGen/ARM64/fcmp-opt.ll +++ b/test/CodeGen/AArch64/arm64-fcmp-opt.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -mcpu=cyclone -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s ; rdar://10263824 define i1 @fcmp_float1(float %a) nounwind ssp { diff --git a/test/CodeGen/ARM64/fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll similarity index 100% rename from test/CodeGen/ARM64/fcopysign.ll rename to test/CodeGen/AArch64/arm64-fcopysign.ll diff --git a/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll 
similarity index 62%
rename from test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
rename to test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
index 77981f292bd8..e51c38b2b95e 100644
--- a/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
+++ b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ; DAGCombine to transform a conversion of an extract_vector_elt to an
 ; extract_vector_elt of a conversion, which saves a round trip of copies
@@ -8,8 +8,8 @@ define double @foo0(<2 x i64> %a) nounwind {
 ; CHECK: scvtf.2d [[REG:v[0-9]+]], v0, #9
 ; CHECK-NEXT: ins.d v0[0], [[REG]][1]
   %vecext = extractelement <2 x i64> %a, i32 1
-  %fcvt_n = tail call double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
+  %fcvt_n = tail call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
   ret double %fcvt_n
 }
 
-declare double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
+declare double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
diff --git a/test/CodeGen/ARM64/fmadd.ll b/test/CodeGen/AArch64/arm64-fmadd.ll
similarity index 100%
rename from test/CodeGen/ARM64/fmadd.ll
rename to test/CodeGen/AArch64/arm64-fmadd.ll
diff --git a/test/CodeGen/ARM64/fmax.ll b/test/CodeGen/AArch64/arm64-fmax.ll
similarity index 100%
rename from test/CodeGen/ARM64/fmax.ll
rename to test/CodeGen/AArch64/arm64-fmax.ll
diff --git a/test/CodeGen/AArch64/arm64-fminv.ll b/test/CodeGen/AArch64/arm64-fminv.ll
new file mode 100644
index 000000000000..f4c97355dd19
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fminv.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+define float @test_fminv_v2f32(<2 x float> %in) {
+; CHECK: test_fminv_v2f32:
+; CHECK: fminp s0, v0.2s
+  %min = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %in)
+  ret float %min
+}
+
+define float @test_fminv_v4f32(<4 x float> %in) {
+; CHECK: test_fminv_v4f32:
+; CHECK: fminv s0, v0.4s
+  %min = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %in)
+  ret float %min
+}
+
+define double @test_fminv_v2f64(<2 x double> %in) {
+; CHECK: test_fminv_v2f64:
+; CHECK: fminp d0, v0.2d
+  %min = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %in)
+  ret double %min
+}
+
+declare float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxv_v2f32:
+; CHECK: fmaxp s0, v0.2s
+  %max = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %in)
+  ret float %max
+}
+
+define float @test_fmaxv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxv_v4f32:
+; CHECK: fmaxv s0, v0.4s
+  %max = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %in)
+  ret float %max
+}
+
+define double @test_fmaxv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxv_v2f64:
+; CHECK: fmaxp d0, v0.2d
+  %max = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %in)
+  ret double %max
+}
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double>)
+
+define float @test_fminnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fminnmv_v2f32:
+; CHECK: fminnmp s0, v0.2s
+  %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %in)
+  ret float %minnm
+}
+
+define float @test_fminnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fminnmv_v4f32:
+; CHECK: fminnmv s0, v0.4s
+  %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %in)
+  ret float %minnm
+}
+
+define double @test_fminnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fminnmv_v2f64:
+; CHECK: fminnmp d0, v0.2d
+  %minnm = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+  ret double %minnm
+}
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxnmv_v2f32:
+; CHECK: fmaxnmp s0, v0.2s
+  %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
+  ret float %maxnm
+}
+
+define float @test_fmaxnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxnmv_v4f32:
+; CHECK: fmaxnmv s0, v0.4s
+  %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
+  ret float %maxnm
+}
+
+define double @test_fmaxnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxnmv_v2f64:
+; CHECK: fmaxnmp d0, v0.2d
+  %maxnm = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+  ret double %maxnm
+}
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/ARM64/fmuladd.ll b/test/CodeGen/AArch64/arm64-fmuladd.ll
similarity index 97%
rename from test/CodeGen/ARM64/fmuladd.ll
rename to test/CodeGen/AArch64/arm64-fmuladd.ll
index 174d83076718..6c5eecabd755 100644
--- a/test/CodeGen/ARM64/fmuladd.ll
+++ b/test/CodeGen/AArch64/arm64-fmuladd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define float @test_f32(float* %A, float* %B, float* %C) nounwind {
 ;CHECK-LABEL: test_f32:
diff --git a/test/CodeGen/ARM64/fold-address.ll b/test/CodeGen/AArch64/arm64-fold-address.ll
similarity index 100%
rename from test/CodeGen/ARM64/fold-address.ll
rename to test/CodeGen/AArch64/arm64-fold-address.ll
diff --git a/test/CodeGen/ARM64/fold-lsl.ll b/test/CodeGen/AArch64/arm64-fold-lsl.ll
similarity index 97%
rename from test/CodeGen/ARM64/fold-lsl.ll
rename to test/CodeGen/AArch64/arm64-fold-lsl.ll
index 2e5762dd2628..ec65e467e37d 100644
--- a/test/CodeGen/ARM64/fold-lsl.ll
+++ b/test/CodeGen/AArch64/arm64-fold-lsl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 ;
 ;
diff --git a/test/CodeGen/ARM64/fp-contract-zero.ll b/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
similarity index 100%
rename from test/CodeGen/ARM64/fp-contract-zero.ll
rename to test/CodeGen/AArch64/arm64-fp-contract-zero.ll
diff --git a/test/CodeGen/ARM64/fp-imm.ll b/test/CodeGen/AArch64/arm64-fp-imm.ll
similarity index 100%
rename from test/CodeGen/ARM64/fp-imm.ll
rename to test/CodeGen/AArch64/arm64-fp-imm.ll
diff --git a/test/CodeGen/ARM64/fp.ll b/test/CodeGen/AArch64/arm64-fp.ll
similarity index 100%
rename from test/CodeGen/ARM64/fp.ll
rename to test/CodeGen/AArch64/arm64-fp.ll
diff --git a/test/CodeGen/ARM64/fp128-folding.ll b/test/CodeGen/AArch64/arm64-fp128-folding.ll
similarity index 100%
rename from test/CodeGen/ARM64/fp128-folding.ll rename to test/CodeGen/AArch64/arm64-fp128-folding.ll diff --git a/test/CodeGen/ARM64/fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll similarity index 99% rename from test/CodeGen/ARM64/fp128.ll rename to test/CodeGen/AArch64/arm64-fp128.ll index 57bbb93e12b7..b1d50102aa28 100644 --- a/test/CodeGen/ARM64/fp128.ll +++ b/test/CodeGen/AArch64/arm64-fp128.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s @lhs = global fp128 zeroinitializer, align 16 @rhs = global fp128 zeroinitializer, align 16 diff --git a/test/CodeGen/ARM64/frame-index.ll b/test/CodeGen/AArch64/arm64-frame-index.ll similarity index 67% rename from test/CodeGen/ARM64/frame-index.ll rename to test/CodeGen/AArch64/arm64-frame-index.ll index 4a91ff31d8cc..321f3354ca21 100644 --- a/test/CodeGen/ARM64/frame-index.ll +++ b/test/CodeGen/AArch64/arm64-frame-index.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s +; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s ; rdar://11935841 define void @t1() nounwind ssp { diff --git a/test/CodeGen/ARM64/frameaddr.ll b/test/CodeGen/AArch64/arm64-frameaddr.ll similarity index 100% rename from test/CodeGen/ARM64/frameaddr.ll rename to test/CodeGen/AArch64/arm64-frameaddr.ll diff --git a/test/CodeGen/ARM64/global-address.ll b/test/CodeGen/AArch64/arm64-global-address.ll similarity index 100% rename from test/CodeGen/ARM64/global-address.ll rename to test/CodeGen/AArch64/arm64-global-address.ll diff --git a/test/CodeGen/ARM64/hello.ll b/test/CodeGen/AArch64/arm64-hello.ll similarity index 100% rename from test/CodeGen/ARM64/hello.ll rename to test/CodeGen/AArch64/arm64-hello.ll diff --git a/test/CodeGen/ARM64/i16-subreg-extract.ll b/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll similarity index 80% rename from test/CodeGen/ARM64/i16-subreg-extract.ll rename to test/CodeGen/AArch64/arm64-i16-subreg-extract.ll index fc2e8b58ac89..ba759e32aae5 100644 --- a/test/CodeGen/ARM64/i16-subreg-extract.ll +++ b/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define i32 @foo(<4 x i16>* %__a) nounwind { ; CHECK-LABEL: foo: diff --git a/test/CodeGen/ARM64/icmp-opt.ll b/test/CodeGen/AArch64/arm64-icmp-opt.ll similarity index 100% rename from test/CodeGen/ARM64/icmp-opt.ll rename to test/CodeGen/AArch64/arm64-icmp-opt.ll diff --git a/test/CodeGen/ARM64/illegal-float-ops.ll b/test/CodeGen/AArch64/arm64-illegal-float-ops.ll similarity index 100% rename from test/CodeGen/ARM64/illegal-float-ops.ll rename to test/CodeGen/AArch64/arm64-illegal-float-ops.ll diff --git a/test/CodeGen/ARM64/indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll similarity index 99% rename from test/CodeGen/ARM64/indexed-memory.ll rename to test/CodeGen/AArch64/arm64-indexed-memory.ll index e390ed7ece51..e501c6e403bd 100644 --- a/test/CodeGen/ARM64/indexed-memory.ll +++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s define void @store64(i64** nocapture %out, i64 %index, i64 %spacing) nounwind noinline ssp { ; CHECK-LABEL: 
store64:
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
new file mode 100644
index 000000000000..c118f109289b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s
+
+; This used to assert with "Overran sorted position" in AssignTopologicalOrder
+; due to a cycle created in performPostLD1Combine.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp
+define void @f(double* %P1) #0 {
+entry:
+  %arrayidx4 = getelementptr inbounds double* %P1, i64 1
+  %0 = load double* %arrayidx4, align 8, !tbaa !1
+  %1 = load double* %P1, align 8, !tbaa !1
+  %2 = insertelement <2 x double> undef, double %0, i32 0
+  %3 = insertelement <2 x double> %2, double %1, i32 1
+  %4 = fsub <2 x double> zeroinitializer, %3
+  %5 = fmul <2 x double> undef, %4
+  %6 = extractelement <2 x double> %5, i32 0
+  %cmp168 = fcmp olt double %6, undef
+  br i1 %cmp168, label %if.then172, label %return
+
+if.then172: ; preds = %cond.end90
+  %7 = tail call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false)
+  br label %return
+
+return: ; preds = %if.then172, %cond.end90, %entry
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"double", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM64/indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
similarity index 75%
rename from test/CodeGen/ARM64/indexed-vector-ldst.ll
rename to test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 3d8ff53137b6..9ee4063658b2 100644
--- a/test/CodeGen/ARM64/indexed-vector-ldst.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -615,7 +615,7 @@ define float* @test_v2f32_post_reg_st1_lane(<2 x float> %in, float* %addr) {
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld2:
 ;CHECK: ld2.16b { v0, v1 }, [x0], #32
-  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
@@ -624,19 +624,19 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld2:
 ;CHECK: ld2.16b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
 
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld2:
;CHECK: ld2.8b { v0, v1 }, [x0], #16 - %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A) + %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 16 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld2 @@ -645,19 +645,19 @@ define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) { define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i8_post_reg_ld2: ;CHECK: ld2.8b { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A) + %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld2 } -declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*) define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v8i16_post_imm_ld2: ;CHECK: ld2.8h { v0, v1 }, [x0], #32 - %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) + %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 16 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld2 @@ -666,19 +666,19 @@ define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) { define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i16_post_reg_ld2: ;CHECK: ld2.8h { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) + %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld2 } -declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) +declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*) define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v4i16_post_imm_ld2: ;CHECK: ld2.4h { v0, v1 }, [x0], #16 - %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) + %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 8 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld2 @@ -687,19 +687,19 @@ define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) { define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i16_post_reg_ld2: ;CHECK: ld2.4h { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) + %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld2 } -declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*) define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v4i32_post_imm_ld2: ;CHECK: ld2.4s { v0, v1 }, [x0], #32 - %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) + %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A) %tmp 
= getelementptr i32* %A, i32 8 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld2 @@ -708,19 +708,19 @@ define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) { define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i32_post_reg_ld2: ;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) + %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld2 } -declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*) define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v2i32_post_imm_ld2: ;CHECK: ld2.2s { v0, v1 }, [x0], #16 - %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) + %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 4 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld2 @@ -729,19 +729,19 @@ define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) { define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i32_post_reg_ld2: ;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) + %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld2 } -declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) +declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*) define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v2i64_post_imm_ld2: ;CHECK: ld2.2d { v0, v1 }, [x0], #32 - %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) + %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld2 @@ -750,19 +750,19 @@ define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) { define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i64_post_reg_ld2: ;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) + %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld2 } -declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) +declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*) define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v1i64_post_imm_ld2: ;CHECK: ld1.1d { v0, v1 }, [x0], #16 - %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) + %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 2 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld2 @@ -771,19 +771,19 @@ define { <1 x i64>, <1 x i64> } 
@test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) { define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1i64_post_reg_ld2: ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) + %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld2 } -declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) +declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*) define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** %ptr) { ;CHECK-LABEL: test_v4f32_post_imm_ld2: ;CHECK: ld2.4s { v0, v1 }, [x0], #32 - %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A) + %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 8 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld2 @@ -792,19 +792,19 @@ define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4f32_post_reg_ld2: ;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A) + %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld2 } -declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float*) +declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float*) define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** %ptr) { ;CHECK-LABEL: test_v2f32_post_imm_ld2: ;CHECK: ld2.2s { v0, v1 }, [x0], #16 - %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A) + %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 4 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld2 @@ -813,19 +813,19 @@ define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f32_post_reg_ld2: ;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A) + %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld2 } -declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float*) +declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float*) define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, double** %ptr) { ;CHECK-LABEL: test_v2f64_post_imm_ld2: ;CHECK: ld2.2d { v0, v1 }, [x0], #32 - %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A) + %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld2 @@ -834,19 +834,19 @@ define { 
<2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, doubl define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f64_post_reg_ld2: ;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A) + %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld2 } -declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double*) +declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double*) define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, double** %ptr) { ;CHECK-LABEL: test_v1f64_post_imm_ld2: ;CHECK: ld1.1d { v0, v1 }, [x0], #16 - %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) + %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 2 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld2 @@ -855,19 +855,19 @@ define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, doubl define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1f64_post_reg_ld2: ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) + %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld2 } -declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double*) +declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double*) define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v16i8_post_imm_ld3: ;CHECK: ld3.16b { v0, v1, v2 }, [x0], #48 - %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) + %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 48 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 @@ -876,19 +876,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v16i8_post_reg_ld3: ;CHECK: ld3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) + %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 } -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*) define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v8i8_post_imm_ld3: ;CHECK: ld3.8b { v0, v1, v2 }, [x0], #24 - %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) + %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A) %tmp 
= getelementptr i8* %A, i32 24 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 @@ -897,19 +897,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %pt define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i8_post_reg_ld3: ;CHECK: ld3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) + %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 } -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*) define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v8i16_post_imm_ld3: ;CHECK: ld3.8h { v0, v1, v2 }, [x0], #48 - %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) + %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 24 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 @@ -918,19 +918,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i16_post_reg_ld3: ;CHECK: ld3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) + %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 } -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*) define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v4i16_post_imm_ld3: ;CHECK: ld3.4h { v0, v1, v2 }, [x0], #24 - %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) + %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 12 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 @@ -939,19 +939,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i16_post_reg_ld3: ;CHECK: ld3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) + %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 } -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*) define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v4i32_post_imm_ld3: ;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48 - %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x 
i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) + %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 12 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 @@ -960,19 +960,19 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i32_post_reg_ld3: ;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) + %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 } -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*) define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v2i32_post_imm_ld3: ;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24 - %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) + %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 6 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 @@ -981,19 +981,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i32_post_reg_ld3: ;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) + %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 } -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*) define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v2i64_post_imm_ld3: ;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48 - %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) + %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 6 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 @@ -1002,19 +1002,19 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i64_post_reg_ld3: ;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) + %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 } -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*) define { <1 x i64>, <1 x 
i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v1i64_post_imm_ld3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 - %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) + %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 3 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 @@ -1023,19 +1023,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1i64_post_reg_ld3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) + %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 } -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*) define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* %A, float** %ptr) { ;CHECK-LABEL: test_v4f32_post_imm_ld3: ;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48 - %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A) + %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 12 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld3 @@ -1044,19 +1044,19 @@ define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4f32_post_reg_ld3: ;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A) + %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld3 } -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*) define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* %A, float** %ptr) { ;CHECK-LABEL: test_v2f32_post_imm_ld3: ;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24 - %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A) + %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 6 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld3 @@ -1065,19 +1065,19 @@ define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f32_post_reg_ld3: ;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A) + %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } 
@llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld3 } -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float*) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float*) define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(double* %A, double** %ptr) { ;CHECK-LABEL: test_v2f64_post_imm_ld3: ;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48 - %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A) + %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 6 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld3 @@ -1086,19 +1086,19 @@ define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(dou define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f64_post_reg_ld3: ;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A) + %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld3 } -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double*) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double*) define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(double* %A, double** %ptr) { ;CHECK-LABEL: test_v1f64_post_imm_ld3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 - %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) + %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 3 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld3 @@ -1107,19 +1107,19 @@ define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(dou define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1f64_post_reg_ld3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) + %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld3 } -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double*) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double*) define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v16i8_post_imm_ld4: ;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], #64 - %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) + %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A) %tmp = getelementptr 
i8* %A, i32 64 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 @@ -1128,19 +1128,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v16i8_post_reg_ld4: ;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) + %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 } -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*) define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v8i8_post_imm_ld4: ;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], #32 - %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) + %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 32 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 @@ -1149,19 +1149,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i8_post_reg_ld4: ;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) + %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 } -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*) define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v8i16_post_imm_ld4: ;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], #64 - %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) + %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 32 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 @@ -1170,19 +1170,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i16_post_reg_ld4: ;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) + %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 } -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } 
@llvm.aarch64.neon.ld4.v8i16.p0i16(i16*) define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v4i16_post_imm_ld4: ;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], #32 - %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) + %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 16 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 @@ -1191,19 +1191,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i16_post_reg_ld4: ;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) + %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 } -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*) define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v4i32_post_imm_ld4: ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64 - %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) + %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 16 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 @@ -1212,19 +1212,19 @@ define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i32_post_reg_ld4: ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) + %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 } -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*) define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v2i32_post_imm_ld4: ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32 - %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) + %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 8 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 @@ -1233,19 +1233,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: 
test_v2i32_post_reg_ld4: ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) + %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 } -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*) define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v2i64_post_imm_ld4: ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64 - %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) + %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 8 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 @@ -1254,19 +1254,19 @@ define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i64_post_reg_ld4: ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) + %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 } -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*) define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v1i64_post_imm_ld4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 - %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) + %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 @@ -1275,19 +1275,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1i64_post_reg_ld4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) + %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 } -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*) define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4(float* %A, float** %ptr) { ;CHECK-LABEL: test_v4f32_post_imm_ld4: ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64 - %ld4 
= tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A) + %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 16 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 @@ -1296,19 +1296,19 @@ define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_i define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4f32_post_reg_ld4: ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A) + %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 } -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float*) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float*) define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4(float* %A, float** %ptr) { ;CHECK-LABEL: test_v2f32_post_imm_ld4: ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32 - %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A) + %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 8 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 @@ -1317,19 +1317,19 @@ define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_i define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f32_post_reg_ld4: ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A) + %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 } -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float*) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float*) define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4(double* %A, double** %ptr) { ;CHECK-LABEL: test_v2f64_post_imm_ld4: ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64 - %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A) + %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 8 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 @@ -1338,19 +1338,19 @@ define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_po define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4(double* 
%A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f64_post_reg_ld4: ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A) + %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 } -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double*) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double*) define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4(double* %A, double** %ptr) { ;CHECK-LABEL: test_v1f64_post_imm_ld4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 - %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) + %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 @@ -1359,18 +1359,18 @@ define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_po define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1f64_post_reg_ld4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) + %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 } -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double*) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double*) define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v16i8_post_imm_ld1x2: ;CHECK: ld1.16b { v0, v1 }, [x0], #32 - %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A) + %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 32 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld1x2 @@ -1379,19 +1379,19 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) { define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v16i8_post_reg_ld1x2: ;CHECK: ld1.16b { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A) + %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld1x2 } -declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*) define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v8i8_post_imm_ld1x2: ;CHECK: ld1.8b { 
v0, v1 }, [x0], #16 - %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A) + %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 16 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld1x2 @@ -1400,19 +1400,19 @@ define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) { define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i8_post_reg_ld1x2: ;CHECK: ld1.8b { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A) + %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld1x2 } -declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*) define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v8i16_post_imm_ld1x2: ;CHECK: ld1.8h { v0, v1 }, [x0], #32 - %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A) + %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 16 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld1x2 @@ -1421,19 +1421,19 @@ define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i16_post_reg_ld1x2: ;CHECK: ld1.8h { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A) + %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld1x2 } -declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) +declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*) define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v4i16_post_imm_ld1x2: ;CHECK: ld1.4h { v0, v1 }, [x0], #16 - %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A) + %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 8 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld1x2 @@ -1442,19 +1442,19 @@ define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i16_post_reg_ld1x2: ;CHECK: ld1.4h { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A) + %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld1x2 } -declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*) define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v4i32_post_imm_ld1x2: ;CHECK: ld1.4s { v0, v1 }, [x0], #32 - %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A) + 
%ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 8 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld1x2 @@ -1463,19 +1463,19 @@ define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i32_post_reg_ld1x2: ;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A) + %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld1x2 } -declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*) define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v2i32_post_imm_ld1x2: ;CHECK: ld1.2s { v0, v1 }, [x0], #16 - %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A) + %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 4 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld1x2 @@ -1484,19 +1484,19 @@ define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i32_post_reg_ld1x2: ;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A) + %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld1x2 } -declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) +declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*) define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v2i64_post_imm_ld1x2: ;CHECK: ld1.2d { v0, v1 }, [x0], #32 - %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A) + %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld1x2 @@ -1505,19 +1505,19 @@ define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i64_post_reg_ld1x2: ;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A) + %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld1x2 } -declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) +declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*) define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v1i64_post_imm_ld1x2: ;CHECK: ld1.1d { v0, v1 }, [x0], #16 - %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A) + %ld1x2 = tail call { <1 x i64>, <1 x i64> } 
@llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 2 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld1x2 @@ -1526,19 +1526,19 @@ define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1i64_post_reg_ld1x2: ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A) + %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld1x2 } -declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) +declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*) define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float** %ptr) { ;CHECK-LABEL: test_v4f32_post_imm_ld1x2: ;CHECK: ld1.4s { v0, v1 }, [x0], #32 - %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A) + %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 8 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld1x2 @@ -1547,19 +1547,19 @@ define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float* define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4f32_post_reg_ld1x2: ;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A) + %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld1x2 } -declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) +declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*) define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float** %ptr) { ;CHECK-LABEL: test_v2f32_post_imm_ld1x2: ;CHECK: ld1.2s { v0, v1 }, [x0], #16 - %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A) + %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 4 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld1x2 @@ -1568,19 +1568,19 @@ define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float* define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f32_post_reg_ld1x2: ;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A) + %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld1x2 } -declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) +declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, double** %ptr) { ;CHECK-LABEL: test_v2f64_post_imm_ld1x2: ;CHECK: ld1.2d { v0, v1 }, [x0], #32 - %ld1x2 = tail call { <2 x double>, <2 x 
double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A) + %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld1x2 @@ -1589,19 +1589,19 @@ define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, dou define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f64_post_reg_ld1x2: ;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A) + %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld1x2 } -declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) +declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*) define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, double** %ptr) { ;CHECK-LABEL: test_v1f64_post_imm_ld1x2: ;CHECK: ld1.1d { v0, v1 }, [x0], #16 - %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A) + %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 2 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld1x2 @@ -1610,19 +1610,19 @@ define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, dou define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1f64_post_reg_ld1x2: ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}} - %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A) + %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld1x2 } -declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) +declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*) define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v16i8_post_imm_ld1x3: ;CHECK: ld1.16b { v0, v1, v2 }, [x0], #48 - %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A) + %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 48 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3 @@ -1631,19 +1631,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v16i8_post_reg_ld1x3: ;CHECK: ld1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A) + %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3 } -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } 
@llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*) define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v8i8_post_imm_ld1x3: ;CHECK: ld1.8b { v0, v1, v2 }, [x0], #24 - %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A) + %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 24 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3 @@ -1652,19 +1652,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** % define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i8_post_reg_ld1x3: ;CHECK: ld1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A) + %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3 } -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*) define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v8i16_post_imm_ld1x3: ;CHECK: ld1.8h { v0, v1, v2 }, [x0], #48 - %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A) + %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 24 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3 @@ -1673,19 +1673,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i16_post_reg_ld1x3: ;CHECK: ld1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A) + %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3 } -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*) define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v4i16_post_imm_ld1x3: ;CHECK: ld1.4h { v0, v1, v2 }, [x0], #24 - %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A) + %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 12 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3 @@ -1694,19 +1694,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i16_post_reg_ld1x3: ;CHECK: ld1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A) + %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A) %tmp = 
getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3 } -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*) define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v4i32_post_imm_ld1x3: ;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48 - %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A) + %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 12 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3 @@ -1715,19 +1715,19 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i32_post_reg_ld1x3: ;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A) + %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3 } -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*) define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v2i32_post_imm_ld1x3: ;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24 - %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A) + %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 6 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3 @@ -1736,19 +1736,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i32_post_reg_ld1x3: ;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A) + %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3 } -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*) define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v2i64_post_imm_ld1x3: ;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48 - %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A) + %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 6 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3 @@ -1757,19 +1757,19 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) { 
;CHECK-LABEL: test_v2i64_post_reg_ld1x3: ;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A) + %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3 } -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*) define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v1i64_post_imm_ld1x3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 - %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A) + %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 3 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3 @@ -1778,19 +1778,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1i64_post_reg_ld1x3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A) + %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3 } -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*) define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(float* %A, float** %ptr) { ;CHECK-LABEL: test_v4f32_post_imm_ld1x3: ;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48 - %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A) + %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 12 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3 @@ -1799,19 +1799,19 @@ define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(floa define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4f32_post_reg_ld1x3: ;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A) + %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3 } -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*) define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(float* %A, float** %ptr) { ;CHECK-LABEL: test_v2f32_post_imm_ld1x3: ;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24 - %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A) + %ld1x3 = tail 
call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 6 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3 @@ -1820,19 +1820,19 @@ define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(floa define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f32_post_reg_ld1x3: ;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A) + %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3 } -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(double* %A, double** %ptr) { ;CHECK-LABEL: test_v2f64_post_imm_ld1x3: ;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48 - %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A) + %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 6 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3 @@ -1841,19 +1841,19 @@ define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(d define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f64_post_reg_ld1x3: ;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A) + %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3 } -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*) define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(double* %A, double** %ptr) { ;CHECK-LABEL: test_v1f64_post_imm_ld1x3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24 - %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A) + %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 3 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3 @@ -1862,19 +1862,19 @@ define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(d define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1f64_post_reg_ld1x3: ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A) + %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } 
@llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3 } -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*) define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v16i8_post_imm_ld1x4: ;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], #64 - %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A) + %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 64 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4 @@ -1883,19 +1883,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v16i8_post_reg_ld1x4: ;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A) + %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4 } -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*) define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* %A, i8** %ptr) { ;CHECK-LABEL: test_v8i8_post_imm_ld1x4: ;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], #32 - %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A) + %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 32 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4 @@ -1904,19 +1904,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i8_post_reg_ld1x4: ;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A) + %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4 } -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*) define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v8i16_post_imm_ld1x4: ;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], #64 - %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A) + %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 32 store i16* 
%tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4 @@ -1925,19 +1925,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v8i16_post_reg_ld1x4: ;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A) + %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4 } -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*) define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4(i16* %A, i16** %ptr) { ;CHECK-LABEL: test_v4i16_post_imm_ld1x4: ;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], #32 - %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A) + %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 16 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4 @@ -1946,19 +1946,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i16_post_reg_ld1x4: ;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A) + %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4 } -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*) define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v4i32_post_imm_ld1x4: ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64 - %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A) + %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 16 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4 @@ -1967,19 +1967,19 @@ define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4i32_post_reg_ld1x4: ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A) + %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4 } -declare { 
<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*) define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4(i32* %A, i32** %ptr) { ;CHECK-LABEL: test_v2i32_post_imm_ld1x4: ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32 - %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A) + %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 8 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4 @@ -1988,19 +1988,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i32_post_reg_ld1x4: ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A) + %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4 } -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*) define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v2i64_post_imm_ld1x4: ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64 - %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A) + %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 8 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4 @@ -2009,19 +2009,19 @@ define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2i64_post_reg_ld1x4: ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A) + %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4 } -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*) define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4(i64* %A, i64** %ptr) { ;CHECK-LABEL: test_v1i64_post_imm_ld1x4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 - %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A) + %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4 @@ -2030,19 +2030,19 @@ define 
{ <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1i64_post_reg_ld1x4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A) + %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4 } -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*) define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x4(float* %A, float** %ptr) { ;CHECK-LABEL: test_v4f32_post_imm_ld1x4: ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64 - %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A) + %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 16 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4 @@ -2051,19 +2051,19 @@ define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_i define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v4f32_post_reg_ld1x4: ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A) + %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4 } -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*) define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x4(float* %A, float** %ptr) { ;CHECK-LABEL: test_v2f32_post_imm_ld1x4: ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32 - %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A) + %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 8 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4 @@ -2072,19 +2072,19 @@ define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_i define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f32_post_reg_ld1x4: ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A) + %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* 
%tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4 } -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x4(double* %A, double** %ptr) { ;CHECK-LABEL: test_v2f64_post_imm_ld1x4: ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64 - %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A) + %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 8 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4 @@ -2093,19 +2093,19 @@ define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_po define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v2f64_post_reg_ld1x4: ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A) + %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4 } -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*) define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x4(double* %A, double** %ptr) { ;CHECK-LABEL: test_v1f64_post_imm_ld1x4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32 - %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A) + %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4 @@ -2114,19 +2114,19 @@ define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_po define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) { ;CHECK-LABEL: test_v1f64_post_reg_ld1x4: ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A) + %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4 } -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*) define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind { ;CHECK-LABEL: 
test_v16i8_post_imm_ld2r: ;CHECK: ld2r.16b { v0, v1 }, [x0], #2 - %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 2 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld2 @@ -2135,19 +2135,19 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nou define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_ld2r: ;CHECK: ld2r.16b { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld2 } -declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly +declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_ld2r: ;CHECK: ld2r.8b { v0, v1 }, [x0], #2 - %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 2 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld2 @@ -2156,19 +2156,19 @@ define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwi define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_ld2r: ;CHECK: ld2r.8b { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld2 } -declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_ld2r: ;CHECK: ld2r.8h { v0, v1 }, [x0], #4 - %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 2 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld2 @@ -2177,19 +2177,19 @@ define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) n define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_ld2r: ;CHECK: ld2r.8h { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld2 } -declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly +declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_ld2r: ;CHECK: ld2r.4h { v0, v1 }, [x0], #4 - %ld2 = call { <4 x i16>, <4 
x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 2 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld2 @@ -2198,19 +2198,19 @@ define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) n define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_ld2r: ;CHECK: ld2r.4h { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld2 } -declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly +declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_ld2r: ;CHECK: ld2r.4s { v0, v1 }, [x0], #8 - %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 2 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld2 @@ -2219,18 +2219,18 @@ define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) n define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_ld2r: ;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld2 } -declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_ld2r: ;CHECK: ld2r.2s { v0, v1 }, [x0], #8 - %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 2 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld2 @@ -2239,19 +2239,19 @@ define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) n define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_ld2r: ;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld2 } -declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly +declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_ld2r: ;CHECK: ld2r.2d { v0, v1 }, [x0], #16 - %ld2 = call { <2 x i64>, <2 x i64> } 
@llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 2 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld2 @@ -2260,18 +2260,18 @@ define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) n define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_ld2r: ;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld2 } -declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly +declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_ld2r: ;CHECK: ld2r.1d { v0, v1 }, [x0], #16 - %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 2 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld2 @@ -2280,19 +2280,19 @@ define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) n define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_ld2r: ;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld2 } -declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly +declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** %ptr) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_ld2r: ;CHECK: ld2r.4s { v0, v1 }, [x0], #8 - %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A) + %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 2 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld2 @@ -2301,18 +2301,18 @@ define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_ld2r: ;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A) + %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld2 } -declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly +declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** %ptr) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_ld2r: ;CHECK: ld2r.2s { 
v0, v1 }, [x0], #8 - %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A) + %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 2 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld2 @@ -2321,19 +2321,19 @@ define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_ld2r: ;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A) + %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld2 } -declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly +declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, double** %ptr) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_ld2r: ;CHECK: ld2r.2d { v0, v1 }, [x0], #16 - %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A) + %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 2 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld2 @@ -2342,18 +2342,18 @@ define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, doub define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_ld2r: ;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A) + %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld2 } -declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly +declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, double** %ptr) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_ld2r: ;CHECK: ld2r.1d { v0, v1 }, [x0], #16 - %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A) + %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 2 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld2 @@ -2362,19 +2362,19 @@ define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, doub define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_ld2r: ;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}} - %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A) + %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld2 } -declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double*) nounwind 
readonly +declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_ld3r: ;CHECK: ld3r.16b { v0, v1, v2 }, [x0], #3 - %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 3 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 @@ -2383,19 +2383,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8* define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_ld3r: ;CHECK: ld3r.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 } -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_ld3r: ;CHECK: ld3r.8b { v0, v1, v2 }, [x0], #3 - %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 3 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 @@ -2404,19 +2404,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %p define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_ld3r: ;CHECK: ld3r.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 } -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_ld3r: ;CHECK: ld3r.8h { v0, v1, v2 }, [x0], #6 - %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 3 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 @@ -2425,19 +2425,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i1 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_ld3r: ;CHECK: ld3r.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } 
@llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 } -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_ld3r: ;CHECK: ld3r.4h { v0, v1, v2 }, [x0], #6 - %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 3 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 @@ -2446,19 +2446,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i1 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_ld3r: ;CHECK: ld3r.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 } -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_ld3r: ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12 - %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 3 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 @@ -2467,18 +2467,18 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i3 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_ld3r: ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 } -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_ld3r: ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12 - %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 3 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 @@ -2487,19 +2487,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i3 define { <2 
x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_ld3r: ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 } -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_ld3r: ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24 - %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 3 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 @@ -2508,18 +2508,18 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i6 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_ld3r: ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 } -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_ld3r: ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24 - %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 3 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 @@ -2528,19 +2528,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i6 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_ld3r: ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 } -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float* %A, float** %ptr) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_ld3r: ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12 - %ld3 = call { <4 x float>, <4 x float>, <4 x float> } 
@llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A) + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 3 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld3 @@ -2549,18 +2549,18 @@ define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_ld3r: ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A) + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld3 } -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float* %A, float** %ptr) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_ld3r: ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12 - %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A) + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 3 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld3 @@ -2569,19 +2569,19 @@ define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_ld3r: ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A) + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld3 } -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(double* %A, double** %ptr) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_ld3r: ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24 - %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A) + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 3 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld3 @@ -2590,18 +2590,18 @@ define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(do define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_ld3r: ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A) + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } 
@llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld3 } -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(double* %A, double** %ptr) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_ld3r: ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24 - %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A) + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 3 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld3 @@ -2610,19 +2610,19 @@ define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(do define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_ld3r: ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A) + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld3 } -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_ld4r: ;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], #4 - %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 4 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 @@ -2631,19 +2631,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r( define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_ld4r: ;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 } -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_ld4r: ;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], #4 - %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) + %ld4 = call { <8 x i8>, <8 
x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 4 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 @@ -2652,19 +2652,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* % define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_ld4r: ;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) + %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 } -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_ld4r: ;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], #8 - %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 4 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 @@ -2673,19 +2673,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r( define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_ld4r: ;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 } -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_ld4r: ;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], #8 - %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i32 4 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 @@ -2694,19 +2694,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r( define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_ld4r: ;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, 
i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 } -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_ld4r: ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16 - %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 4 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 @@ -2715,18 +2715,18 @@ define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r( define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_ld4r: ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 } -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_ld4r: ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16 - %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i32 4 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 @@ -2735,19 +2735,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r( define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_ld4r: ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 } -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_ld4r: ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32 - %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A) %tmp = 
getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 @@ -2756,18 +2756,18 @@ define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r( define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_ld4r: ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 } -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_ld4r: ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32 - %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 @@ -2776,19 +2776,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r( define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_ld4r: ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 } -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4r(float* %A, float** %ptr) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_ld4r: ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16 - %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A) + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 4 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 @@ -2797,18 +2797,18 @@ define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_i define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_ld4r: ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A) + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A) %tmp 
= getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 } -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4r(float* %A, float** %ptr) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_ld4r: ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16 - %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A) + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i32 4 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 @@ -2817,19 +2817,19 @@ define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_i define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_ld4r: ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A) + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 } -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4r(double* %A, double** %ptr) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_ld4r: ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32 - %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A) + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 @@ -2838,18 +2838,18 @@ define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_po define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_ld4r: ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A) + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 } -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly define { <1 x double>, <1 x double>, <1 x double>, <1 x 
double> } @test_v1f64_post_imm_ld4r(double* %A, double** %ptr) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_ld4r: ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32 - %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A) + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 @@ -2858,19 +2858,19 @@ define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_po define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_ld4r: ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A) + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 } -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_ld2lane: ;CHECK: ld2.b { v0, v1 }[0], [x0], #2 - %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 2 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld2 @@ -2879,19 +2879,19 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_ld2lane: ;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld2 } -declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly +declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_ld2lane: ;CHECK: ld2.b { v0, v1 }[0], [x0], #2 - %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 2 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld2 @@ -2900,19 +2900,19 @@ define { <8 x i8>, <8 x i8> } 
@test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_ld2lane: ;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8> } %ld2 } -declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_ld2lane: ;CHECK: ld2.h { v0, v1 }[0], [x0], #4 - %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 2 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld2 @@ -2921,19 +2921,19 @@ define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_ld2lane: ;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16> } %ld2 } -declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly +declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_ld2lane: ;CHECK: ld2.h { v0, v1 }[0], [x0], #4 - %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 2 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld2 @@ -2942,19 +2942,19 @@ define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_ld2lane: ;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16> } %ld2 } -declare { <4 x i16>, <4 x i16> } 
@llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly +declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], #8 - %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 2 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld2 @@ -2963,19 +2963,19 @@ define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32> } %ld2 } -declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], #8 - %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 2 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld2 @@ -2984,19 +2984,19 @@ define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32> } %ld2 } -declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly +declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], #16 - %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> 
%C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i32 2 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld2 @@ -3005,19 +3005,19 @@ define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64> } %ld2 } -declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly +declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], #16 - %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i32 2 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld2 @@ -3026,19 +3026,19 @@ define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64> } %ld2 } -declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly +declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], #8 - %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i32 2 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld2 @@ -3047,19 +3047,19 @@ define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(float* %A, floa define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + %ld2 
= call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float> } %ld2 } -declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly +declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], #8 - %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i32 2 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld2 @@ -3068,19 +3068,19 @@ define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, floa define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_ld2lane: ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float> } %ld2 } -declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly +declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], #16 - %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i32 2 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld2 @@ -3089,19 +3089,19 @@ define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, d define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double> } %ld2 } -declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly +declare { 
<2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], #16 - %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i32 2 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld2 @@ -3110,19 +3110,19 @@ define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, d define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_ld2lane: ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double> } %ld2 } -declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly +declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_ld3lane: ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3 - %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 3 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 @@ -3131,19 +3131,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_ld3lane: ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3 } -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { ;CHECK-LABEL: 
test_v8i8_post_imm_ld3lane: ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3 - %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 3 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 @@ -3152,19 +3152,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_ld3lane: ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3 } -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_ld3lane: ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6 - %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 3 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 @@ -3173,19 +3173,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_ld3lane: ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3 } -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_ld3lane: ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6 - %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } 
@llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 3 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 @@ -3194,19 +3194,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_ld3lane: ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3 } -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 - %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 3 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 @@ -3215,19 +3215,19 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3 } -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 - %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 3 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x 
i32>, <2 x i32> } %ld3 @@ -3236,19 +3236,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3 } -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 - %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i32 3 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 @@ -3257,19 +3257,19 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3 } -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 - %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i32 3 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 @@ -3278,19 +3278,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, define { <1 x i64>, <1 x i64>, <1 x i64> } 
@test_v1i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3 } -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 - %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i32 3 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld3 @@ -3299,19 +3299,19 @@ define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(fl define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float> } %ld3 } -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12 - %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i32 3 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld3 @@ -3320,19 +3320,19 @@ define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(fl define { <2 x float>, <2 x 
float>, <2 x float> } @test_v2f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_ld3lane: ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float> } %ld3 } -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 - %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i32 3 store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld3 @@ -3341,19 +3341,19 @@ define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double> } %ld3 } -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24 - %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i32 3 store double* %tmp, double** 
%ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld3 @@ -3362,19 +3362,19 @@ define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_ld3lane: ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double> } %ld3 } -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_ld4lane: ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4 - %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 4 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 @@ -3383,19 +3383,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4la define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_ld4lane: ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4 } -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_ld4lane: ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4 - %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + %ld4 = call { <8 x i8>, 
<8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 4 store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 @@ -3404,19 +3404,19 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_ld4lane: ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc store i8* %tmp, i8** %ptr ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4 } -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_ld4lane: ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8 - %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 4 store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 @@ -3425,19 +3425,19 @@ define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4la define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_ld4lane: ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4 } -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_ld4lane: ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8 - %ld4 = call { 
<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 4 store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 @@ -3446,19 +3446,19 @@ define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4la define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_ld4lane: ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc store i16* %tmp, i16** %ptr ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4 } -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 - %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 4 store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 @@ -3467,19 +3467,19 @@ define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4la define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4 } -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly define { <2 x 
i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 - %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 4 store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 @@ -3488,19 +3488,19 @@ define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4la define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) + %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc store i32* %tmp, i32** %ptr ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4 } -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 - %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 @@ -3509,19 +3509,19 @@ define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4la define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) + %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4 } -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } 
@llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 - %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) %tmp = getelementptr i64* %A, i32 4 store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 @@ -3530,19 +3530,19 @@ define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4la define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) + %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc store i64* %tmp, i64** %ptr ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4 } -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 - %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) %tmp = getelementptr float* %A, i32 4 store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 @@ -3551,19 +3551,19 @@ define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_i define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, 
i64 0, float* %A) + %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4 } -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16 - %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) %tmp = getelementptr float* %A, i32 4 store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 @@ -3572,19 +3572,19 @@ define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_i define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_ld4lane: ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) + %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc store float* %tmp, float** %ptr ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4 } -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 - %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, 
double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 @@ -3593,19 +3593,19 @@ define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_po define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) + %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4 } -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32 - %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) %tmp = getelementptr double* %A, i32 4 store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 @@ -3614,19 +3614,19 @@ define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_po define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_ld4lane: ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) + %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc store double* %tmp, double** %ptr ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4 } -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 
x double>, i64, double*) nounwind readonly define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st2: ;CHECK: st2.16b { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i32 32 ret i8* %tmp } @@ -3634,18 +3634,18 @@ define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> % define i8* @test_v16i8_post_reg_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st2: ;CHECK: st2.16b { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) define i8* @test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st2: ;CHECK: st2.8b { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i32 16 ret i8* %tmp } @@ -3653,18 +3653,18 @@ define i8* @test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) define i8* @test_v8i8_post_reg_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st2: ;CHECK: st2.8b { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st2: ;CHECK: st2.8h { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i32 16 ret i16* %tmp } @@ -3672,18 +3672,18 @@ define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16 define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st2: ;CHECK: st2.8h { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) +declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st2: ;CHECK: st2.4h { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i32 8 ret i16* %tmp } @@ 
-3691,18 +3691,18 @@ define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16 define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st2: ;CHECK: st2.4h { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) +declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st2: ;CHECK: st2.4s { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i32 8 ret i32* %tmp } @@ -3710,18 +3710,18 @@ define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32 define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st2: ;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) +declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st2: ;CHECK: st2.2s { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i32 4 ret i32* %tmp } @@ -3729,18 +3729,18 @@ define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32 define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st2: ;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) +declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st2: ;CHECK: st2.2d { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 4 ret i64* %tmp } @@ -3748,18 +3748,18 @@ define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64 define i64* @test_v2i64_post_reg_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st2: ;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) 
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) +declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st2: ;CHECK: st1.1d { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 2 ret i64* %tmp } @@ -3767,18 +3767,18 @@ define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64 define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st2: ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) +declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st2: ;CHECK: st2.4s { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) %tmp = getelementptr float* %A, i32 8 ret float* %tmp } @@ -3786,18 +3786,18 @@ define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st2: ;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*) +declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*) define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st2: ;CHECK: st2.2s { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) %tmp = getelementptr float* %A, i32 4 ret float* %tmp } @@ -3805,18 +3805,18 @@ define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st2: ;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*) +declare void 
@llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*) define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st2: ;CHECK: st2.2d { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 4 ret double* %tmp } @@ -3824,18 +3824,18 @@ define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st2: ;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*) +declare void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*) define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st2: ;CHECK: st1.1d { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 2 ret double* %tmp } @@ -3843,18 +3843,18 @@ define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st2: ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*) +declare void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*) define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st3: ;CHECK: st3.16b { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i32 48 ret i8* %tmp } @@ -3862,18 +3862,18 @@ define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> % define i8* @test_v16i8_post_reg_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st3: ;CHECK: st3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) define i8* 
@test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st3: ;CHECK: st3.8b { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i32 24 ret i8* %tmp } @@ -3881,18 +3881,18 @@ define i8* @test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, define i8* @test_v8i8_post_reg_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st3: ;CHECK: st3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st3: ;CHECK: st3.8h { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i32 24 ret i16* %tmp } @@ -3900,18 +3900,18 @@ define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16 define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st3: ;CHECK: st3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) +declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st3: ;CHECK: st3.4h { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i32 12 ret i16* %tmp } @@ -3919,18 +3919,18 @@ define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16 define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st3: ;CHECK: st3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) +declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> 
%D) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st3: ;CHECK: st3.4s { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i32 12 ret i32* %tmp } @@ -3938,18 +3938,18 @@ define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32 define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st3: ;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) +declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st3: ;CHECK: st3.2s { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i32 6 ret i32* %tmp } @@ -3957,18 +3957,18 @@ define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32 define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st3: ;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) +declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st3: ;CHECK: st3.2d { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 6 ret i64* %tmp } @@ -3976,18 +3976,18 @@ define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64 define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st3: ;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) +declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { ;CHECK-LABEL: 
test_v1i64_post_imm_st3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 3 ret i64* %tmp } @@ -3995,18 +3995,18 @@ define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64 define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) +declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st3: ;CHECK: st3.4s { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) %tmp = getelementptr float* %A, i32 12 ret float* %tmp } @@ -4014,18 +4014,18 @@ define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st3: ;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) +declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st3: ;CHECK: st3.2s { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) %tmp = getelementptr float* %A, i32 6 ret float* %tmp } @@ -4033,18 +4033,18 @@ define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, define float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st3: ;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) +declare void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, 
float*) define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st3: ;CHECK: st3.2d { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 6 ret double* %tmp } @@ -4052,18 +4052,18 @@ define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st3: ;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) +declare void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 3 ret double* %tmp } @@ -4071,18 +4071,18 @@ define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) +declare void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st4: ;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i32 64 ret i8* %tmp } @@ -4090,18 +4090,18 @@ define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> % define i8* @test_v16i8_post_reg_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st4: ;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + call void 
@llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st4: ;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i32 32 ret i8* %tmp } @@ -4109,18 +4109,18 @@ define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, define i8* @test_v8i8_post_reg_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st4: ;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) define i16* @test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st4: ;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i32 32 ret i16* %tmp } @@ -4128,18 +4128,18 @@ define i16* @test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16 define i16* @test_v8i16_post_reg_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st4: ;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) +declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st4: ;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i32 16 ret i16* %tmp } @@ -4147,18 +4147,18 @@ define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16 define i16* @test_v4i16_post_reg_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> 
%E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st4: ;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>, i16*) +declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>, i16*) define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st4: ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i32 16 ret i32* %tmp } @@ -4166,18 +4166,18 @@ define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32 define i32* @test_v4i32_post_reg_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st4: ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>, i32*) +declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>, i32*) define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st4: ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i32 8 ret i32* %tmp } @@ -4185,18 +4185,18 @@ define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32 define i32* @test_v2i32_post_reg_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st4: ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) +declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st4: ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> 
%C, <2 x i64> %D, <2 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 8 ret i64* %tmp } @@ -4204,18 +4204,18 @@ define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64 define i64* @test_v2i64_post_reg_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st4: ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>, i64*) +declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>, i64*) define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 4 ret i64* %tmp } @@ -4223,18 +4223,18 @@ define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64 define i64* @test_v1i64_post_reg_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>, i64*) +declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>, i64*) define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st4: ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) %tmp = getelementptr float* %A, i32 16 ret float* %tmp } @@ -4242,18 +4242,18 @@ define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, define float* @test_v4f32_post_reg_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st4: ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) +declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x 
float>, float*) define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st4: ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) %tmp = getelementptr float* %A, i32 8 ret float* %tmp } @@ -4261,18 +4261,18 @@ define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, define float* @test_v2f32_post_reg_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st4: ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) +declare void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st4: ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 8 ret double* %tmp } @@ -4280,18 +4280,18 @@ define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> define double* @test_v2f64_post_reg_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st4: ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>, double*) +declare void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>, double*) define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 4 ret double* %tmp } @@ -4299,18 +4299,18 @@ define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> define double* @test_v1f64_post_reg_st4(double* %A, double** %ptr, <1 x double> 
%B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) +declare void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st1x2: ;CHECK: st1.16b { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i32 32 ret i8* %tmp } @@ -4318,18 +4318,18 @@ define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> define i8* @test_v16i8_post_reg_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st1x2: ;CHECK: st1.16b { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st1x2: ;CHECK: st1.8b { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i32 16 ret i8* %tmp } @@ -4337,18 +4337,18 @@ define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C define i8* @test_v8i8_post_reg_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st1x2: ;CHECK: st1.8b { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) + call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st1x2: ;CHECK: st1.8h { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i32 16 ret i16* %tmp } @@ -4356,18 +4356,18 @@ define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st1x2: ;CHECK: st1.8h { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x 
i16> %B, <8 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) +declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st1x2: ;CHECK: st1.4h { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i32 8 ret i16* %tmp } @@ -4375,18 +4375,18 @@ define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st1x2: ;CHECK: st1.4h { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) + call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) +declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st1x2: ;CHECK: st1.4s { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i32 8 ret i32* %tmp } @@ -4394,18 +4394,18 @@ define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st1x2: ;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) +declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st1x2: ;CHECK: st1.2s { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i32 4 ret i32* %tmp } @@ -4413,18 +4413,18 @@ define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st1x2: ;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) + call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) +declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 
x i32>, i32*) define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st1x2: ;CHECK: st1.2d { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 4 ret i64* %tmp } @@ -4432,18 +4432,18 @@ define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i define i64* @test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st1x2: ;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) +declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st1x2: ;CHECK: st1.1d { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 2 ret i64* %tmp } @@ -4451,18 +4451,18 @@ define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st1x2: ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) + call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) +declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st1x2: ;CHECK: st1.4s { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) %tmp = getelementptr float* %A, i32 8 ret float* %tmp } @@ -4470,18 +4470,18 @@ define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st1x2: ;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) + call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) +declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) define float* @test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st1x2: ;CHECK: st1.2s { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 
x float> %B, <2 x float> %C, float* %A) + call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) %tmp = getelementptr float* %A, i32 4 ret float* %tmp } @@ -4489,18 +4489,18 @@ define float* @test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st1x2: ;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) + call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) +declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st1x2: ;CHECK: st1.2d { v0, v1 }, [x0], #32 - call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 4 ret double* %tmp } @@ -4508,18 +4508,18 @@ define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st1x2: ;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) + call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) +declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st1x2: ;CHECK: st1.1d { v0, v1 }, [x0], #16 - call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 2 ret double* %tmp } @@ -4527,18 +4527,18 @@ define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double define double* @test_v1f64_post_reg_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st1x2: ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) + call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) +declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st1x3: ;CHECK: st1.16b { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + call 
void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i32 48 ret i8* %tmp } @@ -4546,18 +4546,18 @@ define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> define i8* @test_v16i8_post_reg_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st1x3: ;CHECK: st1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st1x3: ;CHECK: st1.8b { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i32 24 ret i8* %tmp } @@ -4565,18 +4565,18 @@ define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C define i8* @test_v8i8_post_reg_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st1x3: ;CHECK: st1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) + call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st1x3: ;CHECK: st1.8h { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i32 24 ret i16* %tmp } @@ -4584,18 +4584,18 @@ define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st1x3: ;CHECK: st1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) +declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st1x3: ;CHECK: st1.4h { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + call void 
@llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i32 12 ret i16* %tmp } @@ -4603,18 +4603,18 @@ define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st1x3: ;CHECK: st1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) + call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) +declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st1x3: ;CHECK: st1.4s { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i32 12 ret i32* %tmp } @@ -4622,18 +4622,18 @@ define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st1x3: ;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) +declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st1x3: ;CHECK: st1.2s { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i32 6 ret i32* %tmp } @@ -4641,18 +4641,18 @@ define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st1x3: ;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) + call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) +declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st1x3: ;CHECK: st1.2d { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x 
i64> %D, i64* %A) + call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 6 ret i64* %tmp } @@ -4660,18 +4660,18 @@ define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st1x3: ;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) +declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st1x3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 3 ret i64* %tmp } @@ -4679,18 +4679,18 @@ define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st1x3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) + call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) +declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st1x3: ;CHECK: st1.4s { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) %tmp = getelementptr float* %A, i32 12 ret float* %tmp } @@ -4698,18 +4698,18 @@ define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st1x3: ;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) + call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) +declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st1x3: 
;CHECK: st1.2s { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) %tmp = getelementptr float* %A, i32 6 ret float* %tmp } @@ -4717,18 +4717,18 @@ define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st1x3: ;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) + call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) +declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st1x3: ;CHECK: st1.2d { v0, v1, v2 }, [x0], #48 - call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 6 ret double* %tmp } @@ -4736,18 +4736,18 @@ define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st1x3: ;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) + call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) +declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st1x3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24 - call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 3 ret double* %tmp } @@ -4755,18 +4755,18 @@ define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st1x3: ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) + call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void 
@llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) +declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st1x4: ;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i32 64 ret i8* %tmp } @@ -4774,18 +4774,18 @@ define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> define i8* @test_v16i8_post_reg_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st1x4: ;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st1x4: ;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i32 32 ret i8* %tmp } @@ -4793,18 +4793,18 @@ define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C define i8* @test_v8i8_post_reg_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st1x4: ;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) + call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st1x4: ;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i32 32 ret i16* %tmp } @@ -4812,18 +4812,18 @@ define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i define i16* @test_v8i16_post_reg_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st1x4: ;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - 
call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) +declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st1x4: ;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i32 16 ret i16* %tmp } @@ -4831,18 +4831,18 @@ define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i define i16* @test_v4i16_post_reg_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st1x4: ;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) + call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>, i16*) +declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>, i16*) define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st1x4: ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i32 16 ret i32* %tmp } @@ -4850,18 +4850,18 @@ define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i define i32* @test_v4i32_post_reg_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st1x4: ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>, i32*) +declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>, i32*) define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st1x4: ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) %tmp = getelementptr 
i32* %A, i32 8 ret i32* %tmp } @@ -4869,18 +4869,18 @@ define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i define i32* @test_v2i32_post_reg_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st1x4: ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) + call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) +declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st1x4: ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 8 ret i64* %tmp } @@ -4888,18 +4888,18 @@ define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i define i64* @test_v2i64_post_reg_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st1x4: ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>, i64*) +declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>, i64*) define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st1x4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 4 ret i64* %tmp } @@ -4907,18 +4907,18 @@ define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i define i64* @test_v1i64_post_reg_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st1x4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) + call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>, i64*) +declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>, i64*) define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> 
%C, <4 x float> %D, <4 x float> %E) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st1x4: ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) %tmp = getelementptr float* %A, i32 16 ret float* %tmp } @@ -4926,18 +4926,18 @@ define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B define float* @test_v4f32_post_reg_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st1x4: ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) + call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) +declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st1x4: ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) %tmp = getelementptr float* %A, i32 8 ret float* %tmp } @@ -4945,18 +4945,18 @@ define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B define float* @test_v2f32_post_reg_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st1x4: ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) + call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) +declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st1x4: ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64 - call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 8 ret double* %tmp } @@ -4964,18 +4964,18 @@ define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double define double* @test_v2f64_post_reg_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind { ;CHECK-LABEL: 
test_v2f64_post_reg_st1x4: ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) + call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>, double*) +declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>, double*) define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st1x4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32 - call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 4 ret double* %tmp } @@ -4983,33 +4983,33 @@ define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double define double* @test_v1f64_post_reg_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st1x4: ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) + call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) +declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) define i8* @test_v16i8_post_imm_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) { - call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) + call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) %tmp = getelementptr i8* %A, i32 2 ret i8* %tmp } define i8* @test_v16i8_post_reg_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) { - call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) + call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone +declare void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st2lane: ;CHECK: st2.b { v0, v1 }[0], [x0], #2 - call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 2 ret i8* %tmp } @@ -5017,18 +5017,18 @@ define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i define i8* 
@test_v16i8_post_reg_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st2lane: ;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) + call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) +declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st2lane: ;CHECK: st2.b { v0, v1 }[0], [x0], #2 - call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 2 ret i8* %tmp } @@ -5036,18 +5036,18 @@ define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> define i8* @test_v8i8_post_reg_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st2lane: ;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) + call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) +declare void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st2lane: ;CHECK: st2.h { v0, v1 }[0], [x0], #4 - call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 2 ret i16* %tmp } @@ -5055,18 +5055,18 @@ define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x define i16* @test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st2lane: ;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) + call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) +declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st2lane: ;CHECK: st2.h { v0, v1 }[0], [x0], #4 - call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 2 ret i16* %tmp } @@ -5074,18 +5074,18 @@ define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st2lane: ;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}} - 
call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) + call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) +declare void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], #8 - call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 2 ret i32* %tmp } @@ -5093,18 +5093,18 @@ define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) + call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) +declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], #8 - call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 2 ret i32* %tmp } @@ -5112,18 +5112,18 @@ define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) + call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) +declare void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], #16 - call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 2 ret i64* %tmp } @@ -5131,18 +5131,18 @@ define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) + call void 
@llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], #16 - call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 2 ret i64* %tmp } @@ -5150,18 +5150,18 @@ define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) + call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], #8 - call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i32 2 ret float* %tmp } @@ -5169,18 +5169,18 @@ define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) + call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) +declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], #8 - call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i32 2 ret float* %tmp } @@ -5188,18 +5188,18 @@ define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st2lane: ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) + call void 
@llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) +declare void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], #16 - call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i64 2 ret double* %tmp } @@ -5207,18 +5207,18 @@ define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x doub define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) + call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) +declare void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], #16 - call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i64 2 ret double* %tmp } @@ -5226,18 +5226,18 @@ define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x doub define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st2lane: ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) + call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) +declare void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st3lane: ;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3 - call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 3 ret i8* %tmp } @@ -5245,18 +5245,18 @@ define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i define i8* @test_v16i8_post_reg_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st3lane: ;CHECK: st3.b { v0, 
v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) + call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) +declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st3lane: ;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3 - call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 3 ret i8* %tmp } @@ -5264,18 +5264,18 @@ define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> define i8* @test_v8i8_post_reg_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st3lane: ;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) + call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) +declare void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st3lane: ;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6 - call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 3 ret i16* %tmp } @@ -5283,18 +5283,18 @@ define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st3lane: ;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) + call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) +declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st3lane: ;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6 - call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 3 ret i16* %tmp } @@ -5302,18 +5302,18 @@ define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x define i16* 
@test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st3lane: ;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) + call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) +declare void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 - call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 3 ret i32* %tmp } @@ -5321,18 +5321,18 @@ define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) + call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) +declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind { ;CHECK-LABEL: test_v2i32_post_imm_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 - call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 3 ret i32* %tmp } @@ -5340,18 +5340,18 @@ define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i32_post_reg_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) + call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) +declare void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind { ;CHECK-LABEL: test_v2i64_post_imm_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 - call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, 
<2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 3 ret i64* %tmp } @@ -5359,18 +5359,18 @@ define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2i64_post_reg_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) + call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind { ;CHECK-LABEL: test_v1i64_post_imm_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 - call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 3 ret i64* %tmp } @@ -5378,18 +5378,18 @@ define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v1i64_post_reg_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) + call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A) %tmp = getelementptr i64* %A, i64 %inc ret i64* %tmp } -declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind { ;CHECK-LABEL: test_v4f32_post_imm_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 - call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i32 3 ret float* %tmp } @@ -5397,18 +5397,18 @@ define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v4f32_post_reg_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) + call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) +declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> 
%B, <2 x float> %C, <2 x float> %D) nounwind { ;CHECK-LABEL: test_v2f32_post_imm_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12 - call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i32 3 ret float* %tmp } @@ -5416,18 +5416,18 @@ define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f32_post_reg_st3lane: ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) + call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A) %tmp = getelementptr float* %A, i64 %inc ret float* %tmp } -declare void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) +declare void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind { ;CHECK-LABEL: test_v2f64_post_imm_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 - call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i64 3 ret double* %tmp } @@ -5435,18 +5435,18 @@ define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x doub define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v2f64_post_reg_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) + call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) +declare void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind { ;CHECK-LABEL: test_v1f64_post_imm_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24 - call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i64 3 ret double* %tmp } @@ -5454,18 +5454,18 @@ define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x doub define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { ;CHECK-LABEL: test_v1f64_post_reg_st3lane: ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> 
%B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) + call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A) %tmp = getelementptr double* %A, i64 %inc ret double* %tmp } -declare void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) +declare void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind { ;CHECK-LABEL: test_v16i8_post_imm_st4lane: ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4 - call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 4 ret i8* %tmp } @@ -5473,18 +5473,18 @@ define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i define i8* @test_v16i8_post_reg_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v16i8_post_reg_st4lane: ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) + call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) +declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind { ;CHECK-LABEL: test_v8i8_post_imm_st4lane: ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4 - call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i32 4 ret i8* %tmp } @@ -5492,18 +5492,18 @@ define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> define i8* @test_v8i8_post_reg_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i8_post_reg_st4lane: ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) + call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A) %tmp = getelementptr i8* %A, i64 %inc ret i8* %tmp } -declare void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) +declare void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind { ;CHECK-LABEL: test_v8i16_post_imm_st4lane: ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8 - call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, 
i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 4 ret i16* %tmp } @@ -5511,18 +5511,18 @@ define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x define i16* @test_v8i16_post_reg_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v8i16_post_reg_st4lane: ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) + call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) +declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind { ;CHECK-LABEL: test_v4i16_post_imm_st4lane: ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8 - call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i32 4 ret i16* %tmp } @@ -5530,18 +5530,18 @@ define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x define i16* @test_v4i16_post_reg_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i16_post_reg_st4lane: ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) + call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A) %tmp = getelementptr i16* %A, i64 %inc ret i16* %tmp } -declare void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) +declare void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind { ;CHECK-LABEL: test_v4i32_post_imm_st4lane: ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16 - call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) %tmp = getelementptr i32* %A, i32 4 ret i32* %tmp } @@ -5549,18 +5549,18 @@ define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x define i32* @test_v4i32_post_reg_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind { ;CHECK-LABEL: test_v4i32_post_reg_st4lane: ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}} - call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) + call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A) %tmp = getelementptr i32* %A, i64 %inc ret i32* %tmp } -declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) 
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
;CHECK-LABEL: test_v2i32_post_imm_st4lane:
;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
- call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
%tmp = getelementptr i32* %A, i32 4
ret i32* %tmp
}
@@ -5568,18 +5568,18 @@ define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x
define i32* @test_v2i32_post_reg_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v2i32_post_reg_st4lane:
;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
%tmp = getelementptr i32* %A, i64 %inc
ret i32* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
;CHECK-LABEL: test_v2i64_post_imm_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
- call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
%tmp = getelementptr i64* %A, i64 4
ret i64* %tmp
}
@@ -5587,18 +5587,18 @@ define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x
define i64* @test_v2i64_post_reg_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v2i64_post_reg_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
%tmp = getelementptr i64* %A, i64 %inc
ret i64* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
;CHECK-LABEL: test_v1i64_post_imm_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
- call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
%tmp = getelementptr i64* %A, i64 4
ret i64* %tmp
}
@@ -5606,18 +5606,18 @@ define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x
define i64* @test_v1i64_post_reg_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v1i64_post_reg_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
%tmp = getelementptr i64* %A, i64 %inc
ret i64* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
;CHECK-LABEL: test_v4f32_post_imm_st4lane:
;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
- call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
%tmp = getelementptr float* %A, i32 4
ret float* %tmp
}
@@ -5625,18 +5625,18 @@ define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float>
define float* @test_v4f32_post_reg_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v4f32_post_reg_st4lane:
;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
%tmp = getelementptr float* %A, i64 %inc
ret float* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
;CHECK-LABEL: test_v2f32_post_imm_st4lane:
;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
- call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
%tmp = getelementptr float* %A, i32 4
ret float* %tmp
}
@@ -5644,18 +5644,18 @@ define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float>
define float* @test_v2f32_post_reg_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v2f32_post_reg_st4lane:
;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
%tmp = getelementptr float* %A, i64 %inc
ret float* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
;CHECK-LABEL: test_v2f64_post_imm_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
- call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
%tmp = getelementptr double* %A, i64 4
ret double* %tmp
}
@@ -5663,18 +5663,18 @@ define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x doub
define double* @test_v2f64_post_reg_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v2f64_post_reg_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
%tmp = getelementptr double* %A, i64 %inc
ret double* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*)
define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
;CHECK-LABEL: test_v1f64_post_imm_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
- call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
%tmp = getelementptr double* %A, i64 4
ret double* %tmp
}
@@ -5682,12 +5682,12 @@ define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x doub
define double* @test_v1f64_post_reg_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
;CHECK-LABEL: test_v1f64_post_reg_st4lane:
;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
- call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
%tmp = getelementptr double* %A, i64 %inc
ret double* %tmp
}
-declare void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
define <16 x i8> @test_v16i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
; CHECK-LABEL: test_v16i8_post_imm_ld1r:
diff --git a/test/CodeGen/ARM64/inline-asm-error-I.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-error-I.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-J.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-error-J.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-K.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-error-K.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-L.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-error-L.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-M.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-error-M.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
diff --git a/test/CodeGen/ARM64/inline-asm-error-N.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-error-N.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
diff --git a/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll b/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
similarity index 100%
rename from test/CodeGen/ARM64/inline-asm-zero-reg-error.ll
rename to test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
diff --git a/test/CodeGen/ARM64/inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
similarity index 98%
rename from test/CodeGen/ARM64/inline-asm.ll
rename to test/CodeGen/AArch64/arm64-inline-asm.ll
index e64507870fbe..d76cca3f21c6 100644
--- a/test/CodeGen/ARM64/inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s
; rdar://9167275
diff --git a/test/CodeGen/ARM64/join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
similarity index 100%
rename from test/CodeGen/ARM64/join-reserved.ll
rename to test/CodeGen/AArch64/arm64-join-reserved.ll
diff --git a/test/CodeGen/ARM64/jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll
similarity index 100%
rename from test/CodeGen/ARM64/jumptable.ll
rename to test/CodeGen/AArch64/arm64-jumptable.ll
diff --git a/test/CodeGen/ARM64/aarch64-large-frame.ll b/test/CodeGen/AArch64/arm64-large-frame.ll
similarity index 100%
rename from test/CodeGen/ARM64/aarch64-large-frame.ll
rename to test/CodeGen/AArch64/arm64-large-frame.ll
diff --git a/test/CodeGen/ARM64/ld1.ll b/test/CodeGen/AArch64/arm64-ld1.ll
similarity index 67%
rename from test/CodeGen/ARM64/ld1.ll
rename to test/CodeGen/AArch64/arm64-ld1.ll
index 61836a10a806..72d808ccc347 100644
--- a/test/CodeGen/ARM64/ld1.ll
+++ b/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
@@ -10,7 +10,7 @@ define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
; and from the argument of the function also defined by ABI (i.e., x0)
; CHECK ld2.8b { v0, v1 }, [x0]
; CHECK-NEXT ret
- %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
ret %struct.__neon_int8x8x2_t %tmp2
}
@@ -19,7 +19,7 @@ define
%struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.8b { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A) ret %struct.__neon_int8x8x3_t %tmp2 } @@ -28,13 +28,13 @@ define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.8b { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A) ret %struct.__neon_int8x8x4_t %tmp2 } -declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly %struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> } %struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> } @@ -45,7 +45,7 @@ define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2.16b { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A) ret %struct.__neon_int8x16x2_t %tmp2 } @@ -54,7 +54,7 @@ define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.16b { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A) ret %struct.__neon_int8x16x3_t %tmp2 } @@ -63,13 +63,13 @@ define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.16b { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A) ret %struct.__neon_int8x16x4_t %tmp2 } -declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> } %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } @@ -80,7 +80,7 @@ define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2.4h { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x4x2_t 
@llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A) ret %struct.__neon_int16x4x2_t %tmp2 } @@ -89,7 +89,7 @@ define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.4h { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A) ret %struct.__neon_int16x4x3_t %tmp2 } @@ -98,13 +98,13 @@ define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.4h { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A) ret %struct.__neon_int16x4x4_t %tmp2 } -declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> } %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> } @@ -115,7 +115,7 @@ define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2.8h { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A) ret %struct.__neon_int16x8x2_t %tmp2 } @@ -124,7 +124,7 @@ define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.8h { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A) ret %struct.__neon_int16x8x3_t %tmp2 } @@ -133,13 +133,13 @@ define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.8h { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A) ret %struct.__neon_int16x8x4_t %tmp2 } -declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> } %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x 
i32> } @@ -150,7 +150,7 @@ define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2.2s { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A) ret %struct.__neon_int32x2x2_t %tmp2 } @@ -159,7 +159,7 @@ define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.2s { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A) ret %struct.__neon_int32x2x3_t %tmp2 } @@ -168,13 +168,13 @@ define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.2s { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A) ret %struct.__neon_int32x2x4_t %tmp2 } -declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> } %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> } @@ -185,7 +185,7 @@ define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2.4s { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A) ret %struct.__neon_int32x4x2_t %tmp2 } @@ -194,7 +194,7 @@ define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.4s { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A) ret %struct.__neon_int32x4x3_t %tmp2 } @@ -203,13 +203,13 @@ define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.4s { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A) ret %struct.__neon_int32x4x4_t %tmp2 } -declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x4x3_t 
@llvm.aarch64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly %struct.__neon_int64x2x2_t = type { <2 x i64>, <2 x i64> } %struct.__neon_int64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> } @@ -220,7 +220,7 @@ define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2.2d { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A) ret %struct.__neon_int64x2x2_t %tmp2 } @@ -229,7 +229,7 @@ define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3.2d { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A) ret %struct.__neon_int64x2x3_t %tmp2 } @@ -238,13 +238,13 @@ define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4.2d { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A) ret %struct.__neon_int64x2x4_t %tmp2 } -declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly %struct.__neon_int64x1x2_t = type { <1 x i64>, <1 x i64> } %struct.__neon_int64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> } @@ -256,7 +256,7 @@ define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld1.1d { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A) ret %struct.__neon_int64x1x2_t %tmp2 } @@ -265,7 +265,7 @@ define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld1.1d { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A) ret %struct.__neon_int64x1x3_t %tmp2 } @@ -274,14 +274,14 @@ define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A) ret %struct.__neon_int64x1x4_t %tmp2 } -declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x1x3_t 
@llvm.arm64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly %struct.__neon_float64x1x2_t = type { <1 x double>, <1 x double> } %struct.__neon_float64x1x3_t = type { <1 x double>, <1 x double>, <1 x double> } @@ -293,7 +293,7 @@ define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld1.1d { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A) + %tmp2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A) ret %struct.__neon_float64x1x2_t %tmp2 } @@ -302,7 +302,7 @@ define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld1.1d { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A) + %tmp2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A) ret %struct.__neon_float64x1x3_t %tmp2 } @@ -311,13 +311,13 @@ define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A) + %tmp2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A) ret %struct.__neon_float64x1x4_t %tmp2 } -declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double*) nounwind readonly -declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double*) nounwind readonly -declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double*) nounwind readonly +declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double*) nounwind readonly +declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double*) nounwind readonly +declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* %A) nounwind { @@ -325,7 +325,7 @@ define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* ; CHECK: ld2lane_16b ; CHECK ld2.b { v0, v1 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A) + %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A) ret %struct.__neon_int8x16x2_t %tmp2 } @@ -334,7 +334,7 @@ define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 ; CHECK: ld3lane_16b ; CHECK ld3.b { v0, v1, v2 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A) + %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A) ret %struct.__neon_int8x16x3_t %tmp2 } @@ -343,20 +343,20 @@ define %struct.__neon_int8x16x4_t 
@ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 ; CHECK: ld4lane_16b ; CHECK ld4.b { v0, v1, v2, v3 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A) + %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A) ret %struct.__neon_int8x16x4_t %tmp2 } -declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly -declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly -declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly +declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly +declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly +declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK: ld2lane_8h ; CHECK ld2.h { v0, v1 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A) + %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A) ret %struct.__neon_int16x8x2_t %tmp2 } @@ -365,7 +365,7 @@ define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x ; CHECK: ld3lane_8h ; CHECK ld3.h { v0, v1, v3 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A) + %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A) ret %struct.__neon_int16x8x3_t %tmp2 } @@ -374,20 +374,20 @@ define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x ; CHECK: ld4lane_8h ; CHECK ld4.h { v0, v1, v2, v3 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A) + %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A) ret %struct.__neon_int16x8x4_t %tmp2 } -declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly -declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly -declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly +declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly +declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly +declare %struct.__neon_int16x8x4_t 
@llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK: ld2lane_4s ; CHECK ld2.s { v0, v1 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A) + %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A) ret %struct.__neon_int32x4x2_t %tmp2 } @@ -396,7 +396,7 @@ define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x ; CHECK: ld3lane_4s ; CHECK ld3.s { v0, v1, v2 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A) + %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A) ret %struct.__neon_int32x4x3_t %tmp2 } @@ -405,20 +405,20 @@ define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x ; CHECK: ld4lane_4s ; CHECK ld4.s { v0, v1, v2, v3 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A) + %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A) ret %struct.__neon_int32x4x4_t %tmp2 } -declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly -declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly -declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly +declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly +declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly +declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK: ld2lane_2d ; CHECK ld2.d { v0, v1 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A) + %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A) ret %struct.__neon_int64x2x2_t %tmp2 } @@ -427,7 +427,7 @@ define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x ; CHECK: ld3lane_2d ; CHECK ld3.d { v0, v1, v3 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A) + %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A) ret %struct.__neon_int64x2x3_t %tmp2 } @@ -436,13 +436,13 @@ define %struct.__neon_int64x2x4_t 
@ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x ; CHECK: ld4lane_2d ; CHECK ld4.d { v0, v1, v2, v3 }[1], [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A) + %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A) ret %struct.__neon_int64x2x4_t %tmp2 } -declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly -declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly -declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly +declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly +declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly +declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly define <8 x i8> @ld1r_8b(i8* %bar) { ; CHECK: ld1r_8b @@ -556,7 +556,7 @@ define %struct.__neon_int8x8x2_t @ld2r_8b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.8b { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A) ret %struct.__neon_int8x8x2_t %tmp2 } @@ -565,7 +565,7 @@ define %struct.__neon_int8x8x3_t @ld3r_8b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.8b { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A) ret %struct.__neon_int8x8x3_t %tmp2 } @@ -574,20 +574,20 @@ define %struct.__neon_int8x8x4_t @ld4r_8b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.8b { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A) ret %struct.__neon_int8x8x4_t %tmp2 } -declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly define %struct.__neon_int8x16x2_t @ld2r_16b(i8* %A) nounwind { ; CHECK: ld2r_16b ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.16b { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A) ret %struct.__neon_int8x16x2_t %tmp2 } @@ -596,7 +596,7 @@ define %struct.__neon_int8x16x3_t @ld3r_16b(i8* %A) nounwind 
{ ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.16b { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A) ret %struct.__neon_int8x16x3_t %tmp2 } @@ -605,20 +605,20 @@ define %struct.__neon_int8x16x4_t @ld4r_16b(i8* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.16b { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A) + %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A) ret %struct.__neon_int8x16x4_t %tmp2 } -declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly define %struct.__neon_int16x4x2_t @ld2r_4h(i16* %A) nounwind { ; CHECK: ld2r_4h ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.4h { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A) ret %struct.__neon_int16x4x2_t %tmp2 } @@ -627,7 +627,7 @@ define %struct.__neon_int16x4x3_t @ld3r_4h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.4h { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A) ret %struct.__neon_int16x4x3_t %tmp2 } @@ -636,20 +636,20 @@ define %struct.__neon_int16x4x4_t @ld4r_4h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.4h { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A) ret %struct.__neon_int16x4x4_t %tmp2 } -declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly define %struct.__neon_int16x8x2_t @ld2r_8h(i16* %A) nounwind { ; CHECK: ld2r_8h ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.8h { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A) ret %struct.__neon_int16x8x2_t %tmp2 } @@ -658,7 +658,7 @@ define %struct.__neon_int16x8x3_t @ld3r_8h(i16* %A) nounwind { ; 
Make sure we are using the operands defined by the ABI ; CHECK ld3r.8h { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A) ret %struct.__neon_int16x8x3_t %tmp2 } @@ -667,20 +667,20 @@ define %struct.__neon_int16x8x4_t @ld4r_8h(i16* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.8h { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A) + %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A) ret %struct.__neon_int16x8x4_t %tmp2 } -declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly define %struct.__neon_int32x2x2_t @ld2r_2s(i32* %A) nounwind { ; CHECK: ld2r_2s ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.2s { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A) ret %struct.__neon_int32x2x2_t %tmp2 } @@ -689,7 +689,7 @@ define %struct.__neon_int32x2x3_t @ld3r_2s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.2s { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A) ret %struct.__neon_int32x2x3_t %tmp2 } @@ -698,20 +698,20 @@ define %struct.__neon_int32x2x4_t @ld4r_2s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.2s { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A) ret %struct.__neon_int32x2x4_t %tmp2 } -declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly define %struct.__neon_int32x4x2_t @ld2r_4s(i32* %A) nounwind { ; CHECK: ld2r_4s ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.4s { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A) ret %struct.__neon_int32x4x2_t %tmp2 } @@ -720,7 +720,7 @@ define %struct.__neon_int32x4x3_t @ld3r_4s(i32* %A) 
nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.4s { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A) ret %struct.__neon_int32x4x3_t %tmp2 } @@ -729,20 +729,20 @@ define %struct.__neon_int32x4x4_t @ld4r_4s(i32* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.4s { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A) + %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A) ret %struct.__neon_int32x4x4_t %tmp2 } -declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly define %struct.__neon_int64x1x2_t @ld2r_1d(i64* %A) nounwind { ; CHECK: ld2r_1d ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.1d { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A) ret %struct.__neon_int64x1x2_t %tmp2 } @@ -751,7 +751,7 @@ define %struct.__neon_int64x1x3_t @ld3r_1d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.1d { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A) ret %struct.__neon_int64x1x3_t %tmp2 } @@ -760,20 +760,20 @@ define %struct.__neon_int64x1x4_t @ld4r_1d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.1d { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A) ret %struct.__neon_int64x1x4_t %tmp2 } -declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly define %struct.__neon_int64x2x2_t @ld2r_2d(i64* %A) nounwind { ; CHECK: ld2r_2d ; Make sure we are using the operands defined by the ABI ; CHECK ld2r.2d { v0, v1 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A) ret %struct.__neon_int64x2x2_t %tmp2 } @@ -782,7 +782,7 @@ define %struct.__neon_int64x2x3_t 
@ld3r_2d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld3r.2d { v0, v1, v2 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A) ret %struct.__neon_int64x2x3_t %tmp2 } @@ -791,13 +791,13 @@ define %struct.__neon_int64x2x4_t @ld4r_2d(i64* %A) nounwind { ; Make sure we are using the operands defined by the ABI ; CHECK ld4r.2d { v0, v1, v2, v3 }, [x0] ; CHECK-NEXT ret - %tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A) + %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A) ret %struct.__neon_int64x2x4_t %tmp2 } -declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) { ; CHECK-LABEL: ld1_16b @@ -1041,52 +1041,52 @@ entry: %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> } %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> } -declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly -declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly +declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly +declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(i8* %addr) { ; CHECK-LABEL: ld1_x2_v8i8: ; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %addr) + %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %addr) ret %struct.__neon_int8x8x2_t %val } define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(i16* %addr) { ; CHECK-LABEL: ld1_x2_v4i16: ; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %addr) + %val = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %addr) ret %struct.__neon_int16x4x2_t %val } define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(i32* %addr) { ; CHECK-LABEL: 
ld1_x2_v2i32: ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %addr) + %val = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %addr) ret %struct.__neon_int32x2x2_t %val } define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(float* %addr) { ; CHECK-LABEL: ld1_x2_v2f32: ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %addr) + %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr) ret %struct.__neon_float32x2x2_t %val } define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(i64* %addr) { ; CHECK-LABEL: ld1_x2_v1i64: ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %addr) + %val = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %addr) ret %struct.__neon_int64x1x2_t %val } define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) { ; CHECK-LABEL: ld1_x2_v1f64: ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %addr) + %val = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %addr) ret %struct.__neon_float64x1x2_t %val } @@ -1099,247 +1099,247 @@ define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) { %struct.__neon_float64x2x3_t = type { <2 x double>, <2 x double>, <2 x double> } %struct.__neon_float64x2x4_t = type { <2 x double>, <2 x double>, <2 x double>, <2 x double> } -declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly -declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly +declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly +declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(i8* %addr) { ; CHECK-LABEL: ld1_x2_v16i8: ; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %addr) + %val = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %addr) ret %struct.__neon_int8x16x2_t %val } define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(i16* %addr) { ; CHECK-LABEL: ld1_x2_v8i16: ; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %addr) + %val = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %addr) ret %struct.__neon_int16x8x2_t %val } define 
%struct.__neon_int32x4x2_t @ld1_x2_v4i32(i32* %addr) { ; CHECK-LABEL: ld1_x2_v4i32: ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %addr) + %val = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %addr) ret %struct.__neon_int32x4x2_t %val } define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(float* %addr) { ; CHECK-LABEL: ld1_x2_v4f32: ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %addr) + %val = call %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %addr) ret %struct.__neon_float32x4x2_t %val } define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(i64* %addr) { ; CHECK-LABEL: ld1_x2_v2i64: ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %addr) + %val = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %addr) ret %struct.__neon_int64x2x2_t %val } define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(double* %addr) { ; CHECK-LABEL: ld1_x2_v2f64: ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %addr) + %val = call %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %addr) ret %struct.__neon_float64x2x2_t %val } -declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly -declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly +declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly +declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(i8* %addr) { ; CHECK-LABEL: ld1_x3_v8i8: ; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %addr) + %val = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %addr) ret %struct.__neon_int8x8x3_t %val } define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(i16* %addr) { ; CHECK-LABEL: ld1_x3_v4i16: ; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %addr) + %val = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %addr) ret %struct.__neon_int16x4x3_t %val } define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(i32* %addr) { ; CHECK-LABEL: ld1_x3_v2i32: ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int32x2x3_t 
@llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %addr) + %val = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %addr) ret %struct.__neon_int32x2x3_t %val } define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(float* %addr) { ; CHECK-LABEL: ld1_x3_v2f32: ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %addr) + %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr) ret %struct.__neon_float32x2x3_t %val } define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(i64* %addr) { ; CHECK-LABEL: ld1_x3_v1i64: ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %addr) + %val = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %addr) ret %struct.__neon_int64x1x3_t %val } define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(double* %addr) { ; CHECK-LABEL: ld1_x3_v1f64: ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %addr) + %val = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %addr) ret %struct.__neon_float64x1x3_t %val } -declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly -declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly +declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly +declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(i8* %addr) { ; CHECK-LABEL: ld1_x3_v16i8: ; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %addr) + %val = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %addr) ret %struct.__neon_int8x16x3_t %val } define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(i16* %addr) { ; CHECK-LABEL: ld1_x3_v8i16: ; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %addr) + %val = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %addr) ret %struct.__neon_int16x8x3_t %val } define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(i32* %addr) { ; CHECK-LABEL: ld1_x3_v4i32: ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %addr) + %val = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* 
%addr) ret %struct.__neon_int32x4x3_t %val } define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(float* %addr) { ; CHECK-LABEL: ld1_x3_v4f32: ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %addr) + %val = call %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %addr) ret %struct.__neon_float32x4x3_t %val } define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(i64* %addr) { ; CHECK-LABEL: ld1_x3_v2i64: ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %addr) + %val = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %addr) ret %struct.__neon_int64x2x3_t %val } define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(double* %addr) { ; CHECK-LABEL: ld1_x3_v2f64: ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %addr) + %val = call %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %addr) ret %struct.__neon_float64x2x3_t %val } -declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly -declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly +declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly +declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(i8* %addr) { ; CHECK-LABEL: ld1_x4_v8i8: ; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %addr) + %val = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %addr) ret %struct.__neon_int8x8x4_t %val } define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(i16* %addr) { ; CHECK-LABEL: ld1_x4_v4i16: ; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %addr) + %val = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %addr) ret %struct.__neon_int16x4x4_t %val } define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(i32* %addr) { ; CHECK-LABEL: ld1_x4_v2i32: ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %addr) + %val = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %addr) ret %struct.__neon_int32x2x4_t %val } define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(float* %addr) 
{ ; CHECK-LABEL: ld1_x4_v2f32: ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %addr) + %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr) ret %struct.__neon_float32x2x4_t %val } define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(i64* %addr) { ; CHECK-LABEL: ld1_x4_v1i64: ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %addr) + %val = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %addr) ret %struct.__neon_int64x1x4_t %val } define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(double* %addr) { ; CHECK-LABEL: ld1_x4_v1f64: ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %addr) + %val = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %addr) ret %struct.__neon_float64x1x4_t %val } -declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly -declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly -declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly -declare %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly -declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly -declare %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly +declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly +declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly +declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly +declare %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly +declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly +declare %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(i8* %addr) { ; CHECK-LABEL: ld1_x4_v16i8: ; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %addr) + %val = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %addr) ret %struct.__neon_int8x16x4_t %val } define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(i16* %addr) { ; CHECK-LABEL: ld1_x4_v8i16: ; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %addr) + %val = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %addr) ret %struct.__neon_int16x8x4_t %val } define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(i32* %addr) { ; CHECK-LABEL: ld1_x4_v4i32: ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %addr) + %val = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %addr) ret %struct.__neon_int32x4x4_t %val } define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(float* %addr) { ; CHECK-LABEL: ld1_x4_v4f32: ; CHECK: ld1.4s { 
{{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %addr) + %val = call %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %addr) ret %struct.__neon_float32x4x4_t %val } define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(i64* %addr) { ; CHECK-LABEL: ld1_x4_v2i64: ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %addr) + %val = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %addr) ret %struct.__neon_int64x2x4_t %val } define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(double* %addr) { ; CHECK-LABEL: ld1_x4_v2f64: ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - %val = call %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %addr) + %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %addr) ret %struct.__neon_float64x2x4_t %val } diff --git a/test/CodeGen/ARM64/ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll similarity index 98% rename from test/CodeGen/ARM64/ldp.ll rename to test/CodeGen/AArch64/arm64-ldp.ll index 9444385f8ab3..5a986261b31b 100644 --- a/test/CodeGen/ARM64/ldp.ll +++ b/test/CodeGen/AArch64/arm64-ldp.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\ +; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\ ; RUN: -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s ; CHECK: ldp_int diff --git a/test/CodeGen/ARM64/ldur.ll b/test/CodeGen/AArch64/arm64-ldur.ll similarity index 100% rename from test/CodeGen/ARM64/ldur.ll rename to test/CodeGen/AArch64/arm64-ldur.ll diff --git a/test/CodeGen/ARM64/ldxr-stxr.ll b/test/CodeGen/AArch64/arm64-ldxr-stxr.ll similarity index 69% rename from test/CodeGen/ARM64/ldxr-stxr.ll rename to test/CodeGen/AArch64/arm64-ldxr-stxr.ll index ed53a14ca8cc..9093df27cddc 100644 --- a/test/CodeGen/ARM64/ldxr-stxr.ll +++ b/test/CodeGen/AArch64/arm64-ldxr-stxr.ll @@ -6,7 +6,7 @@ define i128 @f0(i8* %p) nounwind readonly { ; CHECK-LABEL: f0: ; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0] entry: - %ldrexd = tail call %0 @llvm.arm64.ldxp(i8* %p) + %ldrexd = tail call %0 @llvm.aarch64.ldxp(i8* %p) %0 = extractvalue %0 %ldrexd, 1 %1 = extractvalue %0 %ldrexd, 0 %2 = zext i64 %0 to i128 @@ -23,12 +23,12 @@ entry: %tmp4 = trunc i128 %val to i64 %tmp6 = lshr i128 %val, 64 %tmp7 = trunc i128 %tmp6 to i64 - %strexd = tail call i32 @llvm.arm64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr) + %strexd = tail call i32 @llvm.aarch64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr) ret i32 %strexd } -declare %0 @llvm.arm64.ldxp(i8*) nounwind -declare i32 @llvm.arm64.stxp(i64, i64, i8*) nounwind +declare %0 @llvm.aarch64.ldxp(i8*) nounwind +declare i32 @llvm.aarch64.stxp(i64, i64, i8*) nounwind @var = global i64 0, align 8 @@ -39,7 +39,7 @@ define void @test_load_i8(i8* %addr) { ; CHECK-NOT: and ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldxr.p0i8(i8* %addr) + %val = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) %shortval = trunc i64 %val to i8 %extval = zext i8 %shortval to i64 store i64 %extval, i64* @var, align 8 @@ -53,7 +53,7 @@ define void @test_load_i16(i16* %addr) { ; CHECK-NOT: and ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldxr.p0i16(i16* %addr) + %val = call i64 
@llvm.aarch64.ldxr.p0i16(i16* %addr) %shortval = trunc i64 %val to i16 %extval = zext i16 %shortval to i64 store i64 %extval, i64* @var, align 8 @@ -67,7 +67,7 @@ define void @test_load_i32(i32* %addr) { ; CHECK-NOT: and ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldxr.p0i32(i32* %addr) + %val = call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) %shortval = trunc i64 %val to i32 %extval = zext i32 %shortval to i64 store i64 %extval, i64* @var, align 8 @@ -79,16 +79,16 @@ define void @test_load_i64(i64* %addr) { ; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0] ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldxr.p0i64(i64* %addr) + %val = call i64 @llvm.aarch64.ldxr.p0i64(i64* %addr) store i64 %val, i64* @var, align 8 ret void } -declare i64 @llvm.arm64.ldxr.p0i8(i8*) nounwind -declare i64 @llvm.arm64.ldxr.p0i16(i16*) nounwind -declare i64 @llvm.arm64.ldxr.p0i32(i32*) nounwind -declare i64 @llvm.arm64.ldxr.p0i64(i64*) nounwind +declare i64 @llvm.aarch64.ldxr.p0i8(i8*) nounwind +declare i64 @llvm.aarch64.ldxr.p0i16(i16*) nounwind +declare i64 @llvm.aarch64.ldxr.p0i32(i32*) nounwind +declare i64 @llvm.aarch64.ldxr.p0i64(i64*) nounwind define i32 @test_store_i8(i32, i8 %val, i8* %addr) { ; CHECK-LABEL: test_store_i8: @@ -96,7 +96,7 @@ define i32 @test_store_i8(i32, i8 %val, i8* %addr) { ; CHECK-NOT: and ; CHECK: stxrb w0, w1, [x2] %extval = zext i8 %val to i64 - %res = call i32 @llvm.arm64.stxr.p0i8(i64 %extval, i8* %addr) + %res = call i32 @llvm.aarch64.stxr.p0i8(i64 %extval, i8* %addr) ret i32 %res } @@ -106,7 +106,7 @@ define i32 @test_store_i16(i32, i16 %val, i16* %addr) { ; CHECK-NOT: and ; CHECK: stxrh w0, w1, [x2] %extval = zext i16 %val to i64 - %res = call i32 @llvm.arm64.stxr.p0i16(i64 %extval, i16* %addr) + %res = call i32 @llvm.aarch64.stxr.p0i16(i64 %extval, i16* %addr) ret i32 %res } @@ -116,36 +116,36 @@ define i32 @test_store_i32(i32, i32 %val, i32* %addr) { ; CHECK-NOT: and ; CHECK: stxr w0, w1, [x2] %extval = zext i32 %val to i64 - %res = call i32 @llvm.arm64.stxr.p0i32(i64 %extval, i32* %addr) + %res = call i32 @llvm.aarch64.stxr.p0i32(i64 %extval, i32* %addr) ret i32 %res } define i32 @test_store_i64(i32, i64 %val, i64* %addr) { ; CHECK-LABEL: test_store_i64: ; CHECK: stxr w0, x1, [x2] - %res = call i32 @llvm.arm64.stxr.p0i64(i64 %val, i64* %addr) + %res = call i32 @llvm.aarch64.stxr.p0i64(i64 %val, i64* %addr) ret i32 %res } -declare i32 @llvm.arm64.stxr.p0i8(i64, i8*) nounwind -declare i32 @llvm.arm64.stxr.p0i16(i64, i16*) nounwind -declare i32 @llvm.arm64.stxr.p0i32(i64, i32*) nounwind -declare i32 @llvm.arm64.stxr.p0i64(i64, i64*) nounwind +declare i32 @llvm.aarch64.stxr.p0i8(i64, i8*) nounwind +declare i32 @llvm.aarch64.stxr.p0i16(i64, i16*) nounwind +declare i32 @llvm.aarch64.stxr.p0i32(i64, i32*) nounwind +declare i32 @llvm.aarch64.stxr.p0i64(i64, i64*) nounwind ; CHECK: test_clear: ; CHECK: clrex define void @test_clear() { - call void @llvm.arm64.clrex() + call void @llvm.aarch64.clrex() ret void } -declare void @llvm.arm64.clrex() nounwind +declare void @llvm.aarch64.clrex() nounwind define i128 @test_load_acquire_i128(i8* %p) nounwind readonly { ; CHECK-LABEL: test_load_acquire_i128: ; CHECK: ldaxp {{x[0-9]+}}, {{x[0-9]+}}, [x0] entry: - %ldrexd = tail call %0 @llvm.arm64.ldaxp(i8* %p) + %ldrexd = tail call %0 @llvm.aarch64.ldaxp(i8* %p) %0 = extractvalue %0 %ldrexd, 1 %1 = extractvalue %0 %ldrexd, 0 %2 = zext i64 %0 to i128 @@ -162,12 +162,12 @@ entry: %tmp4 = trunc i128 %val to i64 %tmp6 = lshr i128 
%val, 64 %tmp7 = trunc i128 %tmp6 to i64 - %strexd = tail call i32 @llvm.arm64.stlxp(i64 %tmp4, i64 %tmp7, i8* %ptr) + %strexd = tail call i32 @llvm.aarch64.stlxp(i64 %tmp4, i64 %tmp7, i8* %ptr) ret i32 %strexd } -declare %0 @llvm.arm64.ldaxp(i8*) nounwind -declare i32 @llvm.arm64.stlxp(i64, i64, i8*) nounwind +declare %0 @llvm.aarch64.ldaxp(i8*) nounwind +declare i32 @llvm.aarch64.stlxp(i64, i64, i8*) nounwind define void @test_load_acquire_i8(i8* %addr) { ; CHECK-LABEL: test_load_acquire_i8: @@ -176,7 +176,7 @@ define void @test_load_acquire_i8(i8* %addr) { ; CHECK-NOT: and ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldaxr.p0i8(i8* %addr) + %val = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) %shortval = trunc i64 %val to i8 %extval = zext i8 %shortval to i64 store i64 %extval, i64* @var, align 8 @@ -190,7 +190,7 @@ define void @test_load_acquire_i16(i16* %addr) { ; CHECK-NOT: and ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldaxr.p0i16(i16* %addr) + %val = call i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr) %shortval = trunc i64 %val to i16 %extval = zext i16 %shortval to i64 store i64 %extval, i64* @var, align 8 @@ -204,7 +204,7 @@ define void @test_load_acquire_i32(i32* %addr) { ; CHECK-NOT: and ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldaxr.p0i32(i32* %addr) + %val = call i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr) %shortval = trunc i64 %val to i32 %extval = zext i32 %shortval to i64 store i64 %extval, i64* @var, align 8 @@ -216,16 +216,16 @@ define void @test_load_acquire_i64(i64* %addr) { ; CHECK: ldaxr x[[LOADVAL:[0-9]+]], [x0] ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var] - %val = call i64 @llvm.arm64.ldaxr.p0i64(i64* %addr) + %val = call i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr) store i64 %val, i64* @var, align 8 ret void } -declare i64 @llvm.arm64.ldaxr.p0i8(i8*) nounwind -declare i64 @llvm.arm64.ldaxr.p0i16(i16*) nounwind -declare i64 @llvm.arm64.ldaxr.p0i32(i32*) nounwind -declare i64 @llvm.arm64.ldaxr.p0i64(i64*) nounwind +declare i64 @llvm.aarch64.ldaxr.p0i8(i8*) nounwind +declare i64 @llvm.aarch64.ldaxr.p0i16(i16*) nounwind +declare i64 @llvm.aarch64.ldaxr.p0i32(i32*) nounwind +declare i64 @llvm.aarch64.ldaxr.p0i64(i64*) nounwind define i32 @test_store_release_i8(i32, i8 %val, i8* %addr) { ; CHECK-LABEL: test_store_release_i8: @@ -233,7 +233,7 @@ define i32 @test_store_release_i8(i32, i8 %val, i8* %addr) { ; CHECK-NOT: and ; CHECK: stlxrb w0, w1, [x2] %extval = zext i8 %val to i64 - %res = call i32 @llvm.arm64.stlxr.p0i8(i64 %extval, i8* %addr) + %res = call i32 @llvm.aarch64.stlxr.p0i8(i64 %extval, i8* %addr) ret i32 %res } @@ -243,7 +243,7 @@ define i32 @test_store_release_i16(i32, i16 %val, i16* %addr) { ; CHECK-NOT: and ; CHECK: stlxrh w0, w1, [x2] %extval = zext i16 %val to i64 - %res = call i32 @llvm.arm64.stlxr.p0i16(i64 %extval, i16* %addr) + %res = call i32 @llvm.aarch64.stlxr.p0i16(i64 %extval, i16* %addr) ret i32 %res } @@ -253,18 +253,18 @@ define i32 @test_store_release_i32(i32, i32 %val, i32* %addr) { ; CHECK-NOT: and ; CHECK: stlxr w0, w1, [x2] %extval = zext i32 %val to i64 - %res = call i32 @llvm.arm64.stlxr.p0i32(i64 %extval, i32* %addr) + %res = call i32 @llvm.aarch64.stlxr.p0i32(i64 %extval, i32* %addr) ret i32 %res } define i32 @test_store_release_i64(i32, i64 %val, i64* %addr) { ; CHECK-LABEL: test_store_release_i64: ; CHECK: stlxr w0, x1, [x2] - %res = call i32 @llvm.arm64.stlxr.p0i64(i64 %val, i64* %addr) + %res = call i32 
@llvm.aarch64.stlxr.p0i64(i64 %val, i64* %addr) ret i32 %res } -declare i32 @llvm.arm64.stlxr.p0i8(i64, i8*) nounwind -declare i32 @llvm.arm64.stlxr.p0i16(i64, i16*) nounwind -declare i32 @llvm.arm64.stlxr.p0i32(i64, i32*) nounwind -declare i32 @llvm.arm64.stlxr.p0i64(i64, i64*) nounwind +declare i32 @llvm.aarch64.stlxr.p0i8(i64, i8*) nounwind +declare i32 @llvm.aarch64.stlxr.p0i16(i64, i16*) nounwind +declare i32 @llvm.aarch64.stlxr.p0i32(i64, i32*) nounwind +declare i32 @llvm.aarch64.stlxr.p0i64(i64, i64*) nounwind diff --git a/test/CodeGen/ARM64/leaf.ll b/test/CodeGen/AArch64/arm64-leaf.ll similarity index 100% rename from test/CodeGen/ARM64/leaf.ll rename to test/CodeGen/AArch64/arm64-leaf.ll diff --git a/test/CodeGen/ARM64/long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll similarity index 100% rename from test/CodeGen/ARM64/long-shift.ll rename to test/CodeGen/AArch64/arm64-long-shift.ll diff --git a/test/CodeGen/ARM64/memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll similarity index 100% rename from test/CodeGen/ARM64/memcpy-inline.ll rename to test/CodeGen/AArch64/arm64-memcpy-inline.ll diff --git a/test/CodeGen/ARM64/memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll similarity index 100% rename from test/CodeGen/ARM64/memset-inline.ll rename to test/CodeGen/AArch64/arm64-memset-inline.ll diff --git a/test/CodeGen/ARM64/memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll similarity index 100% rename from test/CodeGen/ARM64/memset-to-bzero.ll rename to test/CodeGen/AArch64/arm64-memset-to-bzero.ll diff --git a/test/CodeGen/ARM64/misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll similarity index 93% rename from test/CodeGen/ARM64/misched-basic-A53.ll rename to test/CodeGen/AArch64/arm64-misched-basic-A53.ll index b87a523a30be..f88bd6a4fe32 100644 --- a/test/CodeGen/ARM64/misched-basic-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll @@ -110,16 +110,15 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"= attributes #1 = { nounwind } -; Regression Test for Bug 19761 -; - [ARM64] Cortex-a53 schedule mode can't handle NEON post-increment load -; - http://llvm.org/bugs/show_bug.cgi?id=19761 +; Regression Test for PR19761 +; [ARM64] Cortex-a53 schedule mode can't handle NEON post-increment load ; ; Nothing explicit to check other than llc not crashing. define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) { - %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A) + %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A) %tmp = getelementptr i8* %A, i32 32 store i8* %tmp, i8** %ptr ret { <16 x i8>, <16 x i8> } %ld2 } -declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*) diff --git a/test/CodeGen/AArch64/misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll similarity index 55% rename from test/CodeGen/AArch64/misched-basic-A53.ll rename to test/CodeGen/AArch64/arm64-misched-basic-A57.ll index f80956e60fa2..238474a12c65 100644 --- a/test/CodeGen/AArch64/misched-basic-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll @@ -1,19 +1,23 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s -; arm64 now has a separate copy of this test. 
; -; The Cortex-A53 machine model will cause the MADD instruction to be scheduled -; much higher than the ADD instructions in order to hide latency. When not -; specifying a subtarget, the MADD will remain near the end of the block. +; The Cortext-A57 machine model will avoid scheduling load instructions in +; succession because loads on the A57 have a latency of 4 cycles and they all +; issue to the same pipeline. Instead, it will move other instructions between +; the loads to avoid unnecessary stalls. The generic machine model schedules 4 +; loads consecutively for this case and will cause stalls. ; +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s ; CHECK: ********** MI Scheduling ********** -; CHECK: main +; CHECK: main:BB#2 +; CHECK LDR +; CHECK Latency : 4 ; CHECK: *** Final schedule for BB#2 *** -; CHECK: SU(13) -; CHECK: MADDwwww -; CHECK: SU(4) -; CHECK: ADDwwi_lsl0_s -; CHECK: ********** INTERVALS ********** +; CHECK: LDR +; CHECK: LDR +; CHECK-NOT: LDR +; CHECK: {{.*}} +; CHECK: ********** MI Scheduling ********** + @main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4 @main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4 @@ -42,28 +46,49 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond - %3 = load i32* %i, align 4 - %idxprom = sext i32 %3 to i64 + %3 = load i32* %yy, align 4 + %4 = load i32* %i, align 4 + %idxprom = sext i32 %4 to i64 %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom - %4 = load i32* %arrayidx, align 4 - %add = add nsw i32 %4, 1 + %5 = load i32* %arrayidx, align 4 + %add = add nsw i32 %5, 1 store i32 %add, i32* %xx, align 4 - %5 = load i32* %xx, align 4 - %add1 = add nsw i32 %5, 12 - store i32 %add1, i32* %xx, align 4 %6 = load i32* %xx, align 4 - %add2 = add nsw i32 %6, 23 - store i32 %add2, i32* %xx, align 4 + %add1 = add nsw i32 %6, 12 + store i32 %add1, i32* %xx, align 4 %7 = load i32* %xx, align 4 - %add3 = add nsw i32 %7, 34 + %add2 = add nsw i32 %7, 23 + store i32 %add2, i32* %xx, align 4 + %8 = load i32* %xx, align 4 + %add3 = add nsw i32 %8, 34 store i32 %add3, i32* %xx, align 4 - %8 = load i32* %i, align 4 - %idxprom4 = sext i32 %8 to i64 + %9 = load i32* %i, align 4 + %idxprom4 = sext i32 %9 to i64 %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4 - %9 = load i32* %arrayidx5, align 4 - %10 = load i32* %yy, align 4 - %mul = mul nsw i32 %10, %9 - store i32 %mul, i32* %yy, align 4 + %10 = load i32* %arrayidx5, align 4 + + %add4 = add nsw i32 %9, %add + %add5 = add nsw i32 %10, %add1 + %add6 = add nsw i32 %add4, %add5 + + %add7 = add nsw i32 %9, %add3 + %add8 = add nsw i32 %10, %add4 + %add9 = add nsw i32 %add7, %add8 + + %add10 = add nsw i32 %9, %add6 + %add11 = add nsw i32 %10, %add7 + %add12 = add nsw i32 %add10, %add11 + + %add13 = add nsw i32 %9, %add9 + %add14 = add nsw i32 %10, %add10 + %add15 = add nsw i32 %add13, %add14 + + store i32 %add15, i32* %xx, align 4 + + %div = sdiv i32 %4, %5 + + store i32 %div, i32* %yy, align 4 + br label %for.inc for.inc: ; preds = %for.body @@ -75,37 +100,11 @@ for.inc: ; preds = %for.body for.end: ; preds = %for.cond %12 = load i32* %xx, align 4 %13 = load i32* %yy, align 4 - %add6 = add nsw i32 %12, %13 - ret i32 %add6 + %add67 = add nsw i32 %12, %13 + ret i32 %add67 } -; The Cortex-A53 
machine model will cause the FDIVvvv_42 to be raised to -; hide latency. Whereas normally there would only be a single FADDvvv_4s -; after it, this test checks to make sure there are more than one. -; -; CHECK: ********** MI Scheduling ********** -; CHECK: neon4xfloat:BB#0 -; CHECK: *** Final schedule for BB#0 *** -; CHECK: FDIVvvv_4S -; CHECK: FADDvvv_4S -; CHECK: FADDvvv_4S -; CHECK: ********** INTERVALS ********** -define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) { - %tmp1 = fadd <4 x float> %A, %B; - %tmp2 = fadd <4 x float> %A, %tmp1; - %tmp3 = fadd <4 x float> %A, %tmp2; - %tmp4 = fadd <4 x float> %A, %tmp3; - %tmp5 = fadd <4 x float> %A, %tmp4; - %tmp6 = fadd <4 x float> %A, %tmp5; - %tmp7 = fadd <4 x float> %A, %tmp6; - %tmp8 = fadd <4 x float> %A, %tmp7; - %tmp9 = fdiv <4 x float> %A, %B; - %tmp10 = fadd <4 x float> %tmp8, %tmp9; - - ret <4 x float> %tmp10 -} - ; Function Attrs: nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 diff --git a/test/CodeGen/ARM64/misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll similarity index 78% rename from test/CodeGen/ARM64/misched-forwarding-A53.ll rename to test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll index 97bfb5ca9d3f..07373ccedc5b 100644 --- a/test/CodeGen/ARM64/misched-forwarding-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll @@ -6,9 +6,10 @@ ; ; CHECK: ********** MI Scheduling ********** ; CHECK: shiftable -; CHECK: *** Final schedule for BB#0 *** -; CHECK: ADDXrr %vreg0, %vreg2 -; CHECK: ADDXrs %vreg0, %vreg2, 5 +; CHECK: SU(2): %vreg2 = SUBXri %vreg1, 20, 0 +; CHECK: Successors: +; CHECK-NEXT: val SU(4): Latency=1 Reg=%vreg2 +; CHECK-NEXT: val SU(3): Latency=2 Reg=%vreg2 ; CHECK: ********** INTERVALS ********** define i64 @shiftable(i64 %A, i64 %B) { %tmp0 = sub i64 %B, 20 diff --git a/test/CodeGen/ARM64/movi.ll b/test/CodeGen/AArch64/arm64-movi.ll similarity index 100% rename from test/CodeGen/ARM64/movi.ll rename to test/CodeGen/AArch64/arm64-movi.ll diff --git a/test/CodeGen/ARM64/mul.ll b/test/CodeGen/AArch64/arm64-mul.ll similarity index 100% rename from test/CodeGen/ARM64/mul.ll rename to test/CodeGen/AArch64/arm64-mul.ll diff --git a/test/CodeGen/ARM64/named-reg-alloc.ll b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll similarity index 100% rename from test/CodeGen/ARM64/named-reg-alloc.ll rename to test/CodeGen/AArch64/arm64-named-reg-alloc.ll diff --git a/test/CodeGen/ARM64/named-reg-notareg.ll b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll similarity index 100% rename from test/CodeGen/ARM64/named-reg-notareg.ll rename to test/CodeGen/AArch64/arm64-named-reg-notareg.ll diff --git a/test/CodeGen/ARM64/neg.ll b/test/CodeGen/AArch64/arm64-neg.ll similarity index 100% rename from test/CodeGen/ARM64/neg.ll rename to test/CodeGen/AArch64/arm64-neg.ll diff --git a/test/CodeGen/ARM64/aarch64-neon-2velem-high.ll b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll similarity index 82% rename from test/CodeGen/ARM64/aarch64-neon-2velem-high.ll rename to test/CodeGen/AArch64/arm64-neon-2velem-high.ll index 2013747713b9..58df094d1922 100644 --- a/test/CodeGen/ARM64/aarch64-neon-2velem-high.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll @@ -4,25 +4,25 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) -declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) -declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) -declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) -declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) { ; CHECK-LABEL: test_vmull_high_n_s16: @@ -34,7 +34,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 - %vmull15.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) ret <4 x i32> %vmull15.i.i } @@ -46,7 +46,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 - %vmull9.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) ret <2 x i64> %vmull9.i.i } @@ -60,7 +60,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 - %vmull15.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) ret <4 x i32> %vmull15.i.i } @@ -72,7 +72,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 - %vmull9.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) ret <2 x i64> %vmull9.i.i } @@ -86,7 +86,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 - %vqdmull15.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) ret <4 x i32> %vqdmull15.i.i } @@ -98,7 +98,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 - %vqdmull9.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) ret <2 x i64> %vqdmull9.i.i } @@ -112,7 +112,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) %add.i.i = add <4 x i32> %vmull2.i.i.i, %a ret <4 x i32> %add.i.i } @@ -125,7 +125,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) %add.i.i = add <2 x i64> %vmull2.i.i.i, %a ret <2 x i64> %add.i.i } @@ -140,7 +140,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) %add.i.i = add <4 x i32> %vmull2.i.i.i, %a ret <4 x i32> %add.i.i } @@ -153,7 +153,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) %add.i.i = add <2 x i64> %vmull2.i.i.i, %a ret <2 x i64> %add.i.i } @@ -167,8 +167,8 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i) + %vqdmlal15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmlal17.i.i = 
tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i) ret <4 x i32> %vqdmlal17.i.i } @@ -179,8 +179,8 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i) + %vqdmlal9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmlal11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i) ret <2 x i64> %vqdmlal11.i.i } @@ -193,7 +193,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i ret <4 x i32> %sub.i.i } @@ -205,7 +205,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i ret <2 x i64> %sub.i.i } @@ -219,7 +219,7 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i ret <4 x i32> %sub.i.i } @@ -231,7 +231,7 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i ret <2 x i64> %sub.i.i } @@ -245,8 +245,8 @@ entry: %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i) + %vqdmlsl15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmlsl17.i.i 
= tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i) ret <4 x i32> %vqdmlsl17.i.i } @@ -257,8 +257,8 @@ entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i) + %vqdmlsl9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmlsl11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i) ret <2 x i64> %vqdmlsl11.i.i } diff --git a/test/CodeGen/ARM64/aarch64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll similarity index 84% rename from test/CodeGen/ARM64/aarch64-neon-2velem.ll rename to test/CodeGen/AArch64/arm64-neon-2velem.ll index 4c6b72d55c34..869966caa3ae 100644 --- a/test/CodeGen/ARM64/aarch64-neon-2velem.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -1,46 +1,46 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) -declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) -declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) -declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) -declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) -declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) -declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) -declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) -declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) -declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) 
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) -declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmla_lane_s16: @@ -563,7 +563,7 @@ define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -574,7 +574,7 @@ define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -585,7 +585,7 @@ define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -596,7 +596,7 @@ define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -608,7 +608,7 @@ define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -620,7 +620,7 @@ define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> 
@llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -632,7 +632,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -644,7 +644,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -655,7 +655,7 @@ define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -666,7 +666,7 @@ define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -677,7 +677,7 @@ define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -688,7 +688,7 @@ define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -700,7 +700,7 @@ define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -712,7 +712,7 
@@ define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -724,7 +724,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -736,7 +736,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -747,7 +747,7 @@ define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -758,7 +758,7 @@ define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -769,7 +769,7 @@ define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -780,7 +780,7 @@ define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -792,7 +792,7 @@ define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call 
<4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -804,7 +804,7 @@ define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -816,7 +816,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -828,7 +828,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -839,7 +839,7 @@ define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -850,7 +850,7 @@ define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -861,7 +861,7 @@ define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -872,7 +872,7 @@ define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ 
-884,7 +884,7 @@ define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -896,7 +896,7 @@ define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -908,7 +908,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -920,7 +920,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -931,7 +931,7 @@ define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -941,7 +941,7 @@ define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -951,7 +951,7 @@ define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -961,7 +961,7 @@ define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -972,7 +972,7 @@ define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -983,7 +983,7 @@ define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -994,7 +994,7 @@ define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -1005,7 +1005,7 @@ define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -1015,7 +1015,7 @@ define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -1025,7 +1025,7 @@ define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -1035,7 +1035,7 @@ define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -1045,7 +1045,7 @@ define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 
x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -1056,7 +1056,7 @@ define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -1067,7 +1067,7 @@ define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -1078,7 +1078,7 @@ define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -1089,7 +1089,7 @@ define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -1099,8 +1099,8 @@ define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) ret <4 x i32> %vqdmlal4.i } @@ -1110,8 +1110,8 @@ define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) ret <2 x i64> %vqdmlal4.i } @@ -1122,8 +1122,8 @@ define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i1 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> 
%shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) ret <4 x i32> %vqdmlal4.i } @@ -1134,8 +1134,8 @@ define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i3 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) ret <2 x i64> %vqdmlal4.i } @@ -1145,8 +1145,8 @@ define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) ret <4 x i32> %vqdmlsl4.i } @@ -1156,8 +1156,8 @@ define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) ret <2 x i64> %vqdmlsl4.i } @@ -1168,8 +1168,8 @@ define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i1 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) ret <4 x i32> %vqdmlsl4.i } @@ -1180,8 +1180,8 @@ define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i3 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> 
%shuffle.i, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) ret <2 x i64> %vqdmlsl4.i } @@ -1191,7 +1191,7 @@ define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -1201,7 +1201,7 @@ define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -1211,7 +1211,7 @@ define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -1221,7 +1221,7 @@ define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -1232,7 +1232,7 @@ define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -1243,7 +1243,7 @@ define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -1254,7 +1254,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -1265,7 +1265,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x 
i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -1275,7 +1275,7 @@ define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i16> %vqdmulh2.i } @@ -1285,7 +1285,7 @@ define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %vqdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) ret <8 x i16> %vqdmulh2.i } @@ -1295,7 +1295,7 @@ define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i32> %vqdmulh2.i } @@ -1305,7 +1305,7 @@ define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %vqdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) ret <4 x i32> %vqdmulh2.i } @@ -1315,7 +1315,7 @@ define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqrdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i16> %vqrdmulh2.i } @@ -1325,7 +1325,7 @@ define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %vqrdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) ret <8 x i16> %vqrdmulh2.i } @@ -1335,7 +1335,7 @@ define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqrdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i32> %vqrdmulh2.i } @@ -1345,7 +1345,7 @@ define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %vqrdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) ret <4 x i32> 
%vqrdmulh2.i } @@ -1441,7 +1441,7 @@ define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> - %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) ret <2 x float> %vmulx2.i } @@ -1451,7 +1451,7 @@ define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> - %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) ret <4 x float> %vmulx2.i } @@ -1461,7 +1461,7 @@ define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) ret <2 x double> %vmulx2.i } @@ -1471,7 +1471,7 @@ define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> - %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) ret <2 x float> %vmulx2.i } @@ -1481,7 +1481,7 @@ define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> - %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) ret <4 x float> %vmulx2.i } @@ -1491,7 +1491,7 @@ define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> - %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) ret <2 x double> %vmulx2.i } @@ -1942,7 +1942,7 @@ define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -1953,7 +1953,7 @@ define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ 
-1964,7 +1964,7 @@ define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -1975,7 +1975,7 @@ define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -1987,7 +1987,7 @@ define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -1999,7 +1999,7 @@ define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -2011,7 +2011,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -2023,7 +2023,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -2034,7 +2034,7 @@ define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2045,7 +2045,7 @@ define <2 x i64> 
@test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2056,7 +2056,7 @@ define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2067,7 +2067,7 @@ define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2079,7 +2079,7 @@ define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2091,7 +2091,7 @@ define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2103,7 +2103,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2115,7 +2115,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2126,7 +2126,7 @@ define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x 
i16> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -2137,7 +2137,7 @@ define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -2148,7 +2148,7 @@ define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -2159,7 +2159,7 @@ define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -2171,7 +2171,7 @@ define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -2183,7 +2183,7 @@ define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -2195,7 +2195,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %add = add <4 x i32> %vmull2.i, %a ret <4 x i32> %add } @@ -2207,7 +2207,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, 
<4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %add = add <2 x i64> %vmull2.i, %a ret <2 x i64> %add } @@ -2218,7 +2218,7 @@ define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2229,7 +2229,7 @@ define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2240,7 +2240,7 @@ define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2251,7 +2251,7 @@ define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> % ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2263,7 +2263,7 @@ define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i1 entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2275,7 +2275,7 @@ define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i3 entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2287,7 +2287,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> 
%shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) %sub = sub <4 x i32> %a, %vmull2.i ret <4 x i32> %sub } @@ -2299,7 +2299,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) %sub = sub <2 x i64> %a, %vmull2.i ret <2 x i64> %sub } @@ -2310,7 +2310,7 @@ define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2320,7 +2320,7 @@ define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2330,7 +2330,7 @@ define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2340,7 +2340,7 @@ define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2351,7 +2351,7 @@ define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2362,7 +2362,7 @@ define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2373,7 +2373,7 @@ define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, 
<4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2384,7 +2384,7 @@ define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2394,7 +2394,7 @@ define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2404,7 +2404,7 @@ define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2414,7 +2414,7 @@ define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2424,7 +2424,7 @@ define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2435,7 +2435,7 @@ define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2446,7 +2446,7 @@ define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2457,7 +2457,7 @@ define <4 x i32> 
@test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vmull2.i } @@ -2468,7 +2468,7 @@ define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vmull2.i } @@ -2478,8 +2478,8 @@ define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) ret <4 x i32> %vqdmlal4.i } @@ -2489,8 +2489,8 @@ define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) ret <2 x i64> %vqdmlal4.i } @@ -2501,8 +2501,8 @@ define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) ret <4 x i32> %vqdmlal4.i } @@ -2513,8 +2513,8 @@ define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlal4.i = 
tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) ret <2 x i64> %vqdmlal4.i } @@ -2524,8 +2524,8 @@ define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) ret <4 x i32> %vqdmlsl4.i } @@ -2535,8 +2535,8 @@ define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) ret <2 x i64> %vqdmlsl4.i } @@ -2547,8 +2547,8 @@ define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) ret <4 x i32> %vqdmlsl4.i } @@ -2559,8 +2559,8 @@ define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) ret <2 x i64> %vqdmlsl4.i } @@ -2570,7 +2570,7 @@ define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -2580,7 +2580,7 @@ define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 
x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -2590,7 +2590,7 @@ define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -2600,7 +2600,7 @@ define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -2611,7 +2611,7 @@ define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -2622,7 +2622,7 @@ define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -2633,7 +2633,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ret <4 x i32> %vqdmull2.i } @@ -2644,7 +2644,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ret <2 x i64> %vqdmull2.i } @@ -2654,7 +2654,7 @@ define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i16> %vqdmulh2.i } @@ -2664,7 +2664,7 @@ define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x 
i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %vqdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) ret <8 x i16> %vqdmulh2.i } @@ -2674,7 +2674,7 @@ define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i32> %vqdmulh2.i } @@ -2684,7 +2684,7 @@ define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %vqdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) ret <4 x i32> %vqdmulh2.i } @@ -2694,7 +2694,7 @@ define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqrdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) ret <4 x i16> %vqrdmulh2.i } @@ -2704,7 +2704,7 @@ define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %vqrdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) ret <8 x i16> %vqrdmulh2.i } @@ -2714,7 +2714,7 @@ define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqrdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) ret <2 x i32> %vqrdmulh2.i } @@ -2724,7 +2724,7 @@ define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %vqrdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) ret <4 x i32> %vqrdmulh2.i } @@ -2797,7 +2797,7 @@ define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) ret <2 x float> %vmulx2.i } @@ -2807,7 +2807,7 @@ define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = 
shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer - %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) ret <4 x float> %vmulx2.i } @@ -2817,7 +2817,7 @@ define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) ret <2 x double> %vmulx2.i } @@ -2827,7 +2827,7 @@ define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) ret <2 x float> %vmulx2.i } @@ -2837,7 +2837,7 @@ define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer - %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) ret <4 x float> %vmulx2.i } @@ -2847,7 +2847,7 @@ define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) ret <2 x double> %vmulx2.i } diff --git a/test/CodeGen/ARM64/aarch64-neon-3vdiff.ll b/test/CodeGen/AArch64/arm64-neon-3vdiff.ll similarity index 85% rename from test/CodeGen/ARM64/aarch64-neon-3vdiff.ll rename to test/CodeGen/AArch64/arm64-neon-3vdiff.ll index a479844de8d6..cb9b36c4c183 100644 --- a/test/CodeGen/ARM64/aarch64-neon-3vdiff.ll +++ b/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -1,54 +1,54 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s -declare <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) +declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) -declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) -declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) -declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> 
@llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8>, <8 x i8>) +declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) -declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) +declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) +declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8>, <8 x i8>) +declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) -declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) -declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) -declare <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) +declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) -declare <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) -declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) +declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) +declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) -declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) -declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) +declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vaddl_s8: @@ -690,7 +690,7 @@ define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vraddhn_s16: ; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) ret <8 x i8> %vraddhn2.i } @@ -698,7 +698,7 @@ define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vraddhn_s32: ; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) ret <4 x i16> %vraddhn2.i } @@ -706,7 +706,7 @@ define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> 
%b) { ; CHECK-LABEL: test_vraddhn_s64: ; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) ret <2 x i32> %vraddhn2.i } @@ -714,7 +714,7 @@ define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vraddhn_u16: ; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) ret <8 x i8> %vraddhn2.i } @@ -722,7 +722,7 @@ define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vraddhn_u32: ; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) ret <4 x i16> %vraddhn2.i } @@ -730,7 +730,7 @@ define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vraddhn_u64: ; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) ret <2 x i32> %vraddhn2.i } @@ -738,7 +738,7 @@ define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) ; CHECK-LABEL: test_vraddhn_high_s16: ; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vraddhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -750,7 +750,7 @@ define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b ; CHECK-LABEL: test_vraddhn_high_s32: ; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vraddhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -762,7 +762,7 @@ define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b ; CHECK-LABEL: test_vraddhn_high_s64: ; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vraddhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -774,7 +774,7 @@ define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) ; CHECK-LABEL: test_vraddhn_high_u16: ; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vraddhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vraddhn2.i.i = tail call <8 x 
i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -786,7 +786,7 @@ define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b ; CHECK-LABEL: test_vraddhn_high_u32: ; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vraddhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -798,7 +798,7 @@ define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b ; CHECK-LABEL: test_vraddhn_high_u64: ; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vraddhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -954,7 +954,7 @@ define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vrsubhn_s16: ; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) ret <8 x i8> %vrsubhn2.i } @@ -962,7 +962,7 @@ define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vrsubhn_s32: ; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) ret <4 x i16> %vrsubhn2.i } @@ -970,7 +970,7 @@ define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vrsubhn_s64: ; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) ret <2 x i32> %vrsubhn2.i } @@ -978,7 +978,7 @@ define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vrsubhn_u16: ; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) ret <8 x i8> %vrsubhn2.i } @@ -986,7 +986,7 @@ define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vrsubhn_u32: ; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) ret <4 x i16> %vrsubhn2.i } @@ -994,7 +994,7 @@ define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vrsubhn_u64: ; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - 
%vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) ret <2 x i32> %vrsubhn2.i } @@ -1002,7 +1002,7 @@ define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) ; CHECK-LABEL: test_vrsubhn_high_s16: ; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -1014,7 +1014,7 @@ define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b ; CHECK-LABEL: test_vrsubhn_high_s32: ; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -1026,7 +1026,7 @@ define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b ; CHECK-LABEL: test_vrsubhn_high_s64: ; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -1038,7 +1038,7 @@ define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) ; CHECK-LABEL: test_vrsubhn_high_u16: ; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h entry: - %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -1050,7 +1050,7 @@ define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b ; CHECK-LABEL: test_vrsubhn_high_u32: ; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s entry: - %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -1062,7 +1062,7 @@ define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b ; CHECK-LABEL: test_vrsubhn_high_u64: ; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: - %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64> %shuffle.i.i = shufflevector <1 x i64> %0, 
<1 x i64> %1, <2 x i32> @@ -1074,7 +1074,7 @@ define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vabdl_s8: ; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vabd.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) + %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16> ret <8 x i16> %vmovl.i.i } @@ -1083,7 +1083,7 @@ define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vabdl_s16: ; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vabd2.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) + %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32> ret <4 x i32> %vmovl.i.i } @@ -1092,7 +1092,7 @@ define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vabdl_s32: ; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vabd2.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b) + %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b) %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64> ret <2 x i64> %vmovl.i.i } @@ -1101,7 +1101,7 @@ define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vabdl_u8: ; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vabd.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) + %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16> ret <8 x i16> %vmovl.i.i } @@ -1110,7 +1110,7 @@ define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vabdl_u16: ; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vabd2.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) + %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32> ret <4 x i32> %vmovl.i.i } @@ -1119,7 +1119,7 @@ define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vabdl_u32: ; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vabd2.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) + %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64> ret <2 x i64> %vmovl.i.i } @@ -1128,7 +1128,7 @@ define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK-LABEL: test_vabal_s8: ; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) + %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> %add.i = add <8 x i16> %vmovl.i.i.i, %a ret <8 x i16> %add.i @@ -1138,7 +1138,7 @@ define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vabal_s16: ; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c) + %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c) 
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> %add.i = add <4 x i32> %vmovl.i.i.i, %a ret <4 x i32> %add.i @@ -1148,7 +1148,7 @@ define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vabal_s32: ; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c) + %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c) %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> %add.i = add <2 x i64> %vmovl.i.i.i, %a ret <2 x i64> %add.i @@ -1158,7 +1158,7 @@ define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK-LABEL: test_vabal_u8: ; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) + %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> %add.i = add <8 x i16> %vmovl.i.i.i, %a ret <8 x i16> %add.i @@ -1168,7 +1168,7 @@ define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vabal_u16: ; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c) + %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c) %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> %add.i = add <4 x i32> %vmovl.i.i.i, %a ret <4 x i32> %add.i @@ -1178,7 +1178,7 @@ define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vabal_u32: ; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c) + %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c) %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> %add.i = add <2 x i64> %vmovl.i.i.i, %a ret <2 x i64> %add.i @@ -1190,7 +1190,7 @@ define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) { entry: %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> ret <8 x i16> %vmovl.i.i.i } @@ -1201,7 +1201,7 @@ define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) { entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> ret <4 x i32> %vmovl.i.i.i } @@ -1212,7 +1212,7 @@ define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) { entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> 
%shuffle.i3.i) + %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> ret <2 x i64> %vmovl.i.i.i } @@ -1223,7 +1223,7 @@ define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) { entry: %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> ret <8 x i16> %vmovl.i.i.i } @@ -1234,7 +1234,7 @@ define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) { entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> ret <4 x i32> %vmovl.i.i.i } @@ -1245,7 +1245,7 @@ define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) { entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> ret <2 x i64> %vmovl.i.i.i } @@ -1256,7 +1256,7 @@ define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { entry: %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16> %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a ret <8 x i16> %add.i.i @@ -1268,7 +1268,7 @@ define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32> %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a ret <4 x i32> %add.i.i @@ -1280,7 +1280,7 @@ define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> 
%shuffle.i3.i) %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64> %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a ret <2 x i64> %add.i.i @@ -1292,7 +1292,7 @@ define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { entry: %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16> %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a ret <8 x i16> %add.i.i @@ -1304,7 +1304,7 @@ define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32> %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a ret <4 x i32> %add.i.i @@ -1316,7 +1316,7 @@ define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64> %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a ret <2 x i64> %add.i.i @@ -1326,7 +1326,7 @@ define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vmull_s8: ; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) + %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) ret <8 x i16> %vmull.i } @@ -1334,7 +1334,7 @@ define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vmull_s16: ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b) ret <4 x i32> %vmull2.i } @@ -1342,7 +1342,7 @@ define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vmull_s32: ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b) ret <2 x i64> %vmull2.i } @@ -1350,7 +1350,7 @@ define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vmull_u8: ; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) + %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) ret <8 x i16> %vmull.i } @@ -1358,7 +1358,7 @@ define <4 
x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vmull_u16: ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b) + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b) ret <4 x i32> %vmull2.i } @@ -1366,7 +1366,7 @@ define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vmull_u32: ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b) + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b) ret <2 x i64> %vmull2.i } @@ -1376,7 +1376,7 @@ define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) { entry: %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) ret <8 x i16> %vmull.i.i } @@ -1386,7 +1386,7 @@ define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) { entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) ret <4 x i32> %vmull2.i.i } @@ -1396,7 +1396,7 @@ define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) { entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) ret <2 x i64> %vmull2.i.i } @@ -1406,7 +1406,7 @@ define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) { entry: %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) ret <8 x i16> %vmull.i.i } @@ -1416,7 +1416,7 @@ define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) { entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) ret <4 x i32> %vmull2.i.i } @@ -1426,7 +1426,7 @@ define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) { entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x 
i32> %shuffle.i3.i) + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) ret <2 x i64> %vmull2.i.i } @@ -1434,7 +1434,7 @@ define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK-LABEL: test_vmlal_s8: ; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) %add.i = add <8 x i16> %vmull.i.i, %a ret <8 x i16> %add.i } @@ -1443,7 +1443,7 @@ define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vmlal_s16: ; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) %add.i = add <4 x i32> %vmull2.i.i, %a ret <4 x i32> %add.i } @@ -1452,7 +1452,7 @@ define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vmlal_s32: ; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) %add.i = add <2 x i64> %vmull2.i.i, %a ret <2 x i64> %add.i } @@ -1461,7 +1461,7 @@ define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK-LABEL: test_vmlal_u8: ; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) %add.i = add <8 x i16> %vmull.i.i, %a ret <8 x i16> %add.i } @@ -1470,7 +1470,7 @@ define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vmlal_u16: ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) %add.i = add <4 x i32> %vmull2.i.i, %a ret <4 x i32> %add.i } @@ -1479,7 +1479,7 @@ define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vmlal_u32: ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) %add.i = add <2 x i64> %vmull2.i.i, %a ret <2 x i64> %add.i } @@ -1490,7 +1490,7 @@ define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { entry: %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %add.i.i = add <8 x i16> %vmull.i.i.i, %a ret <8 x i16> %add.i.i } @@ -1501,7 +1501,7 @@ define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 
x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %add.i.i = add <4 x i32> %vmull2.i.i.i, %a ret <4 x i32> %add.i.i } @@ -1512,7 +1512,7 @@ define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %add.i.i = add <2 x i64> %vmull2.i.i.i, %a ret <2 x i64> %add.i.i } @@ -1523,7 +1523,7 @@ define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { entry: %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %add.i.i = add <8 x i16> %vmull.i.i.i, %a ret <8 x i16> %add.i.i } @@ -1534,7 +1534,7 @@ define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %add.i.i = add <4 x i32> %vmull2.i.i.i, %a ret <4 x i32> %add.i.i } @@ -1545,7 +1545,7 @@ define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %add.i.i = add <2 x i64> %vmull2.i.i.i, %a ret <2 x i64> %add.i.i } @@ -1554,7 +1554,7 @@ define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK-LABEL: test_vmlsl_s8: ; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) %sub.i = sub <8 x i16> %a, %vmull.i.i ret <8 x i16> %sub.i } @@ -1563,7 +1563,7 @@ define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vmlsl_s16: ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) %sub.i = sub <4 x i32> %a, %vmull2.i.i ret <4 x i32> %sub.i } @@ -1572,7 +1572,7 @@ define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 
x i32> %c) { ; CHECK-LABEL: test_vmlsl_s32: ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) %sub.i = sub <2 x i64> %a, %vmull2.i.i ret <2 x i64> %sub.i } @@ -1581,7 +1581,7 @@ define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { ; CHECK-LABEL: test_vmlsl_u8: ; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) %sub.i = sub <8 x i16> %a, %vmull.i.i ret <8 x i16> %sub.i } @@ -1590,7 +1590,7 @@ define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vmlsl_u16: ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) %sub.i = sub <4 x i32> %a, %vmull2.i.i ret <4 x i32> %sub.i } @@ -1599,7 +1599,7 @@ define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vmlsl_u32: ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) %sub.i = sub <2 x i64> %a, %vmull2.i.i ret <2 x i64> %sub.i } @@ -1610,7 +1610,7 @@ define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { entry: %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i ret <8 x i16> %sub.i.i } @@ -1621,7 +1621,7 @@ define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i ret <4 x i32> %sub.i.i } @@ -1632,7 +1632,7 @@ define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i ret <2 x i64> %sub.i.i } @@ -1643,7 +1643,7 @@ define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { entry: %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %shuffle.i3.i = 
shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i ret <8 x i16> %sub.i.i } @@ -1654,7 +1654,7 @@ define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i ret <4 x i32> %sub.i.i } @@ -1665,7 +1665,7 @@ define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i ret <2 x i64> %sub.i.i } @@ -1674,7 +1674,7 @@ define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vqdmull_s16: ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) + %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) ret <4 x i32> %vqdmull2.i } @@ -1682,7 +1682,7 @@ define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vqdmull_s32: ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) + %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) ret <2 x i64> %vqdmull2.i } @@ -1690,8 +1690,8 @@ define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vqdmlal_s16: ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) ret <4 x i32> %vqdmlal4.i } @@ -1699,8 +1699,8 @@ define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vqdmlal_s32: ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) ret <2 x i64> %vqdmlal4.i } @@ -1708,8 
+1708,8 @@ define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: test_vqdmlsl_s16: ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h entry: - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) ret <4 x i32> %vqdmlsl4.i } @@ -1717,8 +1717,8 @@ define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: test_vqdmlsl_s32: ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) ret <2 x i64> %vqdmlsl4.i } @@ -1728,7 +1728,7 @@ define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) { entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vqdmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vqdmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) ret <4 x i32> %vqdmull2.i.i } @@ -1738,7 +1738,7 @@ define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) { entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vqdmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vqdmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) ret <2 x i64> %vqdmull2.i.i } @@ -1748,8 +1748,8 @@ define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i) + %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i) ret <4 x i32> %vqdmlal4.i.i } @@ -1759,8 +1759,8 @@ define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i) + %vqdmlal2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x 
i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vqdmlal4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i) ret <2 x i64> %vqdmlal4.i.i } @@ -1770,8 +1770,8 @@ define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i) + %vqdmlsl2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vqdmlsl4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i) ret <4 x i32> %vqdmlsl4.i.i } @@ -1781,8 +1781,8 @@ define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i) + %vqdmlsl2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vqdmlsl4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i) ret <2 x i64> %vqdmlsl4.i.i } @@ -1790,7 +1790,7 @@ define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vmull_p8: ; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b entry: - %vmull.i = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) + %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) ret <8 x i16> %vmull.i } @@ -1800,7 +1800,7 @@ define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) { entry: %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) ret <8 x i16> %vmull.i.i } @@ -1808,7 +1808,7 @@ define i128 @test_vmull_p64(i64 %a, i64 %b) #4 { ; CHECK-LABEL: test_vmull_p64 ; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d entry: - %vmull2.i = tail call <16 x i8> @llvm.arm64.neon.pmull64(i64 %a, i64 %b) + %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) %vmull3.i = bitcast <16 x i8> %vmull2.i to i128 ret i128 %vmull3.i } @@ -1819,11 +1819,11 @@ define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 { entry: %0 = extractelement <2 x i64> %a, i32 1 %1 = extractelement <2 x i64> %b, i32 1 - %vmull2.i.i = tail call <16 x i8> @llvm.arm64.neon.pmull64(i64 %0, i64 %1) #1 + %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %0, i64 %1) #1 %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128 ret i128 %vmull3.i.i } -declare <16 x i8> @llvm.arm64.neon.pmull64(i64, i64) #5 +declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5 diff --git a/test/CodeGen/ARM64/aarch64-neon-aba-abd.ll b/test/CodeGen/AArch64/arm64-neon-aba-abd.ll similarity 
index 58% rename from test/CodeGen/ARM64/aarch64-neon-aba-abd.ll rename to test/CodeGen/AArch64/arm64-neon-aba-abd.ll index 9d321b239763..6404ab728011 100644 --- a/test/CodeGen/ARM64/aarch64-neon-aba-abd.ll +++ b/test/CodeGen/AArch64/arm64-neon-aba-abd.ll @@ -1,18 +1,18 @@ ; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { ; CHECK: test_uabd_v8i8: - %abd = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) ; CHECK: uabd v0.8b, v0.8b, v1.8b ret <8 x i8> %abd } define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { ; CHECK: test_uaba_v8i8: - %abd = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) %aba = add <8 x i8> %lhs, %abd ; CHECK: uaba v0.8b, v0.8b, v1.8b ret <8 x i8> %aba @@ -20,32 +20,32 @@ define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { ; CHECK: test_sabd_v8i8: - %abd = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) ; CHECK: sabd v0.8b, v0.8b, v1.8b ret <8 x i8> %abd } define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { ; CHECK: test_saba_v8i8: - %abd = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) %aba = add <8 x i8> %lhs, %abd ; CHECK: saba v0.8b, v0.8b, v1.8b ret <8 x i8> %aba } -declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK: test_uabd_v16i8: - %abd = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) ; CHECK: uabd v0.16b, v0.16b, v1.16b ret <16 x i8> %abd } define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK: test_uaba_v16i8: - %abd = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) %aba = add <16 x i8> %lhs, %abd ; CHECK: uaba v0.16b, v0.16b, v1.16b ret <16 x i8> %aba @@ -53,32 +53,32 @@ define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK: test_sabd_v16i8: - %abd = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) ; CHECK: sabd v0.16b, v0.16b, v1.16b ret <16 x i8> %abd } define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK: test_saba_v16i8: - %abd = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %abd = call <16 x i8> 
@llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) %aba = add <16 x i8> %lhs, %abd ; CHECK: saba v0.16b, v0.16b, v1.16b ret <16 x i8> %aba } -declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK: test_uabd_v4i16: - %abd = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; CHECK: uabd v0.4h, v0.4h, v1.4h ret <4 x i16> %abd } define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK: test_uaba_v4i16: - %abd = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %aba = add <4 x i16> %lhs, %abd ; CHECK: uaba v0.4h, v0.4h, v1.4h ret <4 x i16> %aba @@ -86,32 +86,32 @@ define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK: test_sabd_v4i16: - %abd = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; CHECK: sabd v0.4h, v0.4h, v1.4h ret <4 x i16> %abd } define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK: test_saba_v4i16: - %abd = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %aba = add <4 x i16> %lhs, %abd ; CHECK: saba v0.4h, v0.4h, v1.4h ret <4 x i16> %aba } -declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK: test_uabd_v8i16: - %abd = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; CHECK: uabd v0.8h, v0.8h, v1.8h ret <8 x i16> %abd } define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK: test_uaba_v8i16: - %abd = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %aba = add <8 x i16> %lhs, %abd ; CHECK: uaba v0.8h, v0.8h, v1.8h ret <8 x i16> %aba @@ -119,32 +119,32 @@ define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK: test_sabd_v8i16: - %abd = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; CHECK: sabd v0.8h, v0.8h, v1.8h ret <8 x i16> %abd } define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK: test_saba_v8i16: - %abd = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %aba = add <8 x i16> %lhs, %abd ; CHECK: saba v0.8h, v0.8h, v1.8h ret <8 x 
i16> %aba } -declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: test_uabd_v2i32: - %abd = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; CHECK: uabd v0.2s, v0.2s, v1.2s ret <2 x i32> %abd } define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: test_uaba_v2i32: - %abd = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) %aba = add <2 x i32> %lhs, %abd ; CHECK: uaba v0.2s, v0.2s, v1.2s ret <2 x i32> %aba @@ -152,7 +152,7 @@ define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: test_sabd_v2i32: - %abd = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; CHECK: sabd v0.2s, v0.2s, v1.2s ret <2 x i32> %abd } @@ -161,7 +161,7 @@ define <2 x i32> @test_sabd_v2i32_const() { ; CHECK: test_sabd_v2i32_const: ; CHECK: movi d1, #0x00ffffffff0000 ; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s - %1 = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32( + %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32( <2 x i32> , <2 x i32> ) ret <2 x i32> %1 @@ -169,25 +169,25 @@ define <2 x i32> @test_sabd_v2i32_const() { define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: test_saba_v2i32: - %abd = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) %aba = add <2 x i32> %lhs, %abd ; CHECK: saba v0.2s, v0.2s, v1.2s ret <2 x i32> %aba } -declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK: test_uabd_v4i32: - %abd = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; CHECK: uabd v0.4s, v0.4s, v1.4s ret <4 x i32> %abd } define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK: test_uaba_v4i32: - %abd = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) %aba = add <4 x i32> %lhs, %abd ; CHECK: uaba v0.4s, v0.4s, v1.4s ret <4 x i32> %aba @@ -195,42 +195,42 @@ define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK: test_sabd_v4i32: - %abd = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; CHECK: sabd v0.4s, v0.4s, v1.4s ret <4 x i32> %abd } define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK: 
test_saba_v4i32: - %abd = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) %aba = add <4 x i32> %lhs, %abd ; CHECK: saba v0.4s, v0.4s, v1.4s ret <4 x i32> %aba } -declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) { ; CHECK: test_fabd_v2f32: - %abd = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs) + %abd = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs) ; CHECK: fabd v0.2s, v0.2s, v1.2s ret <2 x float> %abd } -declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK: test_fabd_v4f32: - %abd = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs) + %abd = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs) ; CHECK: fabd v0.4s, v0.4s, v1.4s ret <4 x float> %abd } -declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) { ; CHECK: test_fabd_v2f64: - %abd = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs) + %abd = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs) ; CHECK: fabd v0.2d, v0.2d, v1.2d ret <2 x double> %abd } diff --git a/test/CodeGen/ARM64/aarch64-neon-across.ll b/test/CodeGen/AArch64/arm64-neon-across.ll similarity index 56% rename from test/CodeGen/ARM64/aarch64-neon-across.ll rename to test/CodeGen/AArch64/arm64-neon-across.ll index 5a3538ba88ec..3a63673f1209 100644 --- a/test/CodeGen/ARM64/aarch64-neon-across.ll +++ b/test/CodeGen/AArch64/arm64-neon-across.ll @@ -1,88 +1,88 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s -declare float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>) -declare float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>) -declare float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>) -declare float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float>) +declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>) -declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>) -declare i32 @llvm.arm64.neon.uminv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) 
-declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) -declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>) -declare i32 @llvm.arm64.neon.umaxv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) -declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>) -declare i64 @llvm.arm64.neon.uaddlv.i64.v4i32(<4 x i32>) +declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.uaddlv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) -declare i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32>) +declare i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.saddlv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.saddlv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.uaddlv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.uaddlv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) -declare i32 @llvm.arm64.neon.saddlv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.saddlv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>) define i16 @test_vaddlv_s8(<8 x i8> %a) { ; CHECK: test_vaddlv_s8: ; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v8i8(<8 x i8> %a) + %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %saddlvv.i to i16 ret i16 
%0 } @@ -91,7 +91,7 @@ define i32 @test_vaddlv_s16(<4 x i16> %a) { ; CHECK: test_vaddlv_s16: ; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v4i16(<4 x i16> %a) + %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a) ret i32 %saddlvv.i } @@ -99,7 +99,7 @@ define i16 @test_vaddlv_u8(<8 x i8> %a) { ; CHECK: test_vaddlv_u8: ; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v8i8(<8 x i8> %a) + %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %uaddlvv.i to i16 ret i16 %0 } @@ -108,7 +108,7 @@ define i32 @test_vaddlv_u16(<4 x i16> %a) { ; CHECK: test_vaddlv_u16: ; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v4i16(<4 x i16> %a) + %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a) ret i32 %uaddlvv.i } @@ -116,7 +116,7 @@ define i16 @test_vaddlvq_s8(<16 x i8> %a) { ; CHECK: test_vaddlvq_s8: ; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v16i8(<16 x i8> %a) + %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %saddlvv.i to i16 ret i16 %0 } @@ -125,7 +125,7 @@ define i32 @test_vaddlvq_s16(<8 x i16> %a) { ; CHECK: test_vaddlvq_s16: ; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v8i16(<8 x i16> %a) + %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a) ret i32 %saddlvv.i } @@ -133,7 +133,7 @@ define i64 @test_vaddlvq_s32(<4 x i32> %a) { ; CHECK: test_vaddlvq_s32: ; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %saddlvv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %a) + %saddlvv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a) ret i64 %saddlvv.i } @@ -141,7 +141,7 @@ define i16 @test_vaddlvq_u8(<16 x i8> %a) { ; CHECK: test_vaddlvq_u8: ; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a) + %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %uaddlvv.i to i16 ret i16 %0 } @@ -150,7 +150,7 @@ define i32 @test_vaddlvq_u16(<8 x i16> %a) { ; CHECK: test_vaddlvq_u16: ; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v8i16(<8 x i16> %a) + %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a) ret i32 %uaddlvv.i } @@ -158,7 +158,7 @@ define i64 @test_vaddlvq_u32(<4 x i32> %a) { ; CHECK: test_vaddlvq_u32: ; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %uaddlvv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v4i32(<4 x i32> %a) + %uaddlvv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a) ret i64 %uaddlvv.i } @@ -166,7 +166,7 @@ define i8 @test_vmaxv_s8(<8 x i8> %a) { ; CHECK: test_vmaxv_s8: ; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a) + %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %smaxv.i to i8 ret i8 %0 } @@ -175,7 +175,7 @@ define i16 @test_vmaxv_s16(<4 x i16> %a) { ; CHECK: test_vmaxv_s16: ; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a) + %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a) %0 
= trunc i32 %smaxv.i to i16 ret i16 %0 } @@ -184,7 +184,7 @@ define i8 @test_vmaxv_u8(<8 x i8> %a) { ; CHECK: test_vmaxv_u8: ; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a) + %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %umaxv.i to i8 ret i8 %0 } @@ -193,7 +193,7 @@ define i16 @test_vmaxv_u16(<4 x i16> %a) { ; CHECK: test_vmaxv_u16: ; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a) + %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) %0 = trunc i32 %umaxv.i to i16 ret i16 %0 } @@ -202,7 +202,7 @@ define i8 @test_vmaxvq_s8(<16 x i8> %a) { ; CHECK: test_vmaxvq_s8: ; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a) + %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %smaxv.i to i8 ret i8 %0 } @@ -211,7 +211,7 @@ define i16 @test_vmaxvq_s16(<8 x i16> %a) { ; CHECK: test_vmaxvq_s16: ; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a) + %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a) %0 = trunc i32 %smaxv.i to i16 ret i16 %0 } @@ -220,7 +220,7 @@ define i32 @test_vmaxvq_s32(<4 x i32> %a) { ; CHECK: test_vmaxvq_s32: ; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a) + %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a) ret i32 %smaxv.i } @@ -228,7 +228,7 @@ define i8 @test_vmaxvq_u8(<16 x i8> %a) { ; CHECK: test_vmaxvq_u8: ; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a) + %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %umaxv.i to i8 ret i8 %0 } @@ -237,7 +237,7 @@ define i16 @test_vmaxvq_u16(<8 x i16> %a) { ; CHECK: test_vmaxvq_u16: ; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a) + %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) %0 = trunc i32 %umaxv.i to i16 ret i16 %0 } @@ -246,7 +246,7 @@ define i32 @test_vmaxvq_u32(<4 x i32> %a) { ; CHECK: test_vmaxvq_u32: ; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i32(<4 x i32> %a) + %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a) ret i32 %umaxv.i } @@ -254,7 +254,7 @@ define i8 @test_vminv_s8(<8 x i8> %a) { ; CHECK: test_vminv_s8: ; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a) + %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %sminv.i to i8 ret i8 %0 } @@ -263,7 +263,7 @@ define i16 @test_vminv_s16(<4 x i16> %a) { ; CHECK: test_vminv_s16: ; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a) + %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a) %0 = trunc i32 %sminv.i to i16 ret i16 %0 } @@ -272,7 +272,7 @@ define i8 @test_vminv_u8(<8 x i8> %a) { ; CHECK: test_vminv_u8: ; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a) + %uminv.i = tail call i32 
@llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %uminv.i to i8 ret i8 %0 } @@ -281,7 +281,7 @@ define i16 @test_vminv_u16(<4 x i16> %a) { ; CHECK: test_vminv_u16: ; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a) + %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) %0 = trunc i32 %uminv.i to i16 ret i16 %0 } @@ -290,7 +290,7 @@ define i8 @test_vminvq_s8(<16 x i8> %a) { ; CHECK: test_vminvq_s8: ; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a) + %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %sminv.i to i8 ret i8 %0 } @@ -299,7 +299,7 @@ define i16 @test_vminvq_s16(<8 x i16> %a) { ; CHECK: test_vminvq_s16: ; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a) + %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a) %0 = trunc i32 %sminv.i to i16 ret i16 %0 } @@ -308,7 +308,7 @@ define i32 @test_vminvq_s32(<4 x i32> %a) { ; CHECK: test_vminvq_s32: ; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a) + %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a) ret i32 %sminv.i } @@ -316,7 +316,7 @@ define i8 @test_vminvq_u8(<16 x i8> %a) { ; CHECK: test_vminvq_u8: ; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a) + %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %uminv.i to i8 ret i8 %0 } @@ -325,7 +325,7 @@ define i16 @test_vminvq_u16(<8 x i16> %a) { ; CHECK: test_vminvq_u16: ; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a) + %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) %0 = trunc i32 %uminv.i to i16 ret i16 %0 } @@ -334,7 +334,7 @@ define i32 @test_vminvq_u32(<4 x i32> %a) { ; CHECK: test_vminvq_u32: ; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i32(<4 x i32> %a) + %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a) ret i32 %uminv.i } @@ -342,7 +342,7 @@ define i8 @test_vaddv_s8(<8 x i8> %a) { ; CHECK: test_vaddv_s8: ; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -351,7 +351,7 @@ define i16 @test_vaddv_s16(<4 x i16> %a) { ; CHECK: test_vaddv_s16: ; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -360,7 +360,7 @@ define i8 @test_vaddv_u8(<8 x i8> %a) { ; CHECK: test_vaddv_u8: ; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -369,7 +369,7 @@ define i16 @test_vaddv_u16(<4 x i16> %a) { ; CHECK: test_vaddv_u16: ; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a) + 
%vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -378,7 +378,7 @@ define i8 @test_vaddvq_s8(<16 x i8> %a) { ; CHECK: test_vaddvq_s8: ; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -387,7 +387,7 @@ define i16 @test_vaddvq_s16(<8 x i16> %a) { ; CHECK: test_vaddvq_s16: ; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -396,7 +396,7 @@ define i32 @test_vaddvq_s32(<4 x i32> %a) { ; CHECK: test_vaddvq_s32: ; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a) ret i32 %vaddv.i } @@ -404,7 +404,7 @@ define i8 @test_vaddvq_u8(<16 x i8> %a) { ; CHECK: test_vaddvq_u8: ; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -413,7 +413,7 @@ define i16 @test_vaddvq_u16(<8 x i16> %a) { ; CHECK: test_vaddvq_u16: ; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -422,7 +422,7 @@ define i32 @test_vaddvq_u32(<4 x i32> %a) { ; CHECK: test_vaddvq_u32: ; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a) ret i32 %vaddv.i } @@ -430,7 +430,7 @@ define float @test_vmaxvq_f32(<4 x float> %a) { ; CHECK: test_vmaxvq_f32: ; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %0 = call float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float> %a) + %0 = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a) ret float %0 } @@ -438,7 +438,7 @@ define float @test_vminvq_f32(<4 x float> %a) { ; CHECK: test_vminvq_f32: ; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %0 = call float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float> %a) + %0 = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a) ret float %0 } @@ -446,7 +446,7 @@ define float @test_vmaxnmvq_f32(<4 x float> %a) { ; CHECK: test_vmaxnmvq_f32: ; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %0 = call float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float> %a) + %0 = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a) ret float %0 } @@ -454,7 +454,7 @@ define float @test_vminnmvq_f32(<4 x float> %a) { ; CHECK: test_vminnmvq_f32: ; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s entry: - %0 = call float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float> %a) + %0 = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a) ret float %0 } diff --git a/test/CodeGen/ARM64/aarch64-neon-add-pairwise.ll b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll similarity index 51% rename from test/CodeGen/ARM64/aarch64-neon-add-pairwise.ll rename to test/CodeGen/AArch64/arm64-neon-add-pairwise.ll index 9cd76ff1b7e9..d3dc1b8d010f 100644 --- 
a/test/CodeGen/ARM64/aarch64-neon-add-pairwise.ll +++ b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll @@ -1,91 +1,91 @@ ; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { ; Using registers other than v0, v1 are possible, but would be odd. ; CHECK: test_addp_v8i8: - %tmp1 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %tmp1 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) ; CHECK: addp v0.8b, v0.8b, v1.8b ret <8 x i8> %tmp1 } -declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK: test_addp_v16i8: - %tmp1 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %tmp1 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) ; CHECK: addp v0.16b, v0.16b, v1.16b ret <16 x i8> %tmp1 } -declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK: test_addp_v4i16: - %tmp1 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %tmp1 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; CHECK: addp v0.4h, v0.4h, v1.4h ret <4 x i16> %tmp1 } -declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK: test_addp_v8i16: - %tmp1 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %tmp1 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; CHECK: addp v0.8h, v0.8h, v1.8h ret <8 x i16> %tmp1 } -declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK: test_addp_v2i32: - %tmp1 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %tmp1 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; CHECK: addp v0.2s, v0.2s, v1.2s ret <2 x i32> %tmp1 } -declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK: test_addp_v4i32: - %tmp1 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %tmp1 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; CHECK: addp v0.4s, v0.4s, v1.4s ret <4 x i32> %tmp1 } -declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; CHECK: test_addp_v2i64: - %val = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) + %val = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ; CHECK: addp v0.2d, v0.2d, v1.2d ret <2 x i64> %val } -declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>) 
-declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>) +declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { ; CHECK: test_faddp_v2f32: - %val = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs) + %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs) ; CHECK: faddp v0.2s, v0.2s, v1.2s ret <2 x float> %val } define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK: test_faddp_v4f32: - %val = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs) + %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs) ; CHECK: faddp v0.4s, v0.4s, v1.4s ret <4 x float> %val } define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { ; CHECK: test_faddp_v2f64: - %val = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs) + %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs) ; CHECK: faddp v0.2d, v0.2d, v1.2d ret <2 x double> %val } @@ -93,8 +93,8 @@ define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { define i32 @test_vaddv.v2i32(<2 x i32> %a) { ; CHECK-LABEL: test_vaddv.v2i32 ; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %1 = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a) + %1 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a) ret i32 %1 } -declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>) +declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>) diff --git a/test/CodeGen/ARM64/aarch64-neon-add-sub.ll b/test/CodeGen/AArch64/arm64-neon-add-sub.ll similarity index 89% rename from test/CodeGen/ARM64/aarch64-neon-add-sub.ll rename to test/CodeGen/AArch64/arm64-neon-add-sub.ll index 241025eca339..fbde606538ca 100644 --- a/test/CodeGen/ARM64/aarch64-neon-add-sub.ll +++ b/test/CodeGen/AArch64/arm64-neon-add-sub.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -arm64-simd-scalar| FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -aarch64-simd-scalar| FileCheck %s define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) { ;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b @@ -182,35 +182,35 @@ define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) { define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) { ; CHECK-LABEL: test_vabd_f64 ; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b) + %1 = tail call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b) ret <1 x double> %1 } define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) { ; CHECK-LABEL: test_vmax_f64 ; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b) + %1 = tail call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b) ret <1 x double> %1 } define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) { ; CHECK-LABEL: 
test_vmin_f64 ; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b) + %1 = tail call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b) ret <1 x double> %1 } define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) { ; CHECK-LABEL: test_vmaxnm_f64 ; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b) + %1 = tail call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b) ret <1 x double> %1 } define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) { ; CHECK-LABEL: test_vminnm_f64 ; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b) + %1 = tail call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b) ret <1 x double> %1 } @@ -229,9 +229,9 @@ define <1 x double> @test_vneg_f64(<1 x double> %a) { } declare <1 x double> @llvm.fabs.v1f64(<1 x double>) -declare <1 x double> @llvm.arm64.neon.fminnm.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm64.neon.fmaxnm.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm64.neon.fmin.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm64.neon.fmax.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm64.neon.fabd.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double>, <1 x double>) declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>) diff --git a/test/CodeGen/ARM64/neon-compare-instructions.ll b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll similarity index 100% rename from test/CodeGen/ARM64/neon-compare-instructions.ll rename to test/CodeGen/AArch64/arm64-neon-compare-instructions.ll diff --git a/test/CodeGen/ARM64/aarch64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll similarity index 99% rename from test/CodeGen/ARM64/aarch64-neon-copy.ll rename to test/CodeGen/AArch64/arm64-neon-copy.ll index 9493cad33452..cfc2ebf0a2e9 100644 --- a/test/CodeGen/ARM64/aarch64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1030,7 +1030,7 @@ define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) { ; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s ; CHECK-NEXT: ret entry: - %0 = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %a) + %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) %1 = insertelement <1 x float> undef, float %0, i32 0 %2 = extractelement <1 x float> %1, i32 0 %vecinit1.i = insertelement <2 x float> undef, float %2, i32 0 @@ -1042,14 +1042,14 @@ define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) { ; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s ; CHECK-NEXT: ret entry: - %0 = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %a) + %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) %1 = insertelement <1 x float> undef, float %0, i32 0 %2 = extractelement <1 x float> %1, i32 0 %vecinit1.i = insertelement <4 x float> undef, float %2, i32 0 
ret <4 x float> %vecinit1.i } -declare float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float>) +declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>) define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) { ; CHECK-LABEL: test_concat_undef_v1i32: @@ -1060,14 +1060,14 @@ entry: ret <2 x i32> %vecinit1.i } -declare i32 @llvm.arm64.neon.sqabs.i32(i32) #4 +declare i32 @llvm.aarch64.neon.sqabs.i32(i32) #4 define <2 x i32> @test_concat_v1i32_undef(i32 %a) { ; CHECK-LABEL: test_concat_v1i32_undef: ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} ; CHECK-NEXT: ret entry: - %b = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %a) + %b = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) %vecinit.i432 = insertelement <2 x i32> undef, i32 %b, i32 0 ret <2 x i32> %vecinit.i432 } @@ -1088,9 +1088,9 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) { ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} ; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: - %c = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %a) + %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) %d = insertelement <2 x i32> undef, i32 %c, i32 0 - %e = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %b) + %e = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %b) %f = insertelement <2 x i32> undef, i32 %e, i32 0 %h = shufflevector <2 x i32> %d, <2 x i32> %f, <2 x i32> ret <2 x i32> %h diff --git a/test/CodeGen/ARM64/aarch64-neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll similarity index 56% rename from test/CodeGen/ARM64/aarch64-neon-copyPhysReg-tuple.ll rename to test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll index f24392bb8fcc..276ac13da40e 100644 --- a/test/CodeGen/ARM64/aarch64-neon-copyPhysReg-tuple.ll +++ b/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll @@ -7,9 +7,9 @@ define <4 x i32> @copyTuple.QPair(i32* %a, i32* %b) { ; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b ; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}] entry: - %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> , <4 x i32> , i64 1, i32* %a) + %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> , <4 x i32> , i64 1, i32* %a) %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0 - %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> , i64 1, i32* %b) + %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> , i64 1, i32* %b) %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0 ret <4 x i32> %vld1.fca.0.extract } @@ -21,9 +21,9 @@ define <4 x i32> @copyTuple.QTriple(i32* %a, i32* %b, <4 x i32> %c) { ; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b ; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}] entry: - %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> , <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a) + %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> , <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a) %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0 - %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> , <4 x i32> %c, i64 1, i32* %b) + %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> , <4 x 
i32> %c, i64 1, i32* %b) %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0 ret <4 x i32> %vld1.fca.0.extract } @@ -36,13 +36,13 @@ define <4 x i32> @copyTuple.QQuad(i32* %a, i32* %b, <4 x i32> %c) { ; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b ; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}] entry: - %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> , <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a) + %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> , <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a) %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0 - %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> , <4 x i32> %c, <4 x i32> %c, i64 1, i32* %b) + %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> , <4 x i32> %c, <4 x i32> %c, i64 1, i32* %b) %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0 ret <4 x i32> %vld1.fca.0.extract } -declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) diff --git a/test/CodeGen/ARM64/aarch64-neon-mul-div.ll b/test/CodeGen/AArch64/arm64-neon-mul-div.ll similarity index 92% rename from test/CodeGen/ARM64/aarch64-neon-mul-div.ll rename to test/CodeGen/AArch64/arm64-neon-mul-div.ll index f3a976631977..720f3eb6a4bf 100644 --- a/test/CodeGen/ARM64/aarch64-neon-mul-div.ll +++ b/test/CodeGen/AArch64/arm64-neon-mul-div.ll @@ -684,98 +684,98 @@ define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) { ret <2 x double> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.pmul.v8i8(<8 x i8>, <8 x i8>) -declare <16 x i8> @llvm.arm64.neon.pmul.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8>, <16 x i8>) define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) { ; CHECK-LABEL: poly_mulv8i8: - %prod = call <8 x i8> @llvm.arm64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %prod = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) ; CHECK: pmul v0.8b, v0.8b, v1.8b ret <8 x i8> %prod } define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK-LABEL: poly_mulv16i8: - %prod = call <16 x i8> @llvm.arm64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %prod = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) ; CHECK: pmul v0.16b, v0.16b, v1.16b ret <16 x i8> %prod } -declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> 
@llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_sqdmulh_v4i16: - %prod = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %prod = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; CHECK: sqdmulh v0.4h, v0.4h, v1.4h ret <4 x i16> %prod } define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_sqdmulh_v8i16: - %prod = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %prod = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; CHECK: sqdmulh v0.8h, v0.8h, v1.8h ret <8 x i16> %prod } define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_sqdmulh_v2i32: - %prod = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %prod = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; CHECK: sqdmulh v0.2s, v0.2s, v1.2s ret <2 x i32> %prod } define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_sqdmulh_v4i32: - %prod = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %prod = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; CHECK: sqdmulh v0.4s, v0.4s, v1.4s ret <4 x i32> %prod } -declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_sqrdmulh_v4i16: - %prod = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h ret <4 x i16> %prod } define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_sqrdmulh_v8i16: - %prod = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h ret <8 x i16> %prod } define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_sqrdmulh_v2i32: - %prod = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s ret <2 x i32> %prod } define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, 
<4 x i32> %rhs) { ; CHECK-LABEL: test_sqrdmulh_v4i32: - %prod = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s ret <4 x i32> %prod } -declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) +declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) { ; CHECK-LABEL: fmulx_v2f32: ; Using registers other than v0, v1 and v2 are possible, but would be odd. ; CHECK: fmulx v0.2s, v0.2s, v1.2s - %val = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) + %val = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) ret <2 x float> %val } @@ -783,7 +783,7 @@ define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-LABEL: fmulx_v4f32: ; Using registers other than v0, v1 and v2 are possible, but would be odd. ; CHECK: fmulx v0.4s, v0.4s, v1.4s - %val = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) + %val = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) ret <4 x float> %val } @@ -791,7 +791,7 @@ define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { ; CHECK-LABEL: fmulx_v2f64: ; Using registers other than v0, v1 and v2 are possible, but would be odd. 
; CHECK: fmulx v0.2d, v0.2d, v1.2d - %val = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) + %val = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) ret <2 x double> %val } diff --git a/test/CodeGen/ARM64/aarch64-neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll similarity index 85% rename from test/CodeGen/ARM64/aarch64-neon-scalar-by-elem-mul.ll rename to test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll index 18e6e0fe8b65..92ed23995098 100644 --- a/test/CodeGen/ARM64/aarch64-neon-scalar-by-elem-mul.ll +++ b/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll @@ -61,13 +61,13 @@ define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) { ret double %tmp2; } -declare float @llvm.arm64.neon.fmulx.f32(float, float) +declare float @llvm.aarch64.neon.fmulx.f32(float, float) define float @test_fmulx_lane_f32(float %a, <2 x float> %v) { ; CHECK-LABEL: test_fmulx_lane_f32 ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] %tmp1 = extractelement <2 x float> %v, i32 1 - %tmp2 = call float @llvm.arm64.neon.fmulx.f32(float %a, float %tmp1) + %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1) ret float %tmp2; } @@ -75,7 +75,7 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) { ; CHECK-LABEL: test_fmulx_laneq_f32 ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 - %tmp2 = call float @llvm.arm64.neon.fmulx.f32(float %a, float %tmp1) + %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1) ret float %tmp2; } @@ -83,17 +83,17 @@ define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) { ; CHECK-LABEL: test_fmulx_laneq_f32_swap ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] %tmp1 = extractelement <4 x float> %v, i32 3 - %tmp2 = call float @llvm.arm64.neon.fmulx.f32(float %tmp1, float %a) + %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %tmp1, float %a) ret float %tmp2; } -declare double @llvm.arm64.neon.fmulx.f64(double, double) +declare double @llvm.aarch64.neon.fmulx.f64(double, double) define double @test_fmulx_lane_f64(double %a, <1 x double> %v) { ; CHECK-LABEL: test_fmulx_lane_f64 ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0]|d[0-9]+}} %tmp1 = extractelement <1 x double> %v, i32 0 - %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %a, double %tmp1) + %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1) ret double %tmp2; } @@ -101,7 +101,7 @@ define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) { ; CHECK-LABEL: test_fmulx_laneq_f64_0 ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] %tmp1 = extractelement <2 x double> %v, i32 0 - %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %a, double %tmp1) + %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1) ret double %tmp2; } @@ -110,7 +110,7 @@ define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) { ; CHECK-LABEL: test_fmulx_laneq_f64_1 ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = extractelement <2 x double> %v, i32 1 - %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %a, double %tmp1) + %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1) ret double %tmp2; } @@ -118,7 +118,7 @@ define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) { ; CHECK-LABEL: test_fmulx_laneq_f64_1_swap ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] %tmp1 = 
extractelement <2 x double> %v, i32 1 - %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %tmp1, double %a) + %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %tmp1, double %a) ret double %tmp2; } diff --git a/test/CodeGen/ARM64/aarch64-neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll similarity index 100% rename from test/CodeGen/ARM64/aarch64-neon-select_cc.ll rename to test/CodeGen/AArch64/arm64-neon-select_cc.ll diff --git a/test/CodeGen/ARM64/aarch64-neon-simd-ldst-one.ll b/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll similarity index 100% rename from test/CodeGen/ARM64/aarch64-neon-simd-ldst-one.ll rename to test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll diff --git a/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll b/test/CodeGen/AArch64/arm64-neon-simd-shift.ll similarity index 81% rename from test/CodeGen/ARM64/aarch64-neon-simd-shift.ll rename to test/CodeGen/AArch64/arm64-neon-simd-shift.ll index 2fd2c1e35ce5..447fb6307f21 100644 --- a/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll +++ b/test/CodeGen/AArch64/arm64-neon-simd-shift.ll @@ -333,7 +333,7 @@ define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vqshrun_high_n_s16 ; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqshrun = tail call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3) + %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vqshrun to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -344,7 +344,7 @@ define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vqshrun_high_n_s32 ; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqshrun = tail call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9) + %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9) %1 = bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vqshrun to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -356,7 +356,7 @@ define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { ; CHECK: test_vqshrun_high_n_s64 ; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vqshrun = tail call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19) + %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vqshrun to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -366,7 +366,7 @@ define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vrshrn_high_n_s16 ; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vrshrn = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b, i32 3) + %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -377,7 +377,7 @@ define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vrshrn_high_n_s32 ; CHECK: rshrn2 {{v[0-9]+}}.8h, 
{{v[0-9]+}}.4s, #9 - %vrshrn = tail call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %b, i32 9) + %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %b, i32 9) %1 = bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -389,7 +389,7 @@ define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { ; CHECK: test_vrshrn_high_n_s64 ; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vrshrn = tail call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %b, i32 19) + %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -399,7 +399,7 @@ define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vqrshrun_high_n_s16 ; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqrshrun = tail call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3) + %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vqrshrun to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -410,7 +410,7 @@ define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vqrshrun_high_n_s32 ; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqrshrun = tail call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9) + %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9) %1 = bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vqrshrun to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -422,7 +422,7 @@ define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { ; CHECK: test_vqrshrun_high_n_s64 ; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vqrshrun = tail call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19) + %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vqrshrun to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -432,7 +432,7 @@ define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vqshrn_high_n_s16 ; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqshrn = tail call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3) + %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vqshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -443,7 +443,7 @@ define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vqshrn_high_n_s32 ; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqshrn = tail call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9) + %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9) %1 = 
bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vqshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -455,7 +455,7 @@ define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { ; CHECK: test_vqshrn_high_n_s64 ; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vqshrn = tail call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19) + %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vqshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -465,7 +465,7 @@ define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vqshrn_high_n_u16 ; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqshrn = tail call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3) + %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vqshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -476,7 +476,7 @@ define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vqshrn_high_n_u32 ; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqshrn = tail call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9) + %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9) %1 = bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vqshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -488,7 +488,7 @@ define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { ; CHECK: test_vqshrn_high_n_u64 ; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vqshrn = tail call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19) + %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vqshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -498,7 +498,7 @@ define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vqrshrn_high_n_s16 ; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqrshrn = tail call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3) + %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -509,7 +509,7 @@ define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vqrshrn_high_n_s32 ; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqrshrn = tail call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9) + %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9) %1 = bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -521,7 +521,7 @@ define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 
x i64> %b) { ; CHECK: test_vqrshrn_high_n_s64 ; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vqrshrn = tail call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19) + %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -531,7 +531,7 @@ define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { ; CHECK: test_vqrshrn_high_n_u16 ; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqrshrn = tail call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3) + %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3) %1 = bitcast <8 x i8> %a to <1 x i64> %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -542,7 +542,7 @@ define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { ; CHECK: test_vqrshrn_high_n_u32 ; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqrshrn = tail call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9) + %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9) %1 = bitcast <4 x i16> %a to <1 x i64> %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -554,7 +554,7 @@ define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { ; CHECK: test_vqrshrn_high_n_u64 ; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 %1 = bitcast <2 x i32> %a to <1 x i64> - %vqrshrn = tail call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19) + %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19) %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> @@ -563,101 +563,101 @@ define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { -declare <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32) -declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32) -declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) -declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32) -declare <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32) -declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32) -declare <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x 
i16>, i32) -declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32) -declare <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) -declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) -declare <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32) -declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32) -declare <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32) +declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32) -declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32) +declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32) -declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32) +declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32) -declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) +declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) -declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) +declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) -declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) +declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) -declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) +declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) -declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) +declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) -declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) +declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) -declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) +declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) -declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) +declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) -declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) +declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) -declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) +declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) -declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) +declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) -declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) +declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvt_n_s64_f64 ; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x i64> @llvm.arm64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x 
double> %a, i32 64) + %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64) ret <1 x i64> %1 } define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) { ; CHECK-LABEL: test_vcvt_n_u64_f64 ; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x i64> @llvm.arm64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64) + %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64) ret <1 x i64> %1 } define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) { ; CHECK-LABEL: test_vcvt_n_f64_s64 ; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x double> @llvm.arm64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64) + %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64) ret <1 x double> %1 } define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) { ; CHECK-LABEL: test_vcvt_n_f64_u64 ; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x double> @llvm.arm64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64) + %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64) ret <1 x double> %1 } -declare <1 x i64> @llvm.arm64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32) -declare <1 x i64> @llvm.arm64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32) -declare <1 x double> @llvm.arm64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32) -declare <1 x double> @llvm.arm64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32) +declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32) +declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32) +declare <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32) +declare <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32) diff --git a/test/CodeGen/ARM64/aarch64-neon-simd-vget.ll b/test/CodeGen/AArch64/arm64-neon-simd-vget.ll similarity index 100% rename from test/CodeGen/ARM64/aarch64-neon-simd-vget.ll rename to test/CodeGen/AArch64/arm64-neon-simd-vget.ll diff --git a/test/CodeGen/ARM64/neon-v1i1-setcc.ll b/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll similarity index 100% rename from test/CodeGen/ARM64/neon-v1i1-setcc.ll rename to test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll diff --git a/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll b/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll similarity index 77% rename from test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll rename to test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll index 9e69ac025f97..8262fe43a66c 100644 --- a/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll +++ b/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll @@ -10,7 +10,7 @@ define i32 @spill.DPairReg(i32* %arg1, i32 %arg2) { ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] entry: - %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %arg1) + %vld = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %arg1) %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %if.then, label %if.end @@ -30,7 +30,7 @@ define i16 @spill.DTripleReg(i16* %arg1, i32 %arg2) { ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] entry: - %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %arg1) + %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } 
@llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %arg1) %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %if.then, label %if.end @@ -50,7 +50,7 @@ define i16 @spill.DQuadReg(i16* %arg1, i32 %arg2) { ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] entry: - %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %arg1) + %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %arg1) %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %if.then, label %if.end @@ -70,7 +70,7 @@ define i32 @spill.QPairReg(i32* %arg1, i32 %arg2) { ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] entry: - %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %arg1) + %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %arg1) %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %if.then, label %if.end @@ -90,7 +90,7 @@ define float @spill.QTripleReg(float* %arg1, i32 %arg2) { ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] entry: - %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %arg1) + %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %arg1) %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %if.then, label %if.end @@ -110,7 +110,7 @@ define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) { ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] entry: - %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %arg1) + %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %arg1) %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %if.then, label %if.end @@ -124,12 +124,12 @@ if.end: ret i8 %res } -declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) -declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*) -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) +declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*) +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*) declare void @foo() @@ -139,7 +139,7 @@ declare void @foo() ; then we can delete it. 
; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo define <8 x i16> @test_2xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) { - tail call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr) + tail call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr) tail call void @foo() %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> %1 = bitcast <2 x i64> %sv to <8 x i16> @@ -150,7 +150,7 @@ define <8 x i16> @test_2xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) { ; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo define <8 x i16> @test_3xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) { - tail call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr) + tail call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr) tail call void @foo() %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> %1 = bitcast <2 x i64> %sv to <8 x i16> @@ -161,7 +161,7 @@ define <8 x i16> @test_3xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) { ; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo define <8 x i16> @test_4xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) { - tail call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr) + tail call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr) tail call void @foo() %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> %1 = bitcast <2 x i64> %sv to <8 x i16> @@ -170,6 +170,6 @@ define <8 x i16> @test_4xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) { ret <8 x i16> %3 } -declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) -declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) -declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) +declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) diff --git a/test/CodeGen/ARM64/patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll similarity index 97% rename from test/CodeGen/ARM64/patchpoint.ll rename to test/CodeGen/AArch64/arm64-patchpoint.ll index 9ef1d778a319..039cdfcc3858 100644 --- a/test/CodeGen/ARM64/patchpoint.ll +++ b/test/CodeGen/AArch64/arm64-patchpoint.ll @@ -161,3 +161,11 @@ define void @clobberScratch(i32* %p) { declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
+ +; CHECK-LABEL: test_i16: +; CHECK: ldrh [[BREG:w[0-9]+]], [sp] +; CHECK: add w0, w0, [[BREG]] +define webkit_jscc i16 @test_i16(i16 zeroext %a, i16 zeroext %b) { + %sum = add i16 %a, %b + ret i16 %sum +} diff --git a/test/CodeGen/ARM64/pic-local-symbol.ll b/test/CodeGen/AArch64/arm64-pic-local-symbol.ll similarity index 100% rename from test/CodeGen/ARM64/pic-local-symbol.ll rename to test/CodeGen/AArch64/arm64-pic-local-symbol.ll diff --git a/test/CodeGen/ARM64/platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll similarity index 100% rename from test/CodeGen/ARM64/platform-reg.ll rename to test/CodeGen/AArch64/arm64-platform-reg.ll diff --git a/test/CodeGen/ARM64/popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll similarity index 93% rename from test/CodeGen/ARM64/popcnt.ll rename to test/CodeGen/AArch64/arm64-popcnt.ll index 9bbba09c250f..2afade2ee750 100644 --- a/test/CodeGen/ARM64/popcnt.ll +++ b/test/CodeGen/AArch64/arm64-popcnt.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define i32 @cnt32_advsimd(i32 %x) nounwind readnone { %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) diff --git a/test/CodeGen/ARM64/prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll similarity index 100% rename from test/CodeGen/ARM64/prefetch.ll rename to test/CodeGen/AArch64/arm64-prefetch.ll diff --git a/test/CodeGen/ARM64/promote-const.ll b/test/CodeGen/AArch64/arm64-promote-const.ll similarity index 98% rename from test/CodeGen/ARM64/promote-const.ll rename to test/CodeGen/AArch64/arm64-promote-const.ll index 9e7a215f64c2..380ff55d6839 100644 --- a/test/CodeGen/ARM64/promote-const.ll +++ b/test/CodeGen/AArch64/arm64-promote-const.ll @@ -1,9 +1,9 @@ ; Disable machine cse to stress the different path of the algorithm. ; Otherwise, we always fall in the simple case, i.e., only one definition. -; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-stress-promote-const -mcpu=cyclone | FileCheck -check-prefix=PROMOTED %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-stress-promote-const -mcpu=cyclone | FileCheck -check-prefix=PROMOTED %s ; The REGULAR run just checks that the inputs passed to promote const expose ; the appropriate patterns. 
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-promote-const=false -mcpu=cyclone | FileCheck -check-prefix=REGULAR %s +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-promote-const=false -mcpu=cyclone | FileCheck -check-prefix=REGULAR %s %struct.uint8x16x4_t = type { [4 x <16 x i8>] } diff --git a/test/CodeGen/ARM64/redzone.ll b/test/CodeGen/AArch64/arm64-redzone.ll similarity index 88% rename from test/CodeGen/ARM64/redzone.ll rename to test/CodeGen/AArch64/arm64-redzone.ll index b89d7b1de3fe..9b0c384c4d9e 100644 --- a/test/CodeGen/ARM64/redzone.ll +++ b/test/CodeGen/AArch64/arm64-redzone.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s define i32 @foo(i32 %a, i32 %b) nounwind ssp { ; CHECK-LABEL: foo: diff --git a/test/CodeGen/ARM64/reg-copy-noneon.ll b/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll similarity index 100% rename from test/CodeGen/ARM64/reg-copy-noneon.ll rename to test/CodeGen/AArch64/arm64-reg-copy-noneon.ll diff --git a/test/CodeGen/AArch64/arm64-register-offset-addressing.ll b/test/CodeGen/AArch64/arm64-register-offset-addressing.ll new file mode 100644 index 000000000000..045712bea6ac --- /dev/null +++ b/test/CodeGen/AArch64/arm64-register-offset-addressing.ll @@ -0,0 +1,145 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s + +define i8 @test_64bit_add(i16* %a, i64 %b) { +; CHECK-LABEL: test_64bit_add: +; CHECK: lsl [[REG:x[0-9]+]], x1, #1 +; CHECK: ldrb w0, [x0, [[REG]]] +; CHECK: ret + %tmp1 = getelementptr inbounds i16* %a, i64 %b + %tmp2 = load i16* %tmp1 + %tmp3 = trunc i16 %tmp2 to i8 + ret i8 %tmp3 +} + +; These tests are trying to form SEXT and ZEXT operations that never leave i64 +; space, to make sure LLVM can adapt the offset register correctly. 
+define void @ldst_8bit(i8* %base, i64 %offset) minsize { +; CHECK-LABEL: ldst_8bit: + + %off32.sext.tmp = shl i64 %offset, 32 + %off32.sext = ashr i64 %off32.sext.tmp, 32 + %addr8_sxtw = getelementptr i8* %base, i64 %off32.sext + %val8_sxtw = load volatile i8* %addr8_sxtw + %val32_signed = sext i8 %val8_sxtw to i32 + store volatile i32 %val32_signed, i32* @var_32bit +; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + %addrint_uxtw = ptrtoint i8* %base to i64 + %offset_uxtw = and i64 %offset, 4294967295 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i8* + %val8_uxtw = load volatile i8* %addr_uxtw + %newval8 = add i8 %val8_uxtw, 1 + store volatile i8 %newval8, i8* @var_8bit +; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + ret void +} + + +define void @ldst_16bit(i16* %base, i64 %offset) minsize { +; CHECK-LABEL: ldst_16bit: + + %addrint_uxtw = ptrtoint i16* %base to i64 + %offset_uxtw = and i64 %offset, 4294967295 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i16* + %val8_uxtw = load volatile i16* %addr_uxtw + %newval8 = add i16 %val8_uxtw, 1 + store volatile i16 %newval8, i16* @var_16bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint i16* %base to i64 + %offset_sxtw.tmp = shl i64 %offset, 32 + %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to i16* + %val16_sxtw = load volatile i16* %addr_sxtw + %val64_signed = sext i16 %val16_sxtw to i64 + store volatile i64 %val64_signed, i64* @var_64bit +; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + + %base_uxtwN = ptrtoint i16* %base to i64 + %offset_uxtwN = and i64 %offset, 4294967295 + %offset2_uxtwN = shl i64 %offset_uxtwN, 1 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i16* + %val32 = load volatile i32* @var_32bit + %val16_trunc32 = trunc i32 %val32 to i16 + store volatile i16 %val16_trunc32, i16* %addr_uxtwN +; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #1] + ret void +} + +define void @ldst_32bit(i32* %base, i64 %offset) minsize { +; CHECK-LABEL: ldst_32bit: + + %addrint_uxtw = ptrtoint i32* %base to i64 + %offset_uxtw = and i64 %offset, 4294967295 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i32* + %val32_uxtw = load volatile i32* %addr_uxtw + %newval32 = add i32 %val32_uxtw, 1 + store volatile i32 %newval32, i32* @var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint i32* %base to i64 + %offset_sxtw.tmp = shl i64 %offset, 32 + %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to i32* + %val32_sxtw = load volatile i32* %addr_sxtw + %val64_signed = sext i32 %val32_sxtw to i64 + store volatile i64 %val64_signed, i64* @var_64bit +; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + + %base_uxtwN = ptrtoint i32* %base to i64 + %offset_uxtwN = and i64 %offset, 4294967295 + %offset2_uxtwN = shl i64 %offset_uxtwN, 2 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i32* + %val32 = load volatile i32* @var_32bit + store volatile i32 %val32, i32* %addr_uxtwN +; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2] + ret void +} + +define void 
@ldst_64bit(i64* %base, i64 %offset) minsize { +; CHECK-LABEL: ldst_64bit: + + %addrint_uxtw = ptrtoint i64* %base to i64 + %offset_uxtw = and i64 %offset, 4294967295 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i64* + %val64_uxtw = load volatile i64* %addr_uxtw + %newval8 = add i64 %val64_uxtw, 1 + store volatile i64 %newval8, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint i64* %base to i64 + %offset_sxtw.tmp = shl i64 %offset, 32 + %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to i64* + %val64_sxtw = load volatile i64* %addr_sxtw + store volatile i64 %val64_sxtw, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + + %base_uxtwN = ptrtoint i64* %base to i64 + %offset_uxtwN = and i64 %offset, 4294967295 + %offset2_uxtwN = shl i64 %offset_uxtwN, 3 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i64* + %val64 = load volatile i64* @var_64bit + store volatile i64 %val64, i64* %addr_uxtwN +; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3] + ret void +} + +@var_8bit = global i8 0 +@var_16bit = global i16 0 +@var_32bit = global i32 0 +@var_64bit = global i64 0 diff --git a/test/CodeGen/ARM64/register-pairing.ll b/test/CodeGen/AArch64/arm64-register-pairing.ll similarity index 100% rename from test/CodeGen/ARM64/register-pairing.ll rename to test/CodeGen/AArch64/arm64-register-pairing.ll diff --git a/test/CodeGen/ARM64/regress-f128csel-flags.ll b/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll similarity index 100% rename from test/CodeGen/ARM64/regress-f128csel-flags.ll rename to test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll diff --git a/test/CodeGen/ARM64/regress-interphase-shift.ll b/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll similarity index 100% rename from test/CodeGen/ARM64/regress-interphase-shift.ll rename to test/CodeGen/AArch64/arm64-regress-interphase-shift.ll diff --git a/test/CodeGen/ARM64/return-vector.ll b/test/CodeGen/AArch64/arm64-return-vector.ll similarity index 100% rename from test/CodeGen/ARM64/return-vector.ll rename to test/CodeGen/AArch64/arm64-return-vector.ll diff --git a/test/CodeGen/ARM64/returnaddr.ll b/test/CodeGen/AArch64/arm64-returnaddr.ll similarity index 100% rename from test/CodeGen/ARM64/returnaddr.ll rename to test/CodeGen/AArch64/arm64-returnaddr.ll diff --git a/test/CodeGen/ARM64/rev.ll b/test/CodeGen/AArch64/arm64-rev.ll similarity index 99% rename from test/CodeGen/ARM64/rev.ll rename to test/CodeGen/AArch64/arm64-rev.ll index 1da59e42f6b9..30d9f4f3e670 100644 --- a/test/CodeGen/ARM64/rev.ll +++ b/test/CodeGen/AArch64/arm64-rev.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define i32 @test_rev_w(i32 %a) nounwind { entry: diff --git a/test/CodeGen/ARM64/rounding.ll b/test/CodeGen/AArch64/arm64-rounding.ll similarity index 100% rename from test/CodeGen/ARM64/rounding.ll rename to test/CodeGen/AArch64/arm64-rounding.ll diff --git a/test/CodeGen/ARM64/scaled_iv.ll b/test/CodeGen/AArch64/arm64-scaled_iv.ll similarity index 100% rename from test/CodeGen/ARM64/scaled_iv.ll rename to test/CodeGen/AArch64/arm64-scaled_iv.ll diff --git a/test/CodeGen/ARM64/scvt.ll b/test/CodeGen/AArch64/arm64-scvt.ll similarity index 99% rename from 
test/CodeGen/ARM64/scvt.ll rename to test/CodeGen/AArch64/arm64-scvt.ll index b4d4add1e8a9..2e006cff159a 100644 --- a/test/CodeGen/ARM64/scvt.ll +++ b/test/CodeGen/AArch64/arm64-scvt.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s ; rdar://13082402 define float @t1(i32* nocapture %src) nounwind ssp { diff --git a/test/CodeGen/ARM64/shifted-sext.ll b/test/CodeGen/AArch64/arm64-shifted-sext.ll similarity index 100% rename from test/CodeGen/ARM64/shifted-sext.ll rename to test/CodeGen/AArch64/arm64-shifted-sext.ll diff --git a/test/CodeGen/AArch64/arm64-shrink-v1i64.ll b/test/CodeGen/AArch64/arm64-shrink-v1i64.ll new file mode 100644 index 000000000000..f31a5702761c --- /dev/null +++ b/test/CodeGen/AArch64/arm64-shrink-v1i64.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=arm64 < %s + +; The DAGCombiner tries to do the following shrink: +; Convert x+y to (VT)((SmallVT)x+(SmallVT)y) +; But currently it can't handle vector types and will trigger an assertion failure +; when it tries to generate an add that mixes a vector type and a scalar type. +; This test checks that such an assertion failure does not happen. +define <1 x i64> @dotest(<1 x i64> %in0) { +entry: + %0 = add <1 x i64> %in0, %in0 + %vshl_n = shl <1 x i64> %0, + %vsra_n = ashr <1 x i64> %vshl_n, + ret <1 x i64> %vsra_n +} diff --git a/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll new file mode 100644 index 000000000000..aed39e7ed8cb --- /dev/null +++ b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST + +define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp { +; CHECK: uaddlv.16b h0, v0 +; CHECK: rshrn.8b v0, v0, #4 +; CHECK: dup.16b v0, v0[0] +; CHECK: ret + +; CHECK-FAST: uaddlv.16b +; CHECK-FAST: rshrn.8b +; CHECK-FAST: dup.16b + %tmp = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind + %tmp1 = trunc i32 %tmp to i16 + %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0 + %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4) + %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp4 +} + +declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM64/simplest-elf.ll b/test/CodeGen/AArch64/arm64-simplest-elf.ll similarity index 100% rename from test/CodeGen/ARM64/simplest-elf.ll rename to test/CodeGen/AArch64/arm64-simplest-elf.ll diff --git a/test/CodeGen/ARM64/sincos.ll b/test/CodeGen/AArch64/arm64-sincos.ll similarity index 100% rename from test/CodeGen/ARM64/sincos.ll rename to test/CodeGen/AArch64/arm64-sincos.ll diff --git a/test/CodeGen/ARM64/sitofp-combine-chains.ll b/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll similarity index 100% rename from test/CodeGen/ARM64/sitofp-combine-chains.ll rename to test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll diff --git a/test/CodeGen/ARM64/sli-sri-opt.ll b/test/CodeGen/AArch64/arm64-sli-sri-opt.ll similarity index 95% rename from test/CodeGen/ARM64/sli-sri-opt.ll rename to test/CodeGen/AArch64/arm64-sli-sri-opt.ll index 725dcd51fd17..7fec53993bc1 100644 --- 
a/test/CodeGen/ARM64/sli-sri-opt.ll +++ b/test/CodeGen/AArch64/arm64-sli-sri-opt.ll @@ -1,4 +1,4 @@ -; RUN: llc -arm64-shift-insert-generation=true -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -aarch64-shift-insert-generation=true -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { ; CHECK-LABEL: testLeftGood: diff --git a/test/CodeGen/ARM64/smaxv.ll b/test/CodeGen/AArch64/arm64-smaxv.ll similarity index 61% rename from test/CodeGen/ARM64/smaxv.ll rename to test/CodeGen/AArch64/arm64-smaxv.ll index 4f6e01b31ea0..183e667643cc 100644 --- a/test/CodeGen/ARM64/smaxv.ll +++ b/test/CodeGen/AArch64/arm64-smaxv.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s define signext i8 @test_vmaxv_s8(<8 x i8> %a1) { ; CHECK: test_vmaxv_s8 @@ -6,7 +6,7 @@ define signext i8 @test_vmaxv_s8(<8 x i8> %a1) { ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a1) + %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a1) %0 = trunc i32 %vmaxv.i to i8 ret i8 %0 } @@ -17,7 +17,7 @@ define signext i16 @test_vmaxv_s16(<4 x i16> %a1) { ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a1) + %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a1) %0 = trunc i32 %vmaxv.i to i16 ret i16 %0 } @@ -29,7 +29,7 @@ define i32 @test_vmaxv_s32(<2 x i32> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32> %a1) + %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a1) ret i32 %vmaxv.i } @@ -39,7 +39,7 @@ define signext i8 @test_vmaxvq_s8(<16 x i8> %a1) { ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a1) + %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a1) %0 = trunc i32 %vmaxv.i to i8 ret i8 %0 } @@ -50,7 +50,7 @@ define signext i16 @test_vmaxvq_s16(<8 x i16> %a1) { ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a1) + %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a1) %0 = trunc i32 %vmaxv.i to i16 ret i16 %0 } @@ -61,14 +61,14 @@ define i32 @test_vmaxvq_s32(<4 x i32> %a1) { ; CHECK-NEXT: fmov w0, [[REGNUM]] ; CHECK-NEXT: ret entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a1) + %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a1) ret i32 %vmaxv.i } -declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32>) -declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x 
i8>) diff --git a/test/CodeGen/ARM64/sminv.ll b/test/CodeGen/AArch64/arm64-sminv.ll similarity index 61% rename from test/CodeGen/ARM64/sminv.ll rename to test/CodeGen/AArch64/arm64-sminv.ll index a246868d2f20..195c4e59dc41 100644 --- a/test/CodeGen/ARM64/sminv.ll +++ b/test/CodeGen/AArch64/arm64-sminv.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s define signext i8 @test_vminv_s8(<8 x i8> %a1) { ; CHECK: test_vminv_s8 @@ -6,7 +6,7 @@ define signext i8 @test_vminv_s8(<8 x i8> %a1) { ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a1) + %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a1) %0 = trunc i32 %vminv.i to i8 ret i8 %0 } @@ -17,7 +17,7 @@ define signext i16 @test_vminv_s16(<4 x i16> %a1) { ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a1) + %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a1) %0 = trunc i32 %vminv.i to i16 ret i16 %0 } @@ -29,7 +29,7 @@ define i32 @test_vminv_s32(<2 x i32> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32> %a1) + %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a1) ret i32 %vminv.i } @@ -39,7 +39,7 @@ define signext i8 @test_vminvq_s8(<16 x i8> %a1) { ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a1) + %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a1) %0 = trunc i32 %vminv.i to i8 ret i8 %0 } @@ -50,7 +50,7 @@ define signext i16 @test_vminvq_s16(<8 x i16> %a1) { ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a1) + %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a1) %0 = trunc i32 %vminv.i to i16 ret i16 %0 } @@ -61,14 +61,14 @@ define i32 @test_vminvq_s32(<4 x i32> %a1) { ; CHECK-NEXT: fmov w0, [[REGNUM]] ; CHECK-NEXT: ret entry: - %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a1) + %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a1) ret i32 %vminv.i } -declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32>) -declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32>) +declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>) diff --git a/test/CodeGen/ARM64/spill-lr.ll b/test/CodeGen/AArch64/arm64-spill-lr.ll similarity index 100% rename from test/CodeGen/ARM64/spill-lr.ll rename to test/CodeGen/AArch64/arm64-spill-lr.ll diff --git a/test/CodeGen/ARM64/spill.ll b/test/CodeGen/AArch64/arm64-spill.ll similarity index 88% rename from test/CodeGen/ARM64/spill.ll rename to test/CodeGen/AArch64/arm64-spill.ll index 5a5da9723cb6..47cdc2bd95e4 100644 
--- a/test/CodeGen/ARM64/spill.ll +++ b/test/CodeGen/AArch64/arm64-spill.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -arm64-neon-syntax=apple -verify-machineinstrs +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -verify-machineinstrs ; CHECK: fpr128 ; CHECK: ld1.2d diff --git a/test/CodeGen/ARM64/2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll similarity index 63% rename from test/CodeGen/ARM64/2014-04-28-sqshl-uqshl-i64Contant.ll rename to test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll index a4c9cd8f1062..3949b85fbd32 100644 --- a/test/CodeGen/ARM64/2014-04-28-sqshl-uqshl-i64Contant.ll +++ b/test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll @@ -4,16 +4,16 @@ define i64 @test_vqshld_s64_i(i64 %a) { ; CHECK-LABEL: test_vqshld_s64_i: ; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 - %1 = tail call i64 @llvm.arm64.neon.sqshl.i64(i64 %a, i64 36) + %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36) ret i64 %1 } define i64 @test_vqshld_u64_i(i64 %a) { ; CHECK-LABEL: test_vqshld_u64_i: ; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 - %1 = tail call i64 @llvm.arm64.neon.uqshl.i64(i64 %a, i64 36) + %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36) ret i64 %1 } -declare i64 @llvm.arm64.neon.uqshl.i64(i64, i64) -declare i64 @llvm.arm64.neon.sqshl.i64(i64, i64) +declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64) +declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64) diff --git a/test/CodeGen/ARM64/st1.ll b/test/CodeGen/AArch64/arm64-st1.ll similarity index 50% rename from test/CodeGen/ARM64/st1.ll rename to test/CodeGen/AArch64/arm64-st1.ll index b9aafc60e7ba..4370484478c0 100644 --- a/test/CodeGen/ARM64/st1.ll +++ b/test/CodeGen/AArch64/arm64-st1.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s define void @st1lane_16b(<16 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane_16b @@ -83,594 +83,594 @@ define void @st1lane_2s_float(<2 x float> %A, float* %D) { define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) { ; CHECK-LABEL: st2lane_16b ; CHECK: st2.b - call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D) + call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D) ret void } define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) { ; CHECK-LABEL: st2lane_8h ; CHECK: st2.h - call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D) + call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D) ret void } define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) { ; CHECK-LABEL: st2lane_4s ; CHECK: st2.s - call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D) + call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D) ret void } define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) { ; CHECK-LABEL: st2lane_2d ; CHECK: st2.d - call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D) + call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D) ret void } -declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone -declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone 
-declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone -declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone +declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone +declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone +declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone +declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) { ; CHECK-LABEL: st3lane_16b ; CHECK: st3.b - call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D) + call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D) ret void } define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) { ; CHECK-LABEL: st3lane_8h ; CHECK: st3.h - call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D) + call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D) ret void } define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) { ; CHECK-LABEL: st3lane_4s ; CHECK: st3.s - call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D) + call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D) ret void } define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) { ; CHECK-LABEL: st3lane_2d ; CHECK: st3.d - call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D) + call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D) ret void } -declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone -declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone -declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone -declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone +declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone +declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone +declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone +declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) { ; CHECK-LABEL: st4lane_16b ; CHECK: st4.b - call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E) + call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E) ret void } define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) { ; CHECK-LABEL: st4lane_8h ; CHECK: st4.h - call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E) + call void 
@llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E) ret void } define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) { ; CHECK-LABEL: st4lane_4s ; CHECK: st4.s - call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E) + call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E) ret void } define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) { ; CHECK-LABEL: st4lane_2d ; CHECK: st4.d - call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E) + call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E) ret void } -declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone -declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone -declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone -declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone +declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone +declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone +declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone +declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind { ; CHECK-LABEL: st2_8b ; CHECK st2.8b - call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P) + call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P) ret void } define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind { ; CHECK-LABEL: st3_8b ; CHECK st3.8b - call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) + call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) ret void } define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind { ; CHECK-LABEL: st4_8b ; CHECK st4.8b - call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) + call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) ret void } -declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind { ; CHECK-LABEL: st2_16b ; CHECK st2.16b - call void 
@llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P) + call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P) ret void } define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind { ; CHECK-LABEL: st3_16b ; CHECK st3.16b - call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) + call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) ret void } define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind { ; CHECK-LABEL: st4_16b ; CHECK st4.16b - call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) + call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) ret void } -declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind { ; CHECK-LABEL: st2_4h ; CHECK st2.4h - call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P) + call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P) ret void } define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind { ; CHECK-LABEL: st3_4h ; CHECK st3.4h - call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) + call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) ret void } define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind { ; CHECK-LABEL: st4_4h ; CHECK st4.4h - call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) + call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) ret void } -declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind { ; CHECK-LABEL: st2_8h ; CHECK st2.8h - call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P) + call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P) ret void } define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind { ; CHECK-LABEL: st3_8h ; CHECK st3.8h - call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, 
i16* %P) + call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) ret void } define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind { ; CHECK-LABEL: st4_8h ; CHECK st4.8h - call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) + call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) ret void } -declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind { ; CHECK-LABEL: st2_2s ; CHECK st2.2s - call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P) + call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P) ret void } define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind { ; CHECK-LABEL: st3_2s ; CHECK st3.2s - call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) + call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) ret void } define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind { ; CHECK-LABEL: st4_2s ; CHECK st4.2s - call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) + call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) ret void } -declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind { ; CHECK-LABEL: st2_4s ; CHECK st2.4s - call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P) + call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P) ret void } define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind { ; CHECK-LABEL: st3_4s ; CHECK st3.4s - call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) + call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) ret void } define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind { ; CHECK-LABEL: st4_4s ; CHECK st4.4s - call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> 
%D, i32* %P) + call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) ret void } -declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind { ; CHECK-LABEL: st2_1d ; CHECK st1.2d - call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P) + call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P) ret void } define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind { ; CHECK-LABEL: st3_1d ; CHECK st1.3d - call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) + call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) ret void } define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind { ; CHECK-LABEL: st4_1d ; CHECK st1.4d - call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) + call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) ret void } -declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind { ; CHECK-LABEL: st2_2d ; CHECK st2.2d - call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P) + call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P) ret void } define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind { ; CHECK-LABEL: st3_2d ; CHECK st2.3d - call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) + call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) ret void } define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind { ; CHECK-LABEL: st4_2d ; CHECK st2.4d - call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) + call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) ret void } -declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly -declare void 
@llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) { ; CHECK-LABEL: st1_x2_v8i8: ; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) + call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) ret void } define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) { ; CHECK-LABEL: st1_x2_v4i16: ; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) + call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) ret void } define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) { ; CHECK-LABEL: st1_x2_v2i32: ; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) + call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) ret void } define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) { ; CHECK-LABEL: st1_x2_v2f32: ; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr) + call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr) ret void } define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) { ; CHECK-LABEL: st1_x2_v1i64: ; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) + call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) ret void } define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) { ; CHECK-LABEL: st1_x2_v1f64: ; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr) + call void 
@llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr) ret void } -declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) { ; CHECK-LABEL: st1_x2_v16i8: ; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) + call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) ret void } define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) { ; CHECK-LABEL: st1_x2_v8i16: ; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) + call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) ret void } define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) { ; CHECK-LABEL: st1_x2_v4i32: ; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) + call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) ret void } define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) { ; CHECK-LABEL: st1_x2_v4f32: ; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr) + call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr) ret void } define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) { ; CHECK-LABEL: st1_x2_v2i64: ; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) + call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) ret void } define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) { ; CHECK-LABEL: st1_x2_v2f64: ; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr) + call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr) ret void } -declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly -declare void 
@llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) { ; CHECK-LABEL: st1_x3_v8i8: ; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) + call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) ret void } define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) { ; CHECK-LABEL: st1_x3_v4i16: ; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) + call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) ret void } define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) { ; CHECK-LABEL: st1_x3_v2i32: ; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) + call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) ret void } define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) { ; CHECK-LABEL: st1_x3_v2f32: ; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) + call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) ret void } define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) { ; CHECK-LABEL: st1_x3_v1i64: ; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) + call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) ret void } define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) { ; CHECK-LABEL: st1_x3_v1f64: ; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) + call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) ret void } -declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, 
i8*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) { ; CHECK-LABEL: st1_x3_v16i8: ; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) + call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) ret void } define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) { ; CHECK-LABEL: st1_x3_v8i16: ; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) + call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) ret void } define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) { ; CHECK-LABEL: st1_x3_v4i32: ; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) + call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) ret void } define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) { ; CHECK-LABEL: st1_x3_v4f32: ; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) + call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) ret void } define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) { ; CHECK-LABEL: st1_x3_v2i64: ; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) + call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) ret void } define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) { ; CHECK-LABEL: st1_x3_v2f64: ; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) + call void 
@llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) ret void } -declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) { ; CHECK-LABEL: st1_x4_v8i8: ; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) + call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) ret void } define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) { ; CHECK-LABEL: st1_x4_v4i16: ; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) + call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) ret void } define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) { ; CHECK-LABEL: st1_x4_v2i32: ; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) + call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) ret void } define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) { ; CHECK-LABEL: st1_x4_v2f32: ; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) + call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) ret void } define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) { ; CHECK-LABEL: st1_x4_v1i64: ; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, 
[x0] - call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) + call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) ret void } define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) { ; CHECK-LABEL: st1_x4_v1f64: ; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) + call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) ret void } -declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly -declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly +declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) { ; CHECK-LABEL: st1_x4_v16i8: ; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) + call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) ret void } define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) { ; CHECK-LABEL: st1_x4_v8i16: ; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) + call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) ret void } define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) { ; CHECK-LABEL: st1_x4_v4i32: ; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) + call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) ret void } define void 
@st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) { ; CHECK-LABEL: st1_x4_v4f32: ; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) + call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) ret void } define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) { ; CHECK-LABEL: st1_x4_v2i64: ; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) + call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) ret void } define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) { ; CHECK-LABEL: st1_x4_v2f64: ; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0] - call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) + call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) ret void } diff --git a/test/CodeGen/ARM64/stack-no-frame.ll b/test/CodeGen/AArch64/arm64-stack-no-frame.ll similarity index 100% rename from test/CodeGen/ARM64/stack-no-frame.ll rename to test/CodeGen/AArch64/arm64-stack-no-frame.ll diff --git a/test/CodeGen/ARM64/stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll similarity index 100% rename from test/CodeGen/ARM64/stackmap.ll rename to test/CodeGen/AArch64/arm64-stackmap.ll diff --git a/test/CodeGen/ARM64/stackpointer.ll b/test/CodeGen/AArch64/arm64-stackpointer.ll similarity index 100% rename from test/CodeGen/ARM64/stackpointer.ll rename to test/CodeGen/AArch64/arm64-stackpointer.ll diff --git a/test/CodeGen/ARM64/stacksave.ll b/test/CodeGen/AArch64/arm64-stacksave.ll similarity index 100% rename from test/CodeGen/ARM64/stacksave.ll rename to test/CodeGen/AArch64/arm64-stacksave.ll diff --git a/test/CodeGen/ARM64/stp.ll b/test/CodeGen/AArch64/arm64-stp.ll similarity index 94% rename from test/CodeGen/ARM64/stp.ll rename to test/CodeGen/AArch64/arm64-stp.ll index 3a58396e61b4..40bdf22c995c 100644 --- a/test/CodeGen/ARM64/stp.ll +++ b/test/CodeGen/AArch64/arm64-stp.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=arm64 -arm64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s -; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\ +; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\ ; RUN: -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s ; CHECK: stp_int diff --git a/test/CodeGen/ARM64/strict-align.ll b/test/CodeGen/AArch64/arm64-strict-align.ll similarity index 75% rename from test/CodeGen/ARM64/strict-align.ll rename to test/CodeGen/AArch64/arm64-strict-align.ll index 48a1528b5cd1..5d137043a691 100644 --- a/test/CodeGen/ARM64/strict-align.ll +++ b/test/CodeGen/AArch64/arm64-strict-align.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-no-strict-align | FileCheck %s -; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-strict-align | 
FileCheck %s --check-prefix=CHECK-STRICT +; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT define i32 @f0(i32* nocapture %p) nounwind { ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2] diff --git a/test/CodeGen/ARM64/stur.ll b/test/CodeGen/AArch64/arm64-stur.ll similarity index 96% rename from test/CodeGen/ARM64/stur.ll rename to test/CodeGen/AArch64/arm64-stur.ll index dc67b60000d4..a2e684dc9528 100644 --- a/test/CodeGen/ARM64/stur.ll +++ b/test/CodeGen/AArch64/arm64-stur.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -mcpu=cyclone | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s %struct.X = type <{ i32, i64, i64 }> define void @foo1(i32* %p, i64 %val) nounwind { diff --git a/test/CodeGen/ARM64/subsections.ll b/test/CodeGen/AArch64/arm64-subsections.ll similarity index 100% rename from test/CodeGen/ARM64/subsections.ll rename to test/CodeGen/AArch64/arm64-subsections.ll diff --git a/test/CodeGen/ARM64/subvector-extend.ll b/test/CodeGen/AArch64/arm64-subvector-extend.ll similarity index 97% rename from test/CodeGen/ARM64/subvector-extend.ll rename to test/CodeGen/AArch64/arm64-subvector-extend.ll index ad2f06ce7ba8..d5a178a9e656 100644 --- a/test/CodeGen/ARM64/subvector-extend.ll +++ b/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s ; Test efficient codegen of vector extends up from legal type to 128 bit ; and 256 bit vector types. diff --git a/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll similarity index 100% rename from test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll rename to test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll diff --git a/test/CodeGen/AArch64/arm64-tbl.ll b/test/CodeGen/AArch64/arm64-tbl.ll new file mode 100644 index 000000000000..b1ce15a1e19a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-tbl.ll @@ -0,0 +1,132 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind { +; CHECK: tbl1_8b +; CHECK: tbl.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind { +; CHECK: tbl1_16b +; CHECK: tbl.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B) + ret <16 x i8> %tmp3 +} + +define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) { +; CHECK: tbl2_8b +; CHECK: tbl.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK: tbl2_16b +; CHECK: tbl.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) + ret <16 x i8> %tmp3 +} + +define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { +; CHECK: tbl3_8b +; CHECK: tbl.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { +; CHECK: tbl3_16b +; CHECK: 
tbl.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) + ret <16 x i8> %tmp3 +} + +define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { +; CHECK: tbl4_8b +; CHECK: tbl.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { +; CHECK: tbl4_16b +; CHECK: tbl.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) + ret <16 x i8> %tmp3 +} + +declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind { +; CHECK: tbx1_8b +; CHECK: tbx.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind { +; CHECK: tbx1_16b +; CHECK: tbx.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) + ret <16 x i8> %tmp3 +} + +define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { +; CHECK: tbx2_8b +; CHECK: tbx.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { +; CHECK: tbx2_16b +; CHECK: tbx.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) + ret <16 x i8> %tmp3 +} + +define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { +; CHECK: tbx3_8b +; CHECK: tbx.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { +; CHECK: tbx3_16b +; CHECK: tbx.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) + ret <16 x i8> %tmp3 +} + +define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) { +; CHECK: tbx4_8b +; CHECK: tbx.8b + %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) { +; CHECK: tbx4_16b +; CHECK: 
tbx.16b + %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) + ret <16 x i8> %tmp3 +} + +declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + diff --git a/test/CodeGen/ARM64/this-return.ll b/test/CodeGen/AArch64/arm64-this-return.ll similarity index 100% rename from test/CodeGen/ARM64/this-return.ll rename to test/CodeGen/AArch64/arm64-this-return.ll diff --git a/test/CodeGen/ARM64/tls-darwin.ll b/test/CodeGen/AArch64/arm64-tls-darwin.ll similarity index 100% rename from test/CodeGen/ARM64/tls-darwin.ll rename to test/CodeGen/AArch64/arm64-tls-darwin.ll diff --git a/test/CodeGen/ARM64/tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll similarity index 100% rename from test/CodeGen/ARM64/tls-dynamic-together.ll rename to test/CodeGen/AArch64/arm64-tls-dynamic-together.ll diff --git a/test/CodeGen/ARM64/tls-dynamics.ll b/test/CodeGen/AArch64/arm64-tls-dynamics.ll similarity index 100% rename from test/CodeGen/ARM64/tls-dynamics.ll rename to test/CodeGen/AArch64/arm64-tls-dynamics.ll diff --git a/test/CodeGen/ARM64/tls-execs.ll b/test/CodeGen/AArch64/arm64-tls-execs.ll similarity index 100% rename from test/CodeGen/ARM64/tls-execs.ll rename to test/CodeGen/AArch64/arm64-tls-execs.ll diff --git a/test/CodeGen/ARM64/trap.ll b/test/CodeGen/AArch64/arm64-trap.ll similarity index 100% rename from test/CodeGen/ARM64/trap.ll rename to test/CodeGen/AArch64/arm64-trap.ll diff --git a/test/CodeGen/ARM64/trn.ll b/test/CodeGen/AArch64/arm64-trn.ll similarity index 98% rename from test/CodeGen/ARM64/trn.ll rename to test/CodeGen/AArch64/arm64-trn.ll index f46798490f69..2db7a14e7549 100644 --- a/test/CodeGen/ARM64/trn.ll +++ b/test/CodeGen/AArch64/arm64-trn.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: vtrni8: diff --git a/test/CodeGen/ARM64/trunc-store.ll b/test/CodeGen/AArch64/arm64-trunc-store.ll similarity index 100% rename from test/CodeGen/ARM64/trunc-store.ll rename to test/CodeGen/AArch64/arm64-trunc-store.ll diff --git a/test/CodeGen/ARM64/umaxv.ll b/test/CodeGen/AArch64/arm64-umaxv.ll similarity index 75% rename from test/CodeGen/ARM64/umaxv.ll rename to test/CodeGen/AArch64/arm64-umaxv.ll index 15277d32f030..d523f317d087 100644 --- a/test/CodeGen/ARM64/umaxv.ll +++ b/test/CodeGen/AArch64/arm64-umaxv.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | 
FileCheck %s define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp { ; CHECK-LABEL: vmax_u8x8: @@ -7,7 +7,7 @@ define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind %tmp = trunc i32 %vmaxv.i to i8 %tobool = icmp eq i8 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -30,7 +30,7 @@ define i32 @vmax_u4x16(<4 x i16> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind %tmp = trunc i32 %vmaxv.i to i16 %tobool = icmp eq i16 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -51,7 +51,7 @@ define i32 @vmax_u8x16(<8 x i16> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind %tmp = trunc i32 %vmaxv.i to i16 %tobool = icmp eq i16 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -72,7 +72,7 @@ define i32 @vmax_u16x8(<16 x i8> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind %tmp = trunc i32 %vmaxv.i to i8 %tobool = icmp eq i8 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -86,7 +86,7 @@ return: ret i32 %retval.0 } -declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone -declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone -declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone -declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone +declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone +declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone +declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone +declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM64/uminv.ll b/test/CodeGen/AArch64/arm64-uminv.ll similarity index 75% rename from test/CodeGen/ARM64/uminv.ll rename to test/CodeGen/AArch64/arm64-uminv.ll index 440522f1693e..3bade4b28b8f 100644 --- a/test/CodeGen/ARM64/uminv.ll +++ b/test/CodeGen/AArch64/arm64-uminv.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp { ; CHECK-LABEL: vmin_u8x8: @@ -7,7 +7,7 @@ define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind %tmp = trunc i32 %vminv.i to i8 %tobool = icmp eq i8 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -30,7 +30,7 @@ define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind %tmp = trunc i32 %vminv.i to i16 %tobool = icmp eq 
i16 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -51,7 +51,7 @@ define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind %tmp = trunc i32 %vminv.i to i16 %tobool = icmp eq i16 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -72,7 +72,7 @@ define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp { ; CHECK-NOT: and ; CHECK: cbz [[REG2]], entry: - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind %tmp = trunc i32 %vminv.i to i8 %tobool = icmp eq i8 %tmp, 0 br i1 %tobool, label %return, label %if.then @@ -86,7 +86,7 @@ return: ret i32 %retval.0 } -declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone -declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone -declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone -declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone +declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone +declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone +declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone +declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM64/umov.ll b/test/CodeGen/AArch64/arm64-umov.ll similarity index 90% rename from test/CodeGen/ARM64/umov.ll rename to test/CodeGen/AArch64/arm64-umov.ll index 19fd91b6c3d2..a1ef9908646a 100644 --- a/test/CodeGen/ARM64/umov.ll +++ b/test/CodeGen/AArch64/arm64-umov.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define zeroext i8 @f1(<16 x i8> %a) { ; CHECK-LABEL: f1: diff --git a/test/CodeGen/ARM64/unaligned_ldst.ll b/test/CodeGen/AArch64/arm64-unaligned_ldst.ll similarity index 100% rename from test/CodeGen/ARM64/unaligned_ldst.ll rename to test/CodeGen/AArch64/arm64-unaligned_ldst.ll diff --git a/test/CodeGen/ARM64/uzp.ll b/test/CodeGen/AArch64/arm64-uzp.ll similarity index 98% rename from test/CodeGen/ARM64/uzp.ll rename to test/CodeGen/AArch64/arm64-uzp.ll index 60e16d0d686c..cdd8d31c9981 100644 --- a/test/CodeGen/ARM64/uzp.ll +++ b/test/CodeGen/AArch64/arm64-uzp.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: vuzpi8: diff --git a/test/CodeGen/ARM64/vaargs.ll b/test/CodeGen/AArch64/arm64-vaargs.ll similarity index 100% rename from test/CodeGen/ARM64/vaargs.ll rename to test/CodeGen/AArch64/arm64-vaargs.ll diff --git a/test/CodeGen/ARM64/vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll similarity index 68% rename from test/CodeGen/ARM64/vabs.ll rename to test/CodeGen/AArch64/arm64-vabs.ll index 0d8aa24e1b47..5afc8d9f3f49 100644 --- a/test/CodeGen/ARM64/vabs.ll +++ b/test/CodeGen/AArch64/arm64-vabs.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { @@ -6,7 +6,7 @@ define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { 
;CHECK: sabdl.8h %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> ret <8 x i16> %tmp4 } @@ -16,7 +16,7 @@ define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sabdl.4s %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } @@ -26,7 +26,7 @@ define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sabdl.2d %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> ret <2 x i64> %tmp4 } @@ -38,7 +38,7 @@ define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { %load2 = load <16 x i8>* %B %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> - %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> ret <8 x i16> %tmp4 } @@ -50,7 +50,7 @@ define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { %load2 = load <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } @@ -62,7 +62,7 @@ define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { %load2 = load <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> ret <2 x i64> %tmp4 } @@ -72,7 +72,7 @@ define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: uabdl.8h %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> ret <8 x i16> %tmp4 } @@ -82,7 +82,7 @@ define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uabdl.4s %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } @@ -92,7 +92,7 @@ define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uabdl.2d %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, 
<2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> ret <2 x i64> %tmp4 } @@ -105,7 +105,7 @@ define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> - %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = zext <8 x i8> %tmp3 to <8 x i16> ret <8 x i16> %tmp4 } @@ -117,7 +117,7 @@ define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { %load2 = load <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = zext <4 x i16> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } @@ -129,7 +129,7 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { %load2 = load <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = zext <2 x i32> %tmp3 to <2 x i64> ret <2 x i64> %tmp4 } @@ -139,7 +139,7 @@ define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: fabd.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -148,7 +148,7 @@ define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fabd.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -157,20 +157,20 @@ define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fabd.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: sabd_8b: ;CHECK: sabd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x 
i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -179,7 +179,7 @@ define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: sabd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -188,7 +188,7 @@ define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sabd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -197,7 +197,7 @@ define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sabd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -206,7 +206,7 @@ define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sabd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -215,23 +215,23 @@ define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sabd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: uabd_8b: ;CHECK: uabd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -240,7 +240,7 @@ define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uabd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x 
i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -249,7 +249,7 @@ define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uabd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -258,7 +258,7 @@ define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uabd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -267,7 +267,7 @@ define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uabd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -276,22 +276,22 @@ define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uabd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: sqabs_8b: ;CHECK: sqabs.8b %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8> %tmp1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp3 } @@ -299,7 +299,7 @@ define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: sqabs_16b: ;CHECK: sqabs.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8> %tmp1) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp3 } @@ -307,7 +307,7 @@ define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: sqabs_4h: ;CHECK: sqabs.4h %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1) ret <4 x i16> %tmp3 } @@ -315,7 +315,7 @@ define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqabs_8h: 
;CHECK: sqabs.8h %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1) ret <8 x i16> %tmp3 } @@ -323,7 +323,7 @@ define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: sqabs_2s: ;CHECK: sqabs.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp3 } @@ -331,22 +331,22 @@ define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqabs_4s: ;CHECK: sqabs.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: sqneg_8b: ;CHECK: sqneg.8b %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8> %tmp1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp3 } @@ -354,7 +354,7 @@ define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: sqneg_16b: ;CHECK: sqneg.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8> %tmp1) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp3 } @@ -362,7 +362,7 @@ define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: sqneg_4h: ;CHECK: sqneg.4h %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1) ret <4 x i16> %tmp3 } @@ -370,7 +370,7 @@ define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqneg_8h: ;CHECK: sqneg.8h %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1) ret <8 x i16> %tmp3 } @@ -378,7 +378,7 @@ define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: sqneg_2s: ;CHECK: sqneg.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp3 } @@ -386,22 +386,22 @@ define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqneg_4s: ;CHECK: sqneg.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1) ret 
<4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: abs_8b: ;CHECK: abs.8b %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8> %tmp1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp3 } @@ -409,7 +409,7 @@ define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: abs_16b: ;CHECK: abs.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8> %tmp1) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp3 } @@ -417,7 +417,7 @@ define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: abs_4h: ;CHECK: abs.4h %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1) ret <4 x i16> %tmp3 } @@ -425,7 +425,7 @@ define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: abs_8h: ;CHECK: abs.8h %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1) ret <8 x i16> %tmp3 } @@ -433,7 +433,7 @@ define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: abs_2s: ;CHECK: abs.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp3 } @@ -441,32 +441,32 @@ define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: abs_4s: ;CHECK: abs.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1) ret <4 x i32> %tmp3 } define <1 x i64> @abs_1d(<1 x i64> %A) nounwind { ; CHECK-LABEL: abs_1d: ; CHECK: abs d0, d0 - %abs = call <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64> %A) + %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A) ret <1 x i64> %abs } define i64 @abs_1d_honestly(i64 %A) nounwind { ; CHECK-LABEL: abs_1d_honestly: ; CHECK: abs d0, d0 - %abs = call i64 @llvm.arm64.neon.abs.i64(i64 %A) + %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A) ret i64 %abs } -declare <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32>) nounwind readnone -declare <4 x 
i32> @llvm.arm64.neon.abs.v4i32(<4 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64>) nounwind readnone -declare i64 @llvm.arm64.neon.abs.i64(i64) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone +declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { ;CHECK-LABEL: sabal8h: @@ -474,7 +474,7 @@ define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B %tmp3 = load <8 x i16>* %C - %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> %tmp5 = add <8 x i16> %tmp3, %tmp4.1 ret <8 x i16> %tmp5 @@ -486,7 +486,7 @@ define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> %tmp5 = add <4 x i32> %tmp3, %tmp4.1 ret <4 x i32> %tmp5 @@ -498,7 +498,7 @@ define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64> %tmp5 = add <2 x i64> %tmp3, %tmp4.1 @@ -513,7 +513,7 @@ define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin %tmp3 = load <8 x i16>* %C %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> - %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> %tmp5 = add <8 x i16> %tmp3, %tmp4.1 ret <8 x i16> %tmp5 @@ -527,7 +527,7 @@ define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin %tmp3 = load <4 x i32>* %C %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> %tmp5 = add <4 x i32> %tmp3, %tmp4.1 ret <4 x i32> %tmp5 @@ -541,7 +541,7 @@ define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwin %tmp3 = load <2 x i64>* %C %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> 
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> %tmp5 = add <2 x i64> %tmp3, %tmp4.1 ret <2 x i64> %tmp5 @@ -553,7 +553,7 @@ define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind { %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B %tmp3 = load <8 x i16>* %C - %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> %tmp5 = add <8 x i16> %tmp3, %tmp4.1 ret <8 x i16> %tmp5 @@ -565,7 +565,7 @@ define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> %tmp5 = add <4 x i32> %tmp3, %tmp4.1 ret <4 x i32> %tmp5 @@ -577,7 +577,7 @@ define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> %tmp5 = add <2 x i64> %tmp3, %tmp4.1 ret <2 x i64> %tmp5 @@ -591,7 +591,7 @@ define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin %tmp3 = load <8 x i16>* %C %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> - %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16> %tmp5 = add <8 x i16> %tmp3, %tmp4.1 ret <8 x i16> %tmp5 @@ -605,7 +605,7 @@ define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin %tmp3 = load <4 x i32>* %C %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32> %tmp5 = add <4 x i32> %tmp3, %tmp4.1 ret <4 x i32> %tmp5 @@ -619,7 +619,7 @@ define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwin %tmp3 = load <2 x i64>* %C %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64> %tmp5 = add <2 x i64> %tmp3, %tmp4.1 ret <2 x i64> %tmp5 @@ -630,7 +630,7 @@ define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ;CHECK: saba.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x 
i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = load <8 x i8>* %C %tmp5 = add <8 x i8> %tmp3, %tmp4 ret <8 x i8> %tmp5 @@ -641,7 +641,7 @@ define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind ;CHECK: saba.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) %tmp4 = load <16 x i8>* %C %tmp5 = add <16 x i8> %tmp3, %tmp4 ret <16 x i8> %tmp5 @@ -652,7 +652,7 @@ define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind ;CHECK: saba.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = load <4 x i16>* %C %tmp5 = add <4 x i16> %tmp3, %tmp4 ret <4 x i16> %tmp5 @@ -663,7 +663,7 @@ define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind ;CHECK: saba.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) %tmp4 = load <8 x i16>* %C %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 @@ -674,7 +674,7 @@ define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind ;CHECK: saba.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = load <2 x i32>* %C %tmp5 = add <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 @@ -685,7 +685,7 @@ define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind ;CHECK: saba.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) %tmp4 = load <4 x i32>* %C %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 @@ -696,7 +696,7 @@ define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ;CHECK: uaba.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = load <8 x i8>* %C %tmp5 = add <8 x i8> %tmp3, %tmp4 ret <8 x i8> %tmp5 @@ -707,7 +707,7 @@ define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind ;CHECK: uaba.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) %tmp4 = load <16 x i8>* %C %tmp5 = add <16 x i8> %tmp3, %tmp4 ret <16 x i8> %tmp5 @@ -718,7 +718,7 @@ define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind ;CHECK: uaba.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = load <4 x i16>* %C %tmp5 = add <4 x i16> 
%tmp3, %tmp4 ret <4 x i16> %tmp5 @@ -729,7 +729,7 @@ define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind ;CHECK: uaba.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) %tmp4 = load <8 x i16>* %C %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 @@ -740,7 +740,7 @@ define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind ;CHECK: uaba.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = load <2 x i32>* %C %tmp5 = add <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 @@ -751,7 +751,7 @@ define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind ;CHECK: uaba.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) %tmp4 = load <4 x i32>* %C %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 @@ -761,19 +761,19 @@ define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind define float @fabds(float %a, float %b) nounwind { ; CHECK-LABEL: fabds: ; CHECK: fabd s0, s0, s1 - %vabd.i = tail call float @llvm.arm64.sisd.fabd.f32(float %a, float %b) nounwind + %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind ret float %vabd.i } define double @fabdd(double %a, double %b) nounwind { ; CHECK-LABEL: fabdd: ; CHECK: fabd d0, d0, d1 - %vabd.i = tail call double @llvm.arm64.sisd.fabd.f64(double %a, double %b) nounwind + %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind ret double %vabd.i } -declare double @llvm.arm64.sisd.fabd.f64(double, double) nounwind readnone -declare float @llvm.arm64.sisd.fabd.f32(float, float) nounwind readnone +declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone +declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: uabdl_from_extract_dup: @@ -784,7 +784,7 @@ define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind + %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind %res1 = zext <2 x i32> %res to <2 x i64> ret <2 x i64> %res1 } @@ -798,7 +798,7 @@ define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind + %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind %res1 = zext <2 x i32> %res to <2 x i64> ret <2 x i64> %res1 } diff --git a/test/CodeGen/ARM64/vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll similarity index 80% rename from test/CodeGen/ARM64/vadd.ll rename to test/CodeGen/AArch64/arm64-vadd.ll index f674c6de3390..9ed8aa6d7c5d 100644 --- a/test/CodeGen/ARM64/vadd.ll +++ 
b/test/CodeGen/AArch64/arm64-vadd.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: addhn8b: ;CHECK: addhn.8b %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: addhn.4h %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: addhn.2s %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i32> %tmp3 } @@ -31,8 +31,8 @@ define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ;CHECK-LABEL: addhn2_16b: ;CHECK: addhn.8b ;CHECK-NEXT: addhn2.16b - %vaddhn2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind - %vaddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> ret <16 x i8> %res } @@ -41,8 +41,8 @@ define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ;CHECK-LABEL: addhn2_8h: ;CHECK: addhn.4h ;CHECK-NEXT: addhn2.8h - %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind - %vaddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> ret <8 x i16> %res } @@ -51,15 +51,15 @@ define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ;CHECK-LABEL: addhn2_4s: ;CHECK: addhn.2s ;CHECK-NEXT: addhn2.4s - %vaddhn2.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind - %vaddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> ret <4 x i32> %res } -declare <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) 
nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { @@ -67,7 +67,7 @@ define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: raddhn.8b %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i8> %tmp3 } @@ -76,7 +76,7 @@ define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: raddhn.4h %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i16> %tmp3 } @@ -85,7 +85,7 @@ define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: raddhn.2s %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i32> %tmp3 } @@ -93,8 +93,8 @@ define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ;CHECK-LABEL: raddhn2_16b: ;CHECK: raddhn.8b ;CHECK-NEXT: raddhn2.16b - %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind - %vraddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> ret <16 x i8> %res } @@ -103,8 +103,8 @@ define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ;CHECK-LABEL: raddhn2_8h: ;CHECK: raddhn.4h ;CHECK-NEXT: raddhn2.8h - %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind - %vraddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> ret <8 x i16> %res } @@ -113,15 +113,15 @@ define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ;CHECK-LABEL: raddhn2_4s: ;CHECK: raddhn.2s ;CHECK-NEXT: raddhn2.4s - %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind - %vraddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> ret <4 x i32> %res } -declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) 
nounwind readnone -declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: saddl8h: @@ -428,7 +428,7 @@ define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind { ;CHECK-LABEL: saddlp4h: ;CHECK: saddlp.4h %tmp1 = load <8 x i8>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) ret <4 x i16> %tmp3 } @@ -436,7 +436,7 @@ define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind { ;CHECK-LABEL: saddlp2s: ;CHECK: saddlp.2s %tmp1 = load <4 x i16>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) ret <2 x i32> %tmp3 } @@ -444,7 +444,7 @@ define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind { ;CHECK-LABEL: saddlp1d: ;CHECK: saddlp.1d %tmp1 = load <2 x i32>* %A - %tmp3 = call <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1) + %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1) ret <1 x i64> %tmp3 } @@ -452,7 +452,7 @@ define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind { ;CHECK-LABEL: saddlp8h: ;CHECK: saddlp.8h %tmp1 = load <16 x i8>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) ret <8 x i16> %tmp3 } @@ -460,7 +460,7 @@ define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind { ;CHECK-LABEL: saddlp4s: ;CHECK: saddlp.4s %tmp1 = load <8 x i16>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) ret <4 x i32> %tmp3 } @@ -468,23 +468,23 @@ define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind { ;CHECK-LABEL: saddlp2d: ;CHECK: saddlp.2d %tmp1 = load <4 x i32>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) ret <2 x i64> %tmp3 } -declare <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind { 
;CHECK-LABEL: uaddlp4h: ;CHECK: uaddlp.4h %tmp1 = load <8 x i8>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) ret <4 x i16> %tmp3 } @@ -492,7 +492,7 @@ define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind { ;CHECK-LABEL: uaddlp2s: ;CHECK: uaddlp.2s %tmp1 = load <4 x i16>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) ret <2 x i32> %tmp3 } @@ -500,7 +500,7 @@ define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind { ;CHECK-LABEL: uaddlp1d: ;CHECK: uaddlp.1d %tmp1 = load <2 x i32>* %A - %tmp3 = call <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1) + %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1) ret <1 x i64> %tmp3 } @@ -508,7 +508,7 @@ define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind { ;CHECK-LABEL: uaddlp8h: ;CHECK: uaddlp.8h %tmp1 = load <16 x i8>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) ret <8 x i16> %tmp3 } @@ -516,7 +516,7 @@ define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind { ;CHECK-LABEL: uaddlp4s: ;CHECK: uaddlp.4s %tmp1 = load <8 x i16>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) ret <4 x i32> %tmp3 } @@ -524,23 +524,23 @@ define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind { ;CHECK-LABEL: uaddlp2d: ;CHECK: uaddlp.2d %tmp1 = load <4 x i32>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) ret <2 x i64> %tmp3 } -declare <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: sadalp4h: ;CHECK: sadalp.4h %tmp1 = load <8 x i8>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) %tmp4 = load <4 x i16>* %B %tmp5 = add <4 x i16> %tmp3, %tmp4 ret <4 x i16> %tmp5 @@ -550,7 +550,7 @@ define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind { ;CHECK-LABEL: sadalp2s: ;CHECK: sadalp.2s %tmp1 = load <4 x i16>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> 
%tmp1) %tmp4 = load <2 x i32>* %B %tmp5 = add <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 @@ -560,7 +560,7 @@ define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sadalp8h: ;CHECK: sadalp.8h %tmp1 = load <16 x i8>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) %tmp4 = load <8 x i16>* %B %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 @@ -570,7 +570,7 @@ define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sadalp4s: ;CHECK: sadalp.4s %tmp1 = load <8 x i16>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) %tmp4 = load <4 x i32>* %B %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 @@ -580,7 +580,7 @@ define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind { ;CHECK-LABEL: sadalp2d: ;CHECK: sadalp.2d %tmp1 = load <4 x i32>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) %tmp4 = load <2 x i64>* %B %tmp5 = add <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 @@ -590,7 +590,7 @@ define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: uadalp4h: ;CHECK: uadalp.4h %tmp1 = load <8 x i8>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) %tmp4 = load <4 x i16>* %B %tmp5 = add <4 x i16> %tmp3, %tmp4 ret <4 x i16> %tmp5 @@ -600,7 +600,7 @@ define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind { ;CHECK-LABEL: uadalp2s: ;CHECK: uadalp.2s %tmp1 = load <4 x i16>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) %tmp4 = load <2 x i32>* %B %tmp5 = add <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 @@ -610,7 +610,7 @@ define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: uadalp8h: ;CHECK: uadalp.8h %tmp1 = load <16 x i8>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) %tmp4 = load <8 x i16>* %B %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 @@ -620,7 +620,7 @@ define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: uadalp4s: ;CHECK: uadalp.4s %tmp1 = load <8 x i16>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) %tmp4 = load <4 x i32>* %B %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 @@ -630,7 +630,7 @@ define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind { ;CHECK-LABEL: uadalp2d: ;CHECK: uadalp.2d %tmp1 = load <4 x i32>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) %tmp4 = load <2 x i64>* %B %tmp5 = add <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 @@ -641,7 +641,7 @@ define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: addp.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x 
i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -650,7 +650,7 @@ define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: addp.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -659,7 +659,7 @@ define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: addp.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -668,7 +668,7 @@ define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: addp.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -677,7 +677,7 @@ define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: addp.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -686,7 +686,7 @@ define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: addp.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -695,24 +695,24 @@ define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: addp.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: faddp_2s: ;CHECK: faddp.2s %tmp1 = load <2 x float>* %A 
%tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -721,7 +721,7 @@ define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: faddp.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -730,13 +730,13 @@ define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: faddp.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: uaddl2_duprhs diff --git a/test/CodeGen/ARM64/vaddlv.ll b/test/CodeGen/AArch64/arm64-vaddlv.ll similarity index 55% rename from test/CodeGen/ARM64/vaddlv.ll rename to test/CodeGen/AArch64/arm64-vaddlv.ll index d4d4608ba076..2d6413812ec8 100644 --- a/test/CodeGen/ARM64/vaddlv.ll +++ b/test/CodeGen/AArch64/arm64-vaddlv.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone { ; CHECK: test_vaddlv_s32 @@ -6,7 +6,7 @@ define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone { ; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddlv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind + %vaddlv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind ret i64 %vaddlv.i } @@ -16,11 +16,11 @@ define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone { ; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddlv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind + %vaddlv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind ret i64 %vaddlv.i } -declare i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone +declare i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone -declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone +declare i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM64/vaddv.ll b/test/CodeGen/AArch64/arm64-vaddv.ll similarity index 63% rename from test/CodeGen/ARM64/vaddv.ll rename to test/CodeGen/AArch64/arm64-vaddv.ll index 154f91779375..2d92ce6ea570 100644 --- a/test/CodeGen/ARM64/vaddv.ll +++ b/test/CodeGen/AArch64/arm64-vaddv.ll @@ -1,4 +1,4 
@@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s define signext i8 @test_vaddv_s8(<8 x i8> %a1) { ; CHECK-LABEL: test_vaddv_s8: @@ -6,7 +6,7 @@ define signext i8 @test_vaddv_s8(<8 x i8> %a1) { ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -17,7 +17,7 @@ define signext i16 @test_vaddv_s16(<4 x i16> %a1) { ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -29,7 +29,7 @@ define i32 @test_vaddv_s32(<2 x i32> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1) ret i32 %vaddv.i } @@ -39,7 +39,7 @@ define i64 @test_vaddv_s64(<2 x i64> %a1) { ; CHECK-NEXT: fmov x0, [[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64> %a1) + %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1) ret i64 %vaddv.i } @@ -49,7 +49,7 @@ define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -60,7 +60,7 @@ define i32 @test_vaddv_u8_masked(<8 x i8> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1) %0 = and i32 %vaddv.i, 511 ; 0x1ff ret i32 %0 } @@ -71,7 +71,7 @@ define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -82,7 +82,7 @@ define i32 @test_vaddv_u16_masked(<4 x i16> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1) %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff ret i32 %0 } @@ -94,7 +94,7 @@ define i32 @test_vaddv_u32(<2 x i32> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1) ret i32 %vaddv.i } @@ -103,7 +103,7 @@ define float @test_vaddv_f32(<2 x float> %a1) { ; CHECK: faddp.2s s0, v0 ; CHECK-NEXT: ret entry: - %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1) + %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1) ret float %vaddv.i } @@ -113,7 +113,7 @@ define float @test_vaddv_v4f32(<4 x float> %a1) { ; CHECK: faddp.2s s0, [[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call float 
@llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1) + %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1) ret float %vaddv.i } @@ -122,7 +122,7 @@ define double @test_vaddv_f64(<2 x double> %a1) { ; CHECK: faddp.2d d0, v0 ; CHECK-NEXT: ret entry: - %vaddv.i = tail call double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1) + %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1) ret double %vaddv.i } @@ -132,7 +132,7 @@ define i64 @test_vaddv_u64(<2 x i64> %a1) { ; CHECK-NEXT: fmov x0, [[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1) + %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1) ret i64 %vaddv.i } @@ -143,7 +143,7 @@ define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) { ; CHECK-NOT: ins ; CHECK: ret entry: - %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1) + %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1) %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0 ret <1 x i64> %vec } @@ -154,7 +154,7 @@ define signext i8 @test_vaddvq_s8(<16 x i8> %a1) { ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -165,7 +165,7 @@ define signext i16 @test_vaddvq_s16(<8 x i16> %a1) { ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -176,7 +176,7 @@ define i32 @test_vaddvq_s32(<4 x i32> %a1) { ; CHECK-NEXT: fmov w0, [[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1) ret i32 %vaddv.i } @@ -186,7 +186,7 @@ define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1) %0 = trunc i32 %vaddv.i to i8 ret i8 %0 } @@ -197,7 +197,7 @@ define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) { ; CHECK-NEXT: fmov w0, s[[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1) %0 = trunc i32 %vaddv.i to i16 ret i16 %0 } @@ -208,38 +208,38 @@ define i32 @test_vaddvq_u32(<4 x i32> %a1) { ; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]] ; CHECK-NEXT: ret entry: - %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32> %a1) + %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1) ret i32 %vaddv.i } -declare i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>) -declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>) +declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>) -declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>) +declare 
i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>) -declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>) +declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>) -declare i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64>) +declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>) -declare i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32>) +declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>) -declare i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>) -declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>) +declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>) -declare i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64>) +declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>) -declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>) +declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>) -declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>) +declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>) -declare float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1) -declare float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1) -declare double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1) +declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1) +declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1) +declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1) diff --git a/test/CodeGen/ARM64/variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll similarity index 100% rename from test/CodeGen/ARM64/variadic-aapcs.ll rename to test/CodeGen/AArch64/arm64-variadic-aapcs.ll diff --git a/test/CodeGen/ARM64/vbitwise.ll b/test/CodeGen/AArch64/arm64-vbitwise.ll similarity index 86% rename from test/CodeGen/ARM64/vbitwise.ll rename to test/CodeGen/AArch64/arm64-vbitwise.ll index 7d8378de292d..93de95e52e53 100644 --- a/test/CodeGen/ARM64/vbitwise.ll +++ b/test/CodeGen/AArch64/arm64-vbitwise.ll @@ -1,10 +1,10 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @rbit_8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: rbit_8b: ;CHECK: rbit.8b %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8> %tmp1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp3 } @@ -12,12 +12,12 @@ define <16 x i8> @rbit_16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: rbit_16b: ;CHECK: rbit.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8> %tmp1) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) nounwind readnone define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind { ;CHECK-LABEL: sxtl8h: diff --git a/test/CodeGen/ARM64/vclz.ll b/test/CodeGen/AArch64/arm64-vclz.ll similarity index 98% rename from test/CodeGen/ARM64/vclz.ll rename to test/CodeGen/AArch64/arm64-vclz.ll index ddc09ed85faf..cf5670a0354f 100644 --- a/test/CodeGen/ARM64/vclz.ll +++ b/test/CodeGen/AArch64/arm64-vclz.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 
-aarch64-neon-syntax=apple < %s | FileCheck %s define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp { ; CHECK-LABEL: test_vclz_u8: diff --git a/test/CodeGen/ARM64/vcmp.ll b/test/CodeGen/AArch64/arm64-vcmp.ll similarity index 76% rename from test/CodeGen/ARM64/vcmp.ll rename to test/CodeGen/AArch64/arm64-vcmp.ll index 56153f08f357..982ab09ee69e 100644 --- a/test/CodeGen/ARM64/vcmp.ll +++ b/test/CodeGen/AArch64/arm64-vcmp.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define void @fcmltz_4s(<4 x float> %a, <4 x i16>* %p) nounwind { @@ -18,7 +18,7 @@ define <2 x i32> @facge_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: facge.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x i32> %tmp3 } @@ -27,7 +27,7 @@ define <4 x i32> @facge_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: facge.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x i32> %tmp3 } @@ -36,20 +36,20 @@ define <2 x i64> @facge_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: facge.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x i32> @facgt_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: facgt_2s: ;CHECK: facgt.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x i32> %tmp3 } @@ -58,7 +58,7 @@ define <4 x i32> @facgt_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: facgt.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x i32> %tmp3 } @@ -67,47 +67,47 @@ define <2 x i64> @facgt_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: facgt.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, 
<2 x double> %tmp2) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone define i32 @facge_s(float %A, float %B) nounwind { ; CHECK-LABEL: facge_s: ; CHECK: facge {{s[0-9]+}}, s0, s1 - %mask = call i32 @llvm.arm64.neon.facge.i32.f32(float %A, float %B) + %mask = call i32 @llvm.aarch64.neon.facge.i32.f32(float %A, float %B) ret i32 %mask } define i64 @facge_d(double %A, double %B) nounwind { ; CHECK-LABEL: facge_d: ; CHECK: facge {{d[0-9]+}}, d0, d1 - %mask = call i64 @llvm.arm64.neon.facge.i64.f64(double %A, double %B) + %mask = call i64 @llvm.aarch64.neon.facge.i64.f64(double %A, double %B) ret i64 %mask } -declare i64 @llvm.arm64.neon.facge.i64.f64(double, double) -declare i32 @llvm.arm64.neon.facge.i32.f32(float, float) +declare i64 @llvm.aarch64.neon.facge.i64.f64(double, double) +declare i32 @llvm.aarch64.neon.facge.i32.f32(float, float) define i32 @facgt_s(float %A, float %B) nounwind { ; CHECK-LABEL: facgt_s: ; CHECK: facgt {{s[0-9]+}}, s0, s1 - %mask = call i32 @llvm.arm64.neon.facgt.i32.f32(float %A, float %B) + %mask = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %A, float %B) ret i32 %mask } define i64 @facgt_d(double %A, double %B) nounwind { ; CHECK-LABEL: facgt_d: ; CHECK: facgt {{d[0-9]+}}, d0, d1 - %mask = call i64 @llvm.arm64.neon.facgt.i64.f64(double %A, double %B) + %mask = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %A, double %B) ret i64 %mask } -declare i64 @llvm.arm64.neon.facgt.i64.f64(double, double) -declare i32 @llvm.arm64.neon.facgt.i32.f32(float, float) +declare i64 @llvm.aarch64.neon.facgt.i64.f64(double, double) +declare i32 @llvm.aarch64.neon.facgt.i32.f32(float, float) define <8 x i8> @cmtst_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: cmtst_8b: diff --git a/test/CodeGen/AArch64/arm64-vcnt.ll b/test/CodeGen/AArch64/arm64-vcnt.ll new file mode 100644 index 000000000000..903501ec16a9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-vcnt.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind { +;CHECK-LABEL: cls_8b: +;CHECK: cls.8b + %tmp1 = load <8 x i8>* %A + %tmp3 = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp3 +} + +define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind { +;CHECK-LABEL: cls_16b: +;CHECK: cls.16b + %tmp1 = load <16 x i8>* %A + %tmp3 = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp3 +} + +define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind { +;CHECK-LABEL: cls_4h: +;CHECK: cls.4h + %tmp1 = load <4 x i16>* %A + %tmp3 = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp3 +} + +define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind { +;CHECK-LABEL: cls_8h: +;CHECK: cls.8h + %tmp1 = load <8 x i16>* %A + %tmp3 = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp3 +} + +define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind { +;CHECK-LABEL: cls_2s: +;CHECK: cls.2s + %tmp1 = load <2 
x i32>* %A + %tmp3 = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind { +;CHECK-LABEL: cls_4s: +;CHECK: cls.4s + %tmp1 = load <4 x i32>* %A + %tmp3 = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp3 +} + +declare <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM64/vcombine.ll b/test/CodeGen/AArch64/arm64-vcombine.ll similarity index 90% rename from test/CodeGen/ARM64/vcombine.ll rename to test/CodeGen/AArch64/arm64-vcombine.ll index 16f591e378e1..fa1299603af3 100644 --- a/test/CodeGen/ARM64/vcombine.ll +++ b/test/CodeGen/AArch64/arm64-vcombine.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s ; LowerCONCAT_VECTORS() was reversing the order of two parts. ; rdar://11558157 diff --git a/test/CodeGen/ARM64/vcvt.ll b/test/CodeGen/AArch64/arm64-vcvt.ll similarity index 65% rename from test/CodeGen/ARM64/vcvt.ll rename to test/CodeGen/AArch64/arm64-vcvt.ll index 19bb8cb8dc5c..6570f0e3e7eb 100644 --- a/test/CodeGen/ARM64/vcvt.ll +++ b/test/CodeGen/AArch64/arm64-vcvt.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtas_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtas.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtas.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -23,20 +23,20 @@ define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtas.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtau_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtau.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> 
%A) ret <2 x i32> %tmp3 } @@ -45,7 +45,7 @@ define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtau.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -54,20 +54,20 @@ define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtau.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtms_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtms.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -76,7 +76,7 @@ define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtms.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -85,20 +85,20 @@ define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtms.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtmu_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtmu.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -107,7 +107,7 @@ define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtmu.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -116,20 +116,20 @@ define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtmu.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> 
%A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtps_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtps.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -138,7 +138,7 @@ define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtps.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -147,20 +147,20 @@ define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtps.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtpu_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtpu.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -169,7 +169,7 @@ define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtpu.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -178,20 +178,20 @@ define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtpu.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: 
fcvtns_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtns.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -200,7 +200,7 @@ define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtns.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -209,20 +209,20 @@ define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtns.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtnu_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtnu.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } @@ -231,7 +231,7 @@ define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtnu.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } @@ -240,13 +240,13 @@ define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtnu.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtzs_2s: @@ -401,7 +401,7 @@ define <2 x float> @frintn_2s(<2 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: frintn.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float> %A) + %tmp3 = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } @@ -410,7 +410,7 @@ define <4 x float> @frintn_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: frintn.4s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float> %A) + %tmp3 = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %A) ret <4 x float> 
%tmp3 } @@ -419,13 +419,13 @@ define <2 x double> @frintn_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: frintn.2d v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double> %A) + %tmp3 = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) nounwind readnone define <2 x float> @frintp_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: frintp_2s: @@ -525,7 +525,7 @@ define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtxn v0.2s, v0.2d ;CHECK-NEXT: ret - %tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) ret <2 x float> %tmp3 } @@ -534,19 +534,19 @@ define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtxn2 v0.4s, v1.2d ;CHECK-NEXT: ret - %tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) %res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> ret <4 x float> %res } -declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtzsc_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtzs.2s v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1) ret <2 x i32> %tmp3 } @@ -555,7 +555,7 @@ define <4 x i32> @fcvtzsc_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtzs.4s v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1) ret <4 x i32> %tmp3 } @@ -564,20 +564,20 @@ define <2 x i64> @fcvtzsc_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtzs.2d v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtzuc_2s: ;CHECK-NOT: ld1 ;CHECK: fcvtzu.2s v0, v0, 
#1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1) ret <2 x i32> %tmp3 } @@ -586,7 +586,7 @@ define <4 x i32> @fcvtzuc_4s(<4 x float> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtzu.4s v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1) ret <4 x i32> %tmp3 } @@ -595,20 +595,20 @@ define <2 x i64> @fcvtzuc_2d(<2 x double> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: fcvtzu.2d v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1) ret <2 x i64> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind { ;CHECK-LABEL: scvtf_2sc: ;CHECK-NOT: ld1 ;CHECK: scvtf.2s v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1) + %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1) ret <2 x float> %tmp3 } @@ -617,7 +617,7 @@ define <4 x float> @scvtf_4sc(<4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: scvtf.4s v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1) + %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1) ret <4 x float> %tmp3 } @@ -626,20 +626,20 @@ define <2 x double> @scvtf_2dc(<2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: scvtf.2d v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1) + %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone -declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone -declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind { ;CHECK-LABEL: ucvtf_2sc: ;CHECK-NOT: ld1 ;CHECK: ucvtf.2s v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1) + %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1) ret <2 x float> %tmp3 } @@ -648,7 +648,7 @@ define <4 x float> @ucvtf_4sc(<4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: ucvtf.4s v0, v0, #1 
;CHECK-NEXT: ret - %tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1) + %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1) ret <4 x float> %tmp3 } @@ -657,7 +657,7 @@ define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: ucvtf.2d v0, v0, #1 ;CHECK-NEXT: ret - %tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1) + %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1) ret <2 x double> %tmp3 } @@ -665,22 +665,22 @@ define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind { ;CHECK-LABEL: autogen_SD28458: ;CHECK: fcvt ;CHECK: ret -define void @autogen_SD28458() { - %Tr53 = fptrunc <8 x double> undef to <8 x float> - store <8 x float> %Tr53, <8 x float>* undef +define void @autogen_SD28458(<8 x double> %val.f64, <8 x float>* %addr.f32) { + %Tr53 = fptrunc <8 x double> %val.f64 to <8 x float> + store <8 x float> %Tr53, <8 x float>* %addr.f32 ret void } ;CHECK-LABEL: autogen_SD19225: ;CHECK: fcvt ;CHECK: ret -define void @autogen_SD19225() { - %A = load <8 x float>* undef +define void @autogen_SD19225(<8 x double>* %addr.f64, <8 x float>* %addr.f32) { + %A = load <8 x float>* %addr.f32 %Tr53 = fpext <8 x float> %A to <8 x double> - store <8 x double> %Tr53, <8 x double>* undef + store <8 x double> %Tr53, <8 x double>* %addr.f64 ret void } -declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone -declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone -declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/ARM64/vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll similarity index 73% rename from test/CodeGen/ARM64/vcvt_f.ll rename to test/CodeGen/AArch64/arm64-vcvt_f.ll index d67aa3b9d471..d24495844b45 100644 --- a/test/CodeGen/ARM64/vcvt_f.ll +++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s -; RUN: llc < %s -O0 -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f64_f32: @@ -38,7 +38,7 @@ define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) noun define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp { ; CHECK-LABEL: test_vcvtx_f32_f64: - %vcvtx1.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind + %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind ; CHECK: fcvtxn ret <2 x float> %vcvtx1.i ; CHECK: ret @@ -46,7 +46,7 @@ define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp { define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp { ; CHECK-LABEL: test_vcvtx_high_f32_f64: - %vcvtx2.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind + %vcvtx2.i = tail call <2 x 
float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> ; CHECK: fcvtxn2 ret <4 x float> %res @@ -54,13 +54,13 @@ define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nou } -declare <2 x double> @llvm.arm64.neon.vcvthighfp2df(<4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.vcvtfp2df(<2 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone -declare <2 x float> @llvm.arm64.neon.vcvtdf2fp(<2 x double>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone -declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone define i16 @to_half(float %in) { ; CHECK-LABEL: to_half: diff --git a/test/CodeGen/ARM64/vcvt_f32_su32.ll b/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll similarity index 75% rename from test/CodeGen/ARM64/vcvt_f32_su32.ll rename to test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll index 51e053d97459..1eb7b43d5755 100644 --- a/test/CodeGen/ARM64/vcvt_f32_su32.ll +++ b/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp { ; CHECK-LABEL: ucvt: @@ -37,7 +37,7 @@ define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp { ; CHECK-LABEL: cvtf16: ; CHECK: fcvtl v0.4s, v0.4h ; CHECK-NEXT: ret - %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %a) nounwind + %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %a) nounwind ret <4 x float> %vcvt1.i } @@ -46,7 +46,7 @@ define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp { ; CHECK: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret %in = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %in) nounwind + %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %in) nounwind ret <4 x float> %vcvt1.i } @@ -56,7 +56,7 @@ define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp { ; CHECK-LABEL: cvtf16f32: ; CHECK: fcvtn v0.4h, v0.4s ; CHECK-NEXT: ret - %vcvt1.i = tail call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %a) nounwind + %vcvt1.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) nounwind ret <4 x i16> %vcvt1.i } @@ -64,10 +64,10 @@ define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) { ; CHECK-LABEL: cvtf16f32_high: ; CHECK: fcvtn2 v0.8h, v1.4s ; CHECK-NEXT: ret - %high = call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %high_big) + %high = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %high_big) %res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> ret <8 x i16> %res } -declare <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16>) nounwind readnone +declare 
<4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-vcvt_n.ll b/test/CodeGen/AArch64/arm64-vcvt_n.ll new file mode 100644 index 000000000000..7ed5be6e8af9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-vcvt_n.ll @@ -0,0 +1,49 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp { +; CHECK-LABEL: cvtf32fxpu: +; CHECK: ucvtf.2s v0, v0, #9 +; CHECK: ret + %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9) + ret <2 x float> %vcvt_n1 +} + +define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp { +; CHECK-LABEL: cvtf32fxps: +; CHECK: scvtf.2s v0, v0, #12 +; CHECK: ret + %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12) + ret <2 x float> %vcvt_n1 +} + +define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp { +; CHECK-LABEL: cvtqf32fxpu: +; CHECK: ucvtf.4s v0, v0, #18 +; CHECK: ret + %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18) + ret <4 x float> %vcvt_n1 +} + +define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp { +; CHECK-LABEL: cvtqf32fxps: +; CHECK: scvtf.4s v0, v0, #30 +; CHECK: ret + %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30) + ret <4 x float> %vcvt_n1 +} +define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp { + %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12) + ret <2 x double> %vcvt_n1 +} + +define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp { + %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9) + ret <2 x double> %vcvt_n1 +} + +declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/ARM64/vcvt_su32_f32.ll b/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll similarity index 91% rename from test/CodeGen/ARM64/vcvt_su32_f32.ll rename to test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll index 8c82fa095c81..985a5f762439 100644 --- a/test/CodeGen/ARM64/vcvt_su32_f32.ll +++ b/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <2 x i32> @c1(<2 x float> %a) nounwind readnone ssp { ; CHECK: c1 diff --git a/test/CodeGen/ARM64/vcvtxd_f32_f64.ll b/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll similarity index 54% rename from test/CodeGen/ARM64/vcvtxd_f32_f64.ll rename to test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll index bbe8f0b38641..b29c22cbfda5 100644 --- a/test/CodeGen/ARM64/vcvtxd_f32_f64.ll +++ b/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll @@ -4,8 +4,8 @@ define float @fcvtxn(double %a) { ; CHECK-LABEL: fcvtxn: ; CHECK: fcvtxn s0, d0 ; CHECK-NEXT: ret - %vcvtxd.i = tail call float @llvm.arm64.sisd.fcvtxn(double %a) 
nounwind + %vcvtxd.i = tail call float @llvm.aarch64.sisd.fcvtxn(double %a) nounwind ret float %vcvtxd.i } -declare float @llvm.arm64.sisd.fcvtxn(double) nounwind readnone +declare float @llvm.aarch64.sisd.fcvtxn(double) nounwind readnone diff --git a/test/CodeGen/ARM64/vecCmpBr.ll b/test/CodeGen/AArch64/arm64-vecCmpBr.ll similarity index 87% rename from test/CodeGen/ARM64/vecCmpBr.ll rename to test/CodeGen/AArch64/arm64-vecCmpBr.ll index 2af8775cea6e..c7321e4b7d07 100644 --- a/test/CodeGen/ARM64/vecCmpBr.ll +++ b/test/CodeGen/AArch64/arm64-vecCmpBr.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s ; ModuleID = 'arm64_vecCmpBr.c' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" target triple = "arm64-apple-ios3.0.0" @@ -13,7 +13,7 @@ define i32 @anyZero64(<4 x i16> %a) #0 { ; CHECK-NEXT: b _bar entry: %0 = bitcast <4 x i16> %a to <8 x i8> - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3 + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3 %1 = trunc i32 %vminv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %if.then, label %return @@ -39,7 +39,7 @@ define i32 @anyZero128(<8 x i16> %a) #0 { entry: %0 = bitcast <8 x i16> %a to <16 x i8> - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3 + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3 %1 = trunc i32 %vminv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %if.then, label %return @@ -63,7 +63,7 @@ define i32 @anyNonZero64(<4 x i16> %a) #0 { entry: %0 = bitcast <4 x i16> %a to <8 x i8> - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3 + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3 %1 = trunc i32 %vmaxv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %return, label %if.then @@ -86,7 +86,7 @@ define i32 @anyNonZero128(<8 x i16> %a) #0 { ; CHECK-NEXT: movz w0, #0 entry: %0 = bitcast <8 x i16> %a to <16 x i8> - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3 + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3 %1 = trunc i32 %vmaxv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %return, label %if.then @@ -109,7 +109,7 @@ define i32 @allZero64(<4 x i16> %a) #0 { ; CHECK-NEXT: b _bar entry: %0 = bitcast <4 x i16> %a to <8 x i8> - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3 + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3 %1 = trunc i32 %vmaxv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %if.then, label %return @@ -132,7 +132,7 @@ define i32 @allZero128(<8 x i16> %a) #0 { ; CHECK-NEXT: b _bar entry: %0 = bitcast <8 x i16> %a to <16 x i8> - %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3 + %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3 %1 = trunc i32 %vmaxv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %if.then, label %return @@ -155,7 +155,7 @@ define i32 @allNonZero64(<4 x i16> %a) #0 { ; CHECK-NEXT: movz w0, #0 entry: %0 = bitcast <4 x i16> %a to <8 x i8> - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3 + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3 %1 = trunc i32 %vminv.i to i8 %tobool = icmp 
eq i8 %1, 0 br i1 %tobool, label %return, label %if.then @@ -178,7 +178,7 @@ define i32 @allNonZero128(<8 x i16> %a) #0 { ; CHECK-NEXT: movz w0, #0 entry: %0 = bitcast <8 x i16> %a to <16 x i8> - %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3 + %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3 %1 = trunc i32 %vminv.i to i8 %tobool = icmp eq i8 %1, 0 br i1 %tobool, label %return, label %if.then @@ -192,13 +192,13 @@ return: ; preds = %entry, %if.then ret i32 %retval.0 } -declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) #2 +declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) #2 -declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) #2 +declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) #2 -declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) #2 +declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) #2 -declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) #2 +declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) #2 attributes #0 = { nounwind ssp "target-cpu"="cyclone" } attributes #1 = { "target-cpu"="cyclone" } diff --git a/test/CodeGen/ARM64/vecFold.ll b/test/CodeGen/AArch64/arm64-vecFold.ll similarity index 74% rename from test/CodeGen/ARM64/vecFold.ll rename to test/CodeGen/AArch64/arm64-vecFold.ll index 6888932f2ce1..aeacfccab3c4 100644 --- a/test/CodeGen/ARM64/vecFold.ll +++ b/test/CodeGen/AArch64/arm64-vecFold.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s| FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple -o - %s| FileCheck %s define <16 x i8> @foov16i8(<8 x i16> %a0, <8 x i16> %b0) nounwind readnone ssp { ; CHECK-LABEL: foov16i8: @@ -50,8 +50,8 @@ define <4 x i32> @foov4i32(<2 x i64> %a0, <2 x i64> %b0) nounwind readnone ssp { define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp { ; CHECK-LABEL: bar: - %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind - %vaddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind + %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind + %vaddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind ; CHECK: addhn.4h v0, v0, v1 ; CHECK-NEXT: addhn2.8h v0, v2, v3 ; CHECK-NEXT: ret @@ -64,7 +64,7 @@ define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1 define <8 x i16> @baz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp { ; CHECK-LABEL: baz: - %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind + %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind %vshrn_high_shift = ashr <4 x i32> %b0, %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16> ; CHECK: addhn.4h v0, v0, v1 @@ -83,8 +83,8 @@ entry: ; CHECK: raddhn.4h v0, v0, v1 ; CHECK-NEXT: raddhn2.8h v0, v2, v3 ; CHECK-NEXT: ret - %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind - %vraddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind + %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind + %vraddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind %0 = bitcast <4 x i16> %vraddhn2.i to 
<1 x i64> %1 = bitcast <4 x i16> %vraddhn2.i10 to <1 x i64> %shuffle.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> @@ -97,8 +97,8 @@ define <8 x i16> @vrshrn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> ; CHECK: rshrn.8b v0, v0, #5 ; CHECK-NEXT: rshrn2.16b v0, v2, #6 ; CHECK-NEXT: ret - %vrshrn_n1 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5) - %vrshrn_n4 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6) + %vrshrn_n1 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5) + %vrshrn_n4 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6) %1 = bitcast <8 x i8> %vrshrn_n1 to <1 x i64> %2 = bitcast <8 x i8> %vrshrn_n4 to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -111,8 +111,8 @@ define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> ; CHECK: rsubhn.8b v0, v0, v1 ; CHECK: rsubhn2.16b v0, v2, v3 ; CHECK-NEXT: ret - %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind - %vrsubhn2.i10 = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind + %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind + %vrsubhn2.i10 = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind %1 = bitcast <8 x i8> %vrsubhn2.i to <1 x i64> %2 = bitcast <8 x i8> %vrsubhn2.i10 to <1 x i64> %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> @@ -122,8 +122,8 @@ define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp { ; CHECK-LABEL: noOpt1: - %vqsub2.i = tail call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind - %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind + %vqsub2.i = tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind + %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind ; CHECK: sqsub.2s v0, v0, v1 ; CHECK-NEXT: addhn2.8h v0, v2, v3 %1 = bitcast <2 x i32> %vqsub2.i to <1 x i64> @@ -133,13 +133,13 @@ define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32> ret <8 x i16> %3 } -declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 
x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone diff --git a/test/CodeGen/ARM64/vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll similarity index 81% rename from test/CodeGen/ARM64/vector-ext.ll rename to test/CodeGen/AArch64/arm64-vector-ext.ll index 9cc0555d8c48..650ff1e14f02 100644 --- a/test/CodeGen/ARM64/vector-ext.ll +++ b/test/CodeGen/AArch64/arm64-vector-ext.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s ;CHECK: @func30 ;CHECK: ushll.4s v0, v0, #0 diff --git a/test/CodeGen/ARM64/vector-imm.ll b/test/CodeGen/AArch64/arm64-vector-imm.ll similarity index 98% rename from test/CodeGen/ARM64/vector-imm.ll rename to test/CodeGen/AArch64/arm64-vector-imm.ll index a84f804c8cd1..9fb088b9a497 100644 --- a/test/CodeGen/ARM64/vector-imm.ll +++ b/test/CodeGen/AArch64/arm64-vector-imm.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @v_orrimm(<8 x i8>* %A) nounwind { ; CHECK-LABEL: v_orrimm: diff --git a/test/CodeGen/ARM64/vector-insertion.ll b/test/CodeGen/AArch64/arm64-vector-insertion.ll similarity index 91% rename from test/CodeGen/ARM64/vector-insertion.ll rename to test/CodeGen/AArch64/arm64-vector-insertion.ll index 0926bcfde9a3..8fbff71f9fc2 100644 --- a/test/CodeGen/ARM64/vector-insertion.ll +++ b/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -mcpu=generic -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -mcpu=generic -aarch64-neon-syntax=apple < %s | FileCheck %s define void @test0f(float* nocapture %x, float %a) #0 { entry: diff --git a/test/CodeGen/ARM64/vector-ldst.ll b/test/CodeGen/AArch64/arm64-vector-ldst.ll similarity index 99% rename from test/CodeGen/ARM64/vector-ldst.ll rename to test/CodeGen/AArch64/arm64-vector-ldst.ll index 154160ee502a..c00191577d17 100644 --- a/test/CodeGen/ARM64/vector-ldst.ll +++ b/test/CodeGen/AArch64/arm64-vector-ldst.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s ; rdar://9428579 diff --git a/test/CodeGen/ARM64/vext.ll b/test/CodeGen/AArch64/arm64-vext.ll similarity index 99% rename from test/CodeGen/ARM64/vext.ll rename to test/CodeGen/AArch64/arm64-vext.ll index c82043940c8f..2240dfd5a1ae 100644 --- a/test/CodeGen/ARM64/vext.ll +++ b/test/CodeGen/AArch64/arm64-vext.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s define void @test_vext_s8() nounwind ssp { ; CHECK-LABEL: test_vext_s8: diff --git a/test/CodeGen/AArch64/arm64-vext_reverse.ll b/test/CodeGen/AArch64/arm64-vext_reverse.ll new file mode 100644 index 000000000000..c45e55edeca5 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-vext_reverse.ll @@ -0,0 +1,172 @@ +; RUN: llc -mtriple=arm64-linux-gnuabi < %s | FileCheck %s + +; The following tests are to check the correctness of reversing the input operand +; of vext by enumerating all cases of using two undefs in shuffle masks.
+ +define <4 x i16> @vext_6701_0(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_0: +; CHECK: ext v0.8b, v1.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_6701_12(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_12: +; CHECK: ext v0.8b, v0.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_6701_13(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_13: +; CHECK: ext v0.8b, v1.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_6701_14(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_14: +; CHECK: ext v0.8b, v1.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_6701_23(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_23: +; CHECK: ext v0.8b, v1.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_6701_24(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_24: +; CHECK: ext v0.8b, v1.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_6701_34(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_6701_34: +; CHECK: ext v0.8b, v1.8b, v0.8b, #4 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_0(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_0: +; CHECK: ext v0.8b, v1.8b, v0.8b, #2 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_12(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_12: +; CHECK: ext v0.8b, v1.8b, v0.8b, #2 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_13(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_13: +; CHECK: ext v0.8b, v1.8b, v0.8b, #2 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_14(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_14: +; CHECK: ext v0.8b, v1.8b, v0.8b, #2 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_23(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_23: +; CHECK: ext v0.8b, v1.8b, v0.8b, #2 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_24(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_24: +; CHECK: rev32 v0.4h, v1.4h + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_5670_34(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_5670_34: +; CHECK: ext v0.8b, v1.8b, v0.8b, #2 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_0(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_0: +; CHECK: ext v0.8b, v1.8b, v0.8b, #6 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_12(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_12: +; CHECK: ext v0.8b, v0.8b, v0.8b, #6 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, 
<4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_13(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_13: +; CHECK: rev32 v0.4h, v0.4h + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_14(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_14: +; CHECK: ext v0.8b, v0.8b, v0.8b, #6 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_23(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_23: +; CHECK: ext v0.8b, v1.8b, v0.8b, #6 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_24(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_24: +; CHECK: ext v0.8b, v1.8b, v0.8b, #6 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} + +define <4 x i16> @vext_7012_34(<4 x i16> %a1, <4 x i16> %a2) { +entry: +; CHECK-LABEL: vext_7012_34: +; CHECK: ext v0.8b, v1.8b, v0.8b, #6 + %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> + ret <4 x i16> %x +} diff --git a/test/CodeGen/ARM64/vfloatintrinsics.ll b/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll similarity index 99% rename from test/CodeGen/ARM64/vfloatintrinsics.ll rename to test/CodeGen/AArch64/arm64-vfloatintrinsics.ll index a8c882bf6960..255a18216de5 100644 --- a/test/CodeGen/ARM64/vfloatintrinsics.ll +++ b/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s ;;; Float vectors diff --git a/test/CodeGen/ARM64/vhadd.ll b/test/CodeGen/AArch64/arm64-vhadd.ll similarity index 51% rename from test/CodeGen/ARM64/vhadd.ll rename to test/CodeGen/AArch64/arm64-vhadd.ll index aed76810e133..6178bf9809dd 100644 --- a/test/CodeGen/ARM64/vhadd.ll +++ b/test/CodeGen/AArch64/arm64-vhadd.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: shadd8b: ;CHECK: shadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: shadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -23,7 +23,7 @@ define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: shadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -32,7 +32,7 @@ define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: shadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret 
<8 x i16> %tmp3 } @@ -41,7 +41,7 @@ define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: shadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -50,7 +50,7 @@ define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: shadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -59,7 +59,7 @@ define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: uhadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -68,7 +68,7 @@ define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uhadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -77,7 +77,7 @@ define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uhadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -86,7 +86,7 @@ define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uhadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -95,7 +95,7 @@ define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uhadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -104,32 +104,32 @@ define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uhadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind 
readnone -declare <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: srhadd8b: ;CHECK: srhadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -138,7 +138,7 @@ define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: srhadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -147,7 +147,7 @@ define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: srhadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -156,7 +156,7 @@ define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: srhadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -165,7 +165,7 @@ define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: srhadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -174,7 +174,7 @@ define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: srhadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> 
%tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -183,7 +183,7 @@ define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: urhadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -192,7 +192,7 @@ define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: urhadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -201,7 +201,7 @@ define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: urhadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -210,7 +210,7 @@ define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: urhadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -219,7 +219,7 @@ define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: urhadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -228,22 +228,22 @@ define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: urhadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x 
i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM64/vhsub.ll b/test/CodeGen/AArch64/arm64-vhsub.ll similarity index 50% rename from test/CodeGen/ARM64/vhsub.ll rename to test/CodeGen/AArch64/arm64-vhsub.ll index 85df4d4eb73a..13bfda3899e5 100644 --- a/test/CodeGen/ARM64/vhsub.ll +++ b/test/CodeGen/AArch64/arm64-vhsub.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @shsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: shsub8b: ;CHECK: shsub.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <16 x i8> @shsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: shsub.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -23,7 +23,7 @@ define <4 x i16> @shsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: shsub.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -32,7 +32,7 @@ define <8 x i16> @shsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: shsub.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -41,7 +41,7 @@ define <2 x i32> @shsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: shsub.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -50,7 +50,7 @@ define <4 x i32> @shsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: shsub.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -59,7 +59,7 @@ define <8 x i8> @uhsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: uhsub.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call 
<8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -68,7 +68,7 @@ define <16 x i8> @uhsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uhsub.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -77,7 +77,7 @@ define <4 x i16> @uhsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uhsub.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -86,7 +86,7 @@ define <8 x i16> @uhsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uhsub.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -95,7 +95,7 @@ define <2 x i32> @uhsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uhsub.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -104,22 +104,22 @@ define <4 x i32> @uhsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uhsub.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 
-declare <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM64/virtual_base.ll b/test/CodeGen/AArch64/arm64-virtual_base.ll similarity index 100% rename from test/CodeGen/ARM64/virtual_base.ll rename to test/CodeGen/AArch64/arm64-virtual_base.ll diff --git a/test/CodeGen/ARM64/vmax.ll b/test/CodeGen/AArch64/arm64-vmax.ll similarity index 52% rename from test/CodeGen/ARM64/vmax.ll rename to test/CodeGen/AArch64/arm64-vmax.ll index b2426f35057f..3f2c134dec6e 100644 --- a/test/CodeGen/ARM64/vmax.ll +++ b/test/CodeGen/AArch64/arm64-vmax.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @smax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: smax_8b: ;CHECK: smax.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <16 x i8> @smax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: smax.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -23,7 +23,7 @@ define <4 x i16> @smax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: smax.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -32,7 +32,7 @@ define <8 x i16> @smax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: smax.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -41,7 +41,7 @@ define <2 x i32> @smax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: smax.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -50,23 +50,23 @@ define <4 x i32> @smax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: smax.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16>, <4 x 
i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @umax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: umax_8b: ;CHECK: umax.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -75,7 +75,7 @@ define <16 x i8> @umax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: umax.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -84,7 +84,7 @@ define <4 x i16> @umax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: umax.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -93,7 +93,7 @@ define <8 x i16> @umax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: umax.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -102,7 +102,7 @@ define <2 x i32> @umax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: umax.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -111,23 +111,23 @@ define <4 x i32> @umax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: umax.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> 
@llvm.aarch64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @smin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: smin_8b: ;CHECK: smin.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -136,7 +136,7 @@ define <16 x i8> @smin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: smin.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -145,7 +145,7 @@ define <4 x i16> @smin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: smin.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -154,7 +154,7 @@ define <8 x i16> @smin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: smin.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -163,7 +163,7 @@ define <2 x i32> @smin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: smin.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -172,23 +172,23 @@ define <4 x i32> @smin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: smin.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> 
@llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @umin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: umin_8b: ;CHECK: umin.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -197,7 +197,7 @@ define <16 x i8> @umin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: umin.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -206,7 +206,7 @@ define <4 x i16> @umin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: umin.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -215,7 +215,7 @@ define <8 x i16> @umin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: umin.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -224,7 +224,7 @@ define <2 x i32> @umin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: umin.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -233,25 +233,25 @@ define <4 x i32> @umin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: umin.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @smaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: smaxp_8b: ;CHECK: smaxp.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = 
call <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -260,7 +260,7 @@ define <16 x i8> @smaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: smaxp.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -269,7 +269,7 @@ define <4 x i16> @smaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: smaxp.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -278,7 +278,7 @@ define <8 x i16> @smaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: smaxp.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -287,7 +287,7 @@ define <2 x i32> @smaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: smaxp.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -296,23 +296,23 @@ define <4 x i32> @smaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: smaxp.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @umaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: umaxp_8b: ;CHECK: umaxp.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -321,7 +321,7 @@ define <16 x i8> @umaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: umaxp.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - 
%tmp3 = call <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -330,7 +330,7 @@ define <4 x i16> @umaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: umaxp.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -339,7 +339,7 @@ define <8 x i16> @umaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: umaxp.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -348,7 +348,7 @@ define <2 x i32> @umaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: umaxp.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -357,25 +357,25 @@ define <4 x i32> @umaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: umaxp.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @sminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: sminp_8b: ;CHECK: sminp.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -384,7 +384,7 @@ define <16 x i8> @sminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: sminp.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ 
-393,7 +393,7 @@ define <4 x i16> @sminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sminp.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -402,7 +402,7 @@ define <8 x i16> @sminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sminp.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -411,7 +411,7 @@ define <2 x i32> @sminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sminp.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -420,23 +420,23 @@ define <4 x i32> @sminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sminp.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i8> @uminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: uminp_8b: ;CHECK: uminp.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -445,7 +445,7 @@ define <16 x i8> @uminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uminp.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -454,7 +454,7 @@ define <4 x i16> @uminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uminp.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x 
i16> %tmp3 } @@ -463,7 +463,7 @@ define <8 x i16> @uminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uminp.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -472,7 +472,7 @@ define <2 x i32> @uminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uminp.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -481,23 +481,23 @@ define <4 x i32> @uminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uminp.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <2 x float> @fmax_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fmax_2s: ;CHECK: fmax.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -506,7 +506,7 @@ define <4 x float> @fmax_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fmax.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -515,20 +515,20 @@ define <2 x double> @fmax_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fmax.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x 
double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @fmaxp_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fmaxp_2s: ;CHECK: fmaxp.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -537,7 +537,7 @@ define <4 x float> @fmaxp_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fmaxp.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -546,20 +546,20 @@ define <2 x double> @fmaxp_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fmaxp.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @fmin_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fmin_2s: ;CHECK: fmin.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -568,7 +568,7 @@ define <4 x float> @fmin_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fmin.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -577,20 +577,20 @@ define <2 x double> @fmin_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fmin.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> 
@llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @fminp_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fminp_2s: ;CHECK: fminp.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -599,7 +599,7 @@ define <4 x float> @fminp_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fminp.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -608,20 +608,20 @@ define <2 x double> @fminp_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fminp.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @fminnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fminnmp_2s: ;CHECK: fminnmp.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -630,7 +630,7 @@ define <4 x float> @fminnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fminnmp.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -639,20 +639,20 @@ define <2 x double> @fminnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fminnmp.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x 
float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @fmaxnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fmaxnmp_2s: ;CHECK: fmaxnmp.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -661,7 +661,7 @@ define <4 x float> @fmaxnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fmaxnmp.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -670,10 +670,10 @@ define <2 x double> @fmaxnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fmaxnmp.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/test/CodeGen/AArch64/arm64-vminmaxnm.ll new file mode 100644 index 000000000000..b5aca45cd479 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-vminmaxnm.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp { +; CHECK: fmaxnm.2s v0, v0, v1 +; CHECK: ret + %vmaxnm2.i = tail call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind + ret <2 x float> %vmaxnm2.i +} + +define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp { +; CHECK: fmaxnm.4s v0, v0, v1 +; CHECK: ret + %vmaxnm2.i = tail call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind + ret <4 x float> %vmaxnm2.i +} + +define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp { +; CHECK: fmaxnm.2d v0, v0, v1 +; CHECK: ret + %vmaxnm2.i = tail call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind + ret <2 x double> %vmaxnm2.i +} + +define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp { +; CHECK: fminnm.2s v0, v0, v1 +; CHECK: ret + %vminnm2.i = tail call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind + ret <2 x float> %vminnm2.i +} + +define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp { +; CHECK: fminnm.4s v0, v0, v1 +; CHECK: ret + %vminnm2.i = tail call <4 
x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind + ret <4 x float> %vminnm2.i +} + +define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp { +; CHECK: fminnm.2d v0, v0, v1 +; CHECK: ret + %vminnm2.i = tail call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind + ret <2 x double> %vminnm2.i +} + +declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone + + +define double @test_fmaxnmv(<2 x double> %in) { +; CHECK-LABEL: test_fmaxnmv: +; CHECK: fmaxnmp.2d d0, v0 + %max = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in) + ret double %max +} + +define double @test_fminnmv(<2 x double> %in) { +; CHECK-LABEL: test_fminnmv: +; CHECK: fminnmp.2d d0, v0 + %min = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in) + ret double %min +} + +declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>) +declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>) diff --git a/test/CodeGen/ARM64/vmovn.ll b/test/CodeGen/AArch64/arm64-vmovn.ll similarity index 74% rename from test/CodeGen/ARM64/vmovn.ll rename to test/CodeGen/AArch64/arm64-vmovn.ll index 675633b6cfad..67e2816a7f5f 100644 --- a/test/CodeGen/ARM64/vmovn.ll +++ b/test/CodeGen/AArch64/arm64-vmovn.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @xtn8b(<8 x i16> %A) nounwind { ;CHECK-LABEL: xtn8b: @@ -62,7 +62,7 @@ define <8 x i8> @sqxtn8b(<8 x i16> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtn.8b v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A) ret <8 x i8> %tmp3 } @@ -71,7 +71,7 @@ define <4 x i16> @sqxtn4h(<4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtn.4h v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A) ret <4 x i16> %tmp3 } @@ -80,7 +80,7 @@ define <2 x i32> @sqxtn2s(<2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtn.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A) ret <2 x i32> %tmp3 } @@ -89,7 +89,7 @@ define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtn2.16b v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A) %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %res } @@ -99,7 +99,7 @@ define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtn2.8h v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> 
%A) %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %res } @@ -109,21 +109,21 @@ define <4 x i32> @sqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtn2.4s v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A) %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %res } -declare <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone define <8 x i8> @uqxtn8b(<8 x i16> %A) nounwind { ;CHECK-LABEL: uqxtn8b: ;CHECK-NOT: ld1 ;CHECK: uqxtn.8b v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A) ret <8 x i8> %tmp3 } @@ -132,7 +132,7 @@ define <4 x i16> @uqxtn4h(<4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: uqxtn.4h v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A) ret <4 x i16> %tmp3 } @@ -141,7 +141,7 @@ define <2 x i32> @uqxtn2s(<2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: uqxtn.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A) ret <2 x i32> %tmp3 } @@ -150,7 +150,7 @@ define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: uqxtn2.16b v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A) %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %res } @@ -160,7 +160,7 @@ define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: uqxtn2.8h v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A) %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %res } @@ -170,21 +170,21 @@ define <4 x i32> @uqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: uqxtn2.4s v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A) %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %res } -declare <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone define <8 x i8> @sqxtun8b(<8 x i16> %A) nounwind { ;CHECK-LABEL: sqxtun8b: ;CHECK-NOT: ld1 ;CHECK: sqxtun.8b v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> 
%A) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A) ret <8 x i8> %tmp3 } @@ -193,7 +193,7 @@ define <4 x i16> @sqxtun4h(<4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtun.4h v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A) ret <4 x i16> %tmp3 } @@ -202,7 +202,7 @@ define <2 x i32> @sqxtun2s(<2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtun.2s v0, v0 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A) ret <2 x i32> %tmp3 } @@ -211,7 +211,7 @@ define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtun2.16b v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A) %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %res } @@ -221,7 +221,7 @@ define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtun2.8h v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A) %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %res } @@ -231,12 +231,12 @@ define <4 x i32> @sqxtun2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind { ;CHECK-NOT: ld1 ;CHECK: sqxtun2.4s v0, v1 ;CHECK-NEXT: ret - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A) %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %res } -declare <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM64/vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll similarity index 80% rename from test/CodeGen/ARM64/vmul.ll rename to test/CodeGen/AArch64/arm64-vmul.ll index b6bd16ac0b4c..6fa60fe346af 100644 --- a/test/CodeGen/ARM64/vmul.ll +++ b/test/CodeGen/AArch64/arm64-vmul.ll @@ -1,4 +1,4 @@ -; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { @@ -6,7 +6,7 @@ define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: smull.8h %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i16> %tmp3 } @@ -15,7 +15,7 @@ define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: smull.4s %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i32> %tmp3 } @@ -24,20 +24,20 @@ define 
<2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: smull.2d %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: umull8h: ;CHECK: umull.8h %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i16> %tmp3 } @@ -46,7 +46,7 @@ define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: umull.4s %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i32> %tmp3 } @@ -55,20 +55,20 @@ define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: umull.2d %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: sqdmull4s: ;CHECK: sqdmull.4s %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i32> %tmp3 } @@ -77,7 +77,7 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sqdmull.2d %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i64> %tmp3 } @@ -88,7 +88,7 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { %load2 = load <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x 
i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i32> %tmp3 } @@ -99,31 +99,31 @@ define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { %load2 = load <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i64> %tmp3 } -declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: pmull8h: ;CHECK: pmull.8h %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i16> %tmp3 } -declare <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: sqdmulh_4h: ;CHECK: sqdmulh.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -132,7 +132,7 @@ define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sqdmulh.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -141,7 +141,7 @@ define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sqdmulh.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -150,7 +150,7 @@ define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sqdmulh.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -159,22 +159,22 @@ define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind { ;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}} %tmp1 = load i32* %A %tmp2 = load i32* %B - %tmp3 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2) + %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2) ret i32 %tmp3 } -declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> 
@llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare i32 @llvm.arm64.neon.sqdmulh.i32(i32, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: sqrdmulh_4h: ;CHECK: sqrdmulh.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -183,7 +183,7 @@ define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sqrdmulh.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -192,7 +192,7 @@ define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sqrdmulh.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -201,7 +201,7 @@ define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sqrdmulh.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -210,22 +210,22 @@ define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind { ;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}} %tmp1 = load i32* %A %tmp2 = load i32* %B - %tmp3 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2) + %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2) ret i32 %tmp3 } -declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare i32 @llvm.arm64.neon.sqrdmulh.i32(i32, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: fmulx_2s: ;CHECK: fmulx.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = 
call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -234,7 +234,7 @@ define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: fmulx.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -243,13 +243,13 @@ define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: fmulx.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: smlal4s: @@ -257,7 +257,7 @@ define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 } @@ -268,7 +268,7 @@ define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp5 = add <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 } @@ -279,7 +279,7 @@ define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp5 = sub <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 } @@ -290,15 +290,15 @@ define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp5 = sub <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 } -declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) -declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i32> 
@llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlal4s: @@ -306,8 +306,8 @@ define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwin %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -317,8 +317,8 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -330,8 +330,8 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw %tmp3 = load <4 x i32>* %C %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -343,8 +343,8 @@ define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounw %tmp3 = load <2 x i64>* %C %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -354,8 +354,8 @@ define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwin %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -365,8 +365,8 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call 
<2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -378,8 +378,8 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw %tmp3 = load <4 x i32>* %C %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -391,8 +391,8 @@ define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounw %tmp3 = load <2 x i64>* %C %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -402,7 +402,7 @@ define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 } @@ -413,7 +413,7 @@ define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp5 = add <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 } @@ -424,7 +424,7 @@ define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C - %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp5 = sub <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 } @@ -435,7 +435,7 @@ define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C - %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp5 = sub <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 } @@ -717,7 +717,7 @@ define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> - %tmp4 
= call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3) + %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3) ret <2 x float> %tmp4 } @@ -728,7 +728,7 @@ define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> - %tmp4 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3) + %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3) ret <4 x float> %tmp4 } @@ -739,7 +739,7 @@ define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> - %tmp4 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3) + %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3) ret <2 x double> %tmp4 } @@ -750,7 +750,7 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) ret <4 x i16> %tmp4 } @@ -761,7 +761,7 @@ define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> - %tmp4 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) + %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) ret <8 x i16> %tmp4 } @@ -772,7 +772,7 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) ret <2 x i32> %tmp4 } @@ -783,7 +783,7 @@ define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) ret <4 x i32> %tmp4 } @@ -792,7 +792,7 @@ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { ;CHECK-NOT: dup ;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1] %tmp1 = extractelement <4 x i32> %B, i32 1 - %tmp2 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %A, i32 %tmp1) + %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1) ret i32 %tmp2 } @@ -803,7 +803,7 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) ret <4 x i16> %tmp4 } @@ -814,7 
+814,7 @@ define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> - %tmp4 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) + %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) ret <8 x i16> %tmp4 } @@ -825,7 +825,7 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) ret <2 x i32> %tmp4 } @@ -836,7 +836,7 @@ define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) ret <4 x i32> %tmp4 } @@ -845,7 +845,7 @@ define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { ;CHECK-NOT: dup ;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1] %tmp1 = extractelement <4 x i32> %B, i32 1 - %tmp2 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1) + %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1) ret i32 %tmp2 } @@ -856,7 +856,7 @@ define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) ret <4 x i32> %tmp4 } @@ -867,7 +867,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) ret <2 x i64> %tmp4 } @@ -879,7 +879,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { %load2 = load <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i32> %tmp4 } @@ -891,7 +891,7 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { %load2 = load <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i64> %tmp4 } @@ -902,7 +902,7 @@ define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x 
i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) ret <4 x i32> %tmp4 } @@ -913,7 +913,7 @@ define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) ret <2 x i64> %tmp4 } @@ -924,7 +924,7 @@ define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) ret <4 x i32> %tmp4 } @@ -935,7 +935,7 @@ define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) ret <2 x i64> %tmp4 } @@ -947,7 +947,7 @@ define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) %tmp6 = add <4 x i32> %tmp3, %tmp5 ret <4 x i32> %tmp6 } @@ -960,7 +960,7 @@ define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) %tmp6 = add <2 x i64> %tmp3, %tmp5 ret <2 x i64> %tmp6 } @@ -973,8 +973,8 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) n %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } @@ -986,8 +986,8 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> 
%tmp5) ret <2 x i64> %tmp6 } @@ -1000,8 +1000,8 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) %tmp3 = load <4 x i32>* %C %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } @@ -1014,8 +1014,8 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) %tmp3 = load <2 x i64>* %C %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } @@ -1024,45 +1024,45 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { ;CHECK: sqdmlal.4s %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> - %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) + %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) %prod = extractelement <4 x i32> %prod.vec, i32 0 - %res = call i32 @llvm.arm64.neon.sqadd.i32(i32 %A, i32 %prod) + %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod) ret i32 %res } -declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { ;CHECK-LABEL: sqdmlsl_lane_1s: ;CHECK: sqdmlsl.4s %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> - %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) + %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) %prod = extractelement <4 x i32> %prod.vec, i32 0 - %res = call i32 @llvm.arm64.neon.sqsub.i32(i32 %A, i32 %prod) + %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod) ret i32 %res } -declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) +declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: sqdmlal_lane_1d: ;CHECK: sqdmlal.s %rhs = extractelement <2 x i32> %C, i32 1 - %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) - %res = call i64 @llvm.arm64.neon.sqadd.i64(i64 %A, i64 %prod) + %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) + %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod) ret i64 %res } -declare i64 @llvm.arm64.neon.sqdmulls.scalar(i32, i32) -declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) +declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32) +declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { ;CHECK-LABEL: sqdmlsl_lane_1d: ;CHECK: sqdmlsl.s %rhs = extractelement <2 x 
i32> %C, i32 1 - %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) - %res = call i64 @llvm.arm64.neon.sqsub.i64(i64 %A, i64 %prod) + %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) + %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod) ret i64 %res } -declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) +declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { @@ -1073,7 +1073,7 @@ define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) %tmp6 = add <4 x i32> %tmp3, %tmp5 ret <4 x i32> %tmp6 } @@ -1086,7 +1086,7 @@ define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) %tmp6 = add <2 x i64> %tmp3, %tmp5 ret <2 x i64> %tmp6 } @@ -1100,7 +1100,7 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) %tmp6 = sub <4 x i32> %tmp3, %tmp5 ret <4 x i32> %tmp6 } @@ -1113,7 +1113,7 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) %tmp6 = sub <2 x i64> %tmp3, %tmp5 ret <2 x i64> %tmp6 } @@ -1126,8 +1126,8 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) n %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } @@ -1139,8 +1139,8 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } @@ -1153,8 +1153,8 
@@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) %tmp3 = load <4 x i32>* %C %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } @@ -1167,8 +1167,8 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) %tmp3 = load <2 x i64>* %C %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } @@ -1180,7 +1180,7 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou %tmp2 = load <4 x i16>* %B %tmp3 = load <4 x i32>* %C %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) + %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) %tmp6 = sub <4 x i32> %tmp3, %tmp5 ret <4 x i32> %tmp6 } @@ -1193,7 +1193,7 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou %tmp2 = load <2 x i32>* %B %tmp3 = load <2 x i64>* %C %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) + %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) %tmp6 = sub <2 x i64> %tmp3, %tmp5 ret <2 x i64> %tmp6 } @@ -1202,7 +1202,7 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou define float @fmulxs(float %a, float %b) nounwind { ; CHECK-LABEL: fmulxs: ; CHECKNEXT: fmulx s0, s0, s1 - %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind + %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind ; CHECKNEXT: ret ret float %fmulx.i } @@ -1210,7 +1210,7 @@ define float @fmulxs(float %a, float %b) nounwind { define double @fmulxd(double %a, double %b) nounwind { ; CHECK-LABEL: fmulxd: ; CHECKNEXT: fmulx d0, d0, d1 - %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind + %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind ; CHECKNEXT: ret ret double %fmulx.i } @@ -1219,7 +1219,7 @@ define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind { ; CHECK-LABEL: fmulxs_lane: ; CHECKNEXT: fmulx.s s0, s0, v1[3] %b = extractelement <4 x float> %vec, i32 3 - %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind + %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind ; CHECKNEXT: ret ret float %fmulx.i } @@ -1228,13 +1228,13 @@ define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind { ; CHECK-LABEL: fmulxd_lane: ; CHECKNEXT: fmulx d0, d0, 
v1[1] %b = extractelement <2 x double> %vec, i32 1 - %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind + %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind ; CHECKNEXT: ret ret double %fmulx.i } -declare double @llvm.arm64.neon.fmulx.f64(double, double) nounwind readnone -declare float @llvm.arm64.neon.fmulx.f32(float, float) nounwind readnone +declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone +declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { @@ -1243,7 +1243,7 @@ define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-NEXT: ret %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %3 = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2 + %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2 ret <8 x i16> %3 } @@ -1256,7 +1256,7 @@ define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind { %tmp2 = bitcast <16 x i8> %b to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8> - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind ret <8 x i16> %vmull.i.i } @@ -1269,7 +1269,7 @@ define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind { %tmp2 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind ret <4 x i32> %vmull2.i.i } @@ -1282,7 +1282,7 @@ define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { %tmp2 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind ret <2 x i64> %vmull2.i.i } @@ -1295,7 +1295,7 @@ define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { %tmp2 = bitcast <16 x i8> %b to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8> - %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind + %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind ret <8 x i16> %vmull.i.i } @@ -1308,7 +1308,7 @@ define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { %tmp2 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind ret <4 x i32> %vmull2.i.i } @@ 
-1321,7 +1321,7 @@ define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { %tmp2 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind ret <2 x i64> %vmull2.i.i } @@ -1334,7 +1334,7 @@ entry: %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind ret <4 x i32> %vmull2.i } @@ -1347,7 +1347,7 @@ entry: %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind ret <2 x i64> %vmull2.i } @@ -1360,7 +1360,7 @@ entry: %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind ret <4 x i32> %vmull2.i } @@ -1373,7 +1373,7 @@ entry: %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind ret <2 x i64> %vmull2.i } @@ -1388,7 +1388,7 @@ define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { %tmp2 = bitcast <16 x i8> %c to <2 x i64> %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind + %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind %add.i = add <8 x i16> %vmull.i.i.i, %a ret <8 x i16> %add.i } @@ -1404,7 +1404,7 @@ define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { %tmp2 = bitcast <8 x i16> %c to <2 x i64> %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind %add.i = add <4 x i32> %vmull2.i.i.i, %a ret <4 x i32> %add.i } @@ -1420,7 +1420,7 @@ define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { %tmp2 = bitcast <4 x i32> %c to <2 x i64> 
%shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind %add.i = add <2 x i64> %vmull2.i.i.i, %a ret <2 x i64> %add.i } @@ -1436,7 +1436,7 @@ define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { %tmp2 = bitcast <16 x i8> %c to <2 x i64> %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind + %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind %add.i = add <8 x i16> %vmull.i.i.i, %a ret <8 x i16> %add.i } @@ -1452,7 +1452,7 @@ define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { %tmp2 = bitcast <8 x i16> %c to <2 x i64> %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind + %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind %add.i = add <4 x i32> %vmull2.i.i.i, %a ret <4 x i32> %add.i } @@ -1468,7 +1468,7 @@ define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { %tmp2 = bitcast <4 x i32> %c to <2 x i64> %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind + %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind %add.i = add <2 x i64> %vmull2.i.i.i, %a ret <2 x i64> %add.i } @@ -1484,7 +1484,7 @@ define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind %add = add <4 x i32> %vmull2.i.i, %a ret <4 x i32> %add } @@ -1500,7 +1500,7 @@ define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind %add = add <2 x i64> %vmull2.i.i, %a ret <2 x i64> %add } @@ -1517,7 +1517,7 @@ define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> 
%tmp3) nounwind + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind %add = add <4 x i32> %vmull2.i.i, %a ret <4 x i32> %add } @@ -1533,7 +1533,7 @@ define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64> %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind %add = add <2 x i64> %vmull2.i.i, %a ret <2 x i64> %add } @@ -1631,7 +1631,7 @@ entry: ; CHECK: smull.4s v0, v0, v1[6] ; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 ret <4 x i32> %vmull2.i } @@ -1642,7 +1642,7 @@ entry: ; CHECK: smull.2d v0, v0, v1[2] ; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 ret <2 x i64> %vmull2.i } define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { @@ -1652,7 +1652,7 @@ entry: ; CHECK: umull.4s v0, v0, v1[6] ; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 ret <4 x i32> %vmull2.i } @@ -1663,7 +1663,7 @@ entry: ; CHECK: umull.2d v0, v0, v1[2] ; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 ret <2 x i64> %vmull2.i } @@ -1681,7 +1681,7 @@ entry: %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind ret <4 x i32> %vmull2.i.i } @@ -1696,7 +1696,7 @@ entry: %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1 - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind ret <2 x i64> %vmull2.i.i } @@ -1714,7 +1714,7 @@ entry: %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 - %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 
x i16> %1, <4 x i16> %vecinit3.i) nounwind + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind ret <4 x i32> %vmull2.i.i } @@ -1729,7 +1729,7 @@ entry: %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1 - %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind + %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind ret <2 x i64> %vmull2.i.i } @@ -1787,7 +1787,7 @@ define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind ret <2 x i64> %res } @@ -1799,8 +1799,8 @@ define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind - %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) ret <2 x i64> %sum } @@ -1813,7 +1813,7 @@ define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind ret <2 x i64> %res } @@ -1826,7 +1826,7 @@ define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) { %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> - %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind + %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind ret <8 x i16> %res } @@ -1838,7 +1838,7 @@ define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) { %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> - %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind + %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind ret <8 x i16> %res } @@ -1850,7 +1850,7 @@ define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) { %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind ret <2 x i64> %res } @@ -1862,8 +1862,8 @@ 
define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind - %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) ret <2 x i64> %sum } @@ -1875,7 +1875,7 @@ define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, < %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> - %res = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind %sum = add <2 x i64> %accum, %res ret <2 x i64> %sum } @@ -1997,23 +1997,23 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind { ;CHECK-LABEL: sqdmlal_d: ;CHECK: sqdmlal - %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B) - %tmp5 = call i64 @llvm.arm64.neon.sqadd.i64(i64 %C, i64 %tmp4) + %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) + %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4) ret i64 %tmp5 } define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { ;CHECK-LABEL: sqdmlsl_d: ;CHECK: sqdmlsl - %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B) - %tmp5 = call i64 @llvm.arm64.neon.sqsub.i64(i64 %C, i64 %tmp4) + %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) + %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4) ret i64 %tmp5 } define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { ; CHECK-LABEL: test_pmull_64: ; CHECK: pmull.1q - %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l, i64 %r) + %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) ret <16 x i8> %val } @@ -2022,11 +2022,11 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { ; CHECK: pmull2.1q %l_hi = extractelement <2 x i64> %l, i32 1 %r_hi = extractelement <2 x i64> %r, i32 1 - %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l_hi, i64 %r_hi) + %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi) ret <16 x i8> %val } -declare <16 x i8> @llvm.arm64.neon.pmull64(i64, i64) +declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind { ; CHECK-LABEL: test_mul_v1i64: diff --git a/test/CodeGen/ARM64/volatile.ll b/test/CodeGen/AArch64/arm64-volatile.ll similarity index 100% rename from test/CodeGen/ARM64/volatile.ll rename to test/CodeGen/AArch64/arm64-volatile.ll diff --git a/test/CodeGen/ARM64/vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll similarity index 100% rename from test/CodeGen/ARM64/vpopcnt.ll rename to test/CodeGen/AArch64/arm64-vpopcnt.ll diff --git a/test/CodeGen/ARM64/vqadd.ll b/test/CodeGen/AArch64/arm64-vqadd.ll similarity index 50% rename from test/CodeGen/ARM64/vqadd.ll rename to test/CodeGen/AArch64/arm64-vqadd.ll index 0b7f7e53105e..20f7e2c7a893 100644 --- a/test/CodeGen/ARM64/vqadd.ll +++ 
b/test/CodeGen/AArch64/arm64-vqadd.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: sqadd8b: ;CHECK: sqadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @sqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sqadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @sqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sqadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -32,7 +32,7 @@ define <8 x i8> @uqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: uqadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -41,7 +41,7 @@ define <4 x i16> @uqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uqadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -50,7 +50,7 @@ define <2 x i32> @uqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uqadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -59,7 +59,7 @@ define <16 x i8> @sqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: sqadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -68,7 +68,7 @@ define <8 x i16> @sqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sqadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -77,7 +77,7 @@ define <4 x i32> @sqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sqadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -86,7 +86,7 @@ define <2 x i64> @sqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: sqadd.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 
x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -95,7 +95,7 @@ define <16 x i8> @uqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uqadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -104,7 +104,7 @@ define <8 x i16> @uqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uqadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -113,7 +113,7 @@ define <4 x i32> @uqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uqadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -122,36 +122,36 @@ define <2 x i64> @uqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: uqadd.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind 
readnone +declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i8> @usqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: usqadd8b: ;CHECK: usqadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -160,7 +160,7 @@ define <4 x i16> @usqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: usqadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -169,7 +169,7 @@ define <2 x i32> @usqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: usqadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -178,7 +178,7 @@ define <16 x i8> @usqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: usqadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -187,7 +187,7 @@ define <8 x i16> @usqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: usqadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -196,7 +196,7 @@ define <4 x i32> @usqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: usqadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -205,42 +205,42 @@ define <2 x i64> @usqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: usqadd.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } define i64 @usqadd_d(i64 %l, i64 %r) nounwind { ; CHECK-LABEL: usqadd_d: ; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}} - %sum = 
call i64 @llvm.arm64.neon.usqadd.i64(i64 %l, i64 %r) + %sum = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %l, i64 %r) ret i64 %sum } define i32 @usqadd_s(i32 %l, i32 %r) nounwind { ; CHECK-LABEL: usqadd_s: ; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}} - %sum = call i32 @llvm.arm64.neon.usqadd.i32(i32 %l, i32 %r) + %sum = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %l, i32 %r) ret i32 %sum } -declare <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare i64 @llvm.arm64.neon.usqadd.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.neon.usqadd.i32(i32, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare i64 @llvm.aarch64.neon.usqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.usqadd.i32(i32, i32) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i8> @suqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: suqadd8b: ;CHECK: suqadd.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -249,7 +249,7 @@ define <4 x i16> @suqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: suqadd.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -258,7 +258,7 @@ define <2 x i32> @suqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: suqadd.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -267,7 +267,7 @@ define <16 x i8> @suqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: suqadd.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -276,7 
+276,7 @@ define <8 x i16> @suqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: suqadd.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -285,7 +285,7 @@ define <4 x i32> @suqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: suqadd.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -294,39 +294,39 @@ define <2 x i64> @suqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: suqadd.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } define <1 x i64> @suqadd_1d(<1 x i64> %l, <1 x i64> %r) nounwind { ; CHECK-LABEL: suqadd_1d: ; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}} - %sum = call <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r) + %sum = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r) ret <1 x i64> %sum } define i64 @suqadd_d(i64 %l, i64 %r) nounwind { ; CHECK-LABEL: suqadd_d: ; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}} - %sum = call i64 @llvm.arm64.neon.suqadd.i64(i64 %l, i64 %r) + %sum = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %l, i64 %r) ret i64 %sum } define i32 @suqadd_s(i32 %l, i32 %r) nounwind { ; CHECK-LABEL: suqadd_s: ; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}} - %sum = call i32 @llvm.arm64.neon.suqadd.i32(i32 %l, i32 %r) + %sum = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %l, i32 %r) ret i32 %sum } -declare <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare i64 @llvm.arm64.neon.suqadd.i64(i64, i64) nounwind readnone -declare i32 @llvm.arm64.neon.suqadd.i32(i32, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare i64 @llvm.aarch64.neon.suqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.suqadd.i32(i32, i32) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> 
@llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vqsub.ll b/test/CodeGen/AArch64/arm64-vqsub.ll
new file mode 100644
index 000000000000..dde3ac3478e4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vqsub.ll
@@ -0,0 +1,147 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub8b:
+;CHECK: sqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub4h:
+;CHECK: sqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub2s:
+;CHECK: sqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub8b:
+;CHECK: uqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub4h:
+;CHECK: uqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub2s:
+;CHECK: uqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub16b:
+;CHECK: sqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub8h:
+;CHECK: sqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub4s:
+;CHECK: sqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqsub2d:
+;CHECK: sqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub16b:
+;CHECK: uqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub8h:
+;CHECK: uqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub4s:
+;CHECK: uqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqsub2d:
+;CHECK: uqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM64/vselect.ll b/test/CodeGen/AArch64/arm64-vselect.ll
similarity index 89%
rename from test/CodeGen/ARM64/vselect.ll
rename to test/CodeGen/AArch64/arm64-vselect.ll
index aa8e81eb709e..9988512f530e 100644
--- a/test/CodeGen/ARM64/vselect.ll
+++ b/test/CodeGen/AArch64/arm64-vselect.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

 ;CHECK: @func63
 ;CHECK: cmeq.4h v0, v0, v1
diff --git a/test/CodeGen/ARM64/vsetcc_fp.ll b/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
similarity index 80%
rename from test/CodeGen/ARM64/vsetcc_fp.ll
rename to test/CodeGen/AArch64/arm64-vsetcc_fp.ll
index c93aad5c4ee0..f4f4714dde4d 100644
--- a/test/CodeGen/ARM64/vsetcc_fp.ll
+++ b/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 define <2 x i32> @fcmp_one(<2 x float> %x, <2 x float> %y) nounwind optsize readnone {
 ; CHECK-LABEL: fcmp_one:
 ; CHECK-NEXT: fcmgt.2s [[REG:v[0-9]+]], v0, v1
diff --git a/test/CodeGen/ARM64/vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
similarity index 68%
rename from
test/CodeGen/ARM64/vshift.ll rename to test/CodeGen/AArch64/arm64-vshift.ll index 486c6cc390b1..65bd50cbe9d0 100644 --- a/test/CodeGen/ARM64/vshift.ll +++ b/test/CodeGen/AArch64/arm64-vshift.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -enable-misched=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s define <8 x i8> @sqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: sqshl8b: ;CHECK: sqshl.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @sqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sqshl.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @sqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sqshl.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -32,7 +32,7 @@ define <8 x i8> @uqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: uqshl.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -41,7 +41,7 @@ define <4 x i16> @uqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uqshl.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -50,7 +50,7 @@ define <2 x i32> @uqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uqshl.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -59,7 +59,7 @@ define <16 x i8> @sqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: sqshl.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -68,7 +68,7 @@ define <8 x i16> @sqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sqshl.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -77,7 +77,7 @@ define <4 x i32> @sqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sqshl.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> 
%tmp2) ret <4 x i32> %tmp3 } @@ -86,7 +86,7 @@ define <2 x i64> @sqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: sqshl.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -95,7 +95,7 @@ define <16 x i8> @uqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uqshl.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -104,7 +104,7 @@ define <8 x i16> @uqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uqshl.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -113,7 +113,7 @@ define <4 x i32> @uqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uqshl.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -122,36 +122,36 @@ define <2 x i64> @uqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: uqshl.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64>, <2 x 
i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i8> @srshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: srshl8b: ;CHECK: srshl.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -160,7 +160,7 @@ define <4 x i16> @srshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: srshl.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -169,7 +169,7 @@ define <2 x i32> @srshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: srshl.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -178,7 +178,7 @@ define <8 x i8> @urshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: urshl.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -187,7 +187,7 @@ define <4 x i16> @urshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: urshl.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -196,7 +196,7 @@ define <2 x i32> @urshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: urshl.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -205,7 +205,7 @@ define <16 x i8> @srshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: srshl.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> 
%tmp2) ret <16 x i8> %tmp3 } @@ -214,7 +214,7 @@ define <8 x i16> @srshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: srshl.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -223,7 +223,7 @@ define <4 x i32> @srshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: srshl.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -232,7 +232,7 @@ define <2 x i64> @srshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: srshl.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -241,7 +241,7 @@ define <16 x i8> @urshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: urshl.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -250,7 +250,7 @@ define <8 x i16> @urshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: urshl.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -259,7 +259,7 @@ define <4 x i32> @urshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: urshl.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -268,36 +268,36 @@ define <2 x i64> @urshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: urshl.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.urshl.v1i64(<1 
x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i8> @sqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: sqrshl8b: ;CHECK: sqrshl.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -306,7 +306,7 @@ define <4 x i16> @sqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sqrshl.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -315,7 +315,7 @@ define <2 x i32> @sqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sqrshl.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -324,7 +324,7 @@ define <8 x i8> @uqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: uqrshl.8b %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -333,7 +333,7 @@ define <4 x i16> @uqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: uqrshl.4h %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x 
i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -342,7 +342,7 @@ define <2 x i32> @uqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: uqrshl.2s %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -351,7 +351,7 @@ define <16 x i8> @sqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: sqrshl.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -360,7 +360,7 @@ define <8 x i16> @sqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sqrshl.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -369,7 +369,7 @@ define <4 x i32> @sqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sqrshl.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -378,7 +378,7 @@ define <2 x i64> @sqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: sqrshl.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -387,7 +387,7 @@ define <16 x i8> @uqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: uqrshl.16b %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -396,7 +396,7 @@ define <8 x i16> @uqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: uqrshl.8h %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -405,7 +405,7 @@ define <4 x i32> @uqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: uqrshl.4s %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -414,35 +414,35 @@ define <2 x i64> @uqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: uqrshl.2d %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32>, <2 x 
i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i8> @urshr8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: urshr8b: ;CHECK: urshr.8b %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) ret <8 x i8> %tmp3 } @@ -450,7 +450,7 @@ define <4 x i16> @urshr4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: urshr4h: ;CHECK: urshr.4h %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) ret <4 x i16> %tmp3 } @@ -458,7 +458,7 @@ define <2 x i32> @urshr2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: urshr2s: ;CHECK: urshr.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x 
i32> ) ret <2 x i32> %tmp3 } @@ -466,7 +466,7 @@ define <16 x i8> @urshr16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: urshr16b: ;CHECK: urshr.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) ret <16 x i8> %tmp3 } @@ -474,7 +474,7 @@ define <8 x i16> @urshr8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: urshr8h: ;CHECK: urshr.8h %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) ret <8 x i16> %tmp3 } @@ -482,7 +482,7 @@ define <4 x i32> @urshr4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: urshr4s: ;CHECK: urshr.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) ret <4 x i32> %tmp3 } @@ -490,7 +490,7 @@ define <2 x i64> @urshr2d(<2 x i64>* %A) nounwind { ;CHECK-LABEL: urshr2d: ;CHECK: urshr.2d %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) ret <2 x i64> %tmp3 } @@ -498,7 +498,7 @@ define <8 x i8> @srshr8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: srshr8b: ;CHECK: srshr.8b %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) ret <8 x i8> %tmp3 } @@ -506,7 +506,7 @@ define <4 x i16> @srshr4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: srshr4h: ;CHECK: srshr.4h %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) ret <4 x i16> %tmp3 } @@ -514,7 +514,7 @@ define <2 x i32> @srshr2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: srshr2s: ;CHECK: srshr.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) ret <2 x i32> %tmp3 } @@ -522,7 +522,7 @@ define <16 x i8> @srshr16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: srshr16b: ;CHECK: srshr.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) ret <16 x i8> %tmp3 } @@ -530,7 +530,7 @@ define <8 x i16> @srshr8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: srshr8h: ;CHECK: srshr.8h %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) ret <8 x i16> %tmp3 } @@ -538,7 +538,7 @@ define <4 x i32> @srshr4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: srshr4s: ;CHECK: srshr.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) ret <4 x i32> %tmp3 } @@ -546,7 +546,7 @@ define <2 x i64> @srshr2d(<2 x i64>* %A) nounwind { ;CHECK-LABEL: srshr2d: ;CHECK: srshr.2d %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) + %tmp3 = call <2 x 
i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) ret <2 x i64> %tmp3 } @@ -554,7 +554,7 @@ define <8 x i8> @sqshlu8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: sqshlu8b: ;CHECK: sqshlu.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> ) ret <8 x i8> %tmp3 } @@ -562,7 +562,7 @@ define <4 x i16> @sqshlu4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: sqshlu4h: ;CHECK: sqshlu.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> ) ret <4 x i16> %tmp3 } @@ -570,7 +570,7 @@ define <2 x i32> @sqshlu2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: sqshlu2s: ;CHECK: sqshlu.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> ) ret <2 x i32> %tmp3 } @@ -578,7 +578,7 @@ define <16 x i8> @sqshlu16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: sqshlu16b: ;CHECK: sqshlu.16b v0, {{v[0-9]+}}, #1 %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> ) ret <16 x i8> %tmp3 } @@ -586,7 +586,7 @@ define <8 x i16> @sqshlu8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqshlu8h: ;CHECK: sqshlu.8h v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> ) ret <8 x i16> %tmp3 } @@ -594,7 +594,7 @@ define <4 x i32> @sqshlu4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqshlu4s: ;CHECK: sqshlu.4s v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> ) ret <4 x i32> %tmp3 } @@ -602,25 +602,25 @@ define <2 x i64> @sqshlu2d(<2 x i64>* %A) nounwind { ;CHECK-LABEL: sqshlu2d: ;CHECK: sqshlu.2d v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> ) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> ) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x 
i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i8> @rshrn8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: rshrn8b: ;CHECK: rshrn.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -628,7 +628,7 @@ define <4 x i16> @rshrn4h(<4 x i32>* %A) nounwind { ;CHECK-LABEL: rshrn4h: ;CHECK: rshrn.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } @@ -636,7 +636,7 @@ define <2 x i32> @rshrn2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: rshrn2s: ;CHECK: rshrn.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1) ret <2 x i32> %tmp3 } @@ -645,7 +645,7 @@ define <16 x i8> @rshrn16b(<8 x i8> *%ret, <8 x i16>* %A) nounwind { ;CHECK: rshrn2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -655,7 +655,7 @@ define <8 x i16> @rshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: rshrn2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -665,14 +665,14 @@ define <4 x i32> @rshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: rshrn2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone define <8 x i8> @shrn8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: shrn8b: @@ -734,14 +734,14 @@ define <4 x i32> @shrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ret <4 x i32> %tmp4 } -declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone -declare 
<2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone define i32 @sqshrn1s(i64 %A) nounwind { ; CHECK-LABEL: sqshrn1s: ; CHECK: sqshrn {{s[0-9]+}}, d0, #1 - %tmp = call i32 @llvm.arm64.neon.sqshrn.i32(i64 %A, i32 1) + %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1) ret i32 %tmp } @@ -749,7 +749,7 @@ define <8 x i8> @sqshrn8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqshrn8b: ;CHECK: sqshrn.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -757,7 +757,7 @@ define <4 x i16> @sqshrn4h(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqshrn4h: ;CHECK: sqshrn.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } @@ -765,7 +765,7 @@ define <2 x i32> @sqshrn2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: sqshrn2s: ;CHECK: sqshrn.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1) ret <2 x i32> %tmp3 } @@ -775,7 +775,7 @@ define <16 x i8> @sqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind { ;CHECK: sqshrn2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -785,7 +785,7 @@ define <8 x i16> @sqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: sqshrn2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -795,20 +795,20 @@ define <4 x i32> @sqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: sqshrn2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare i32 @llvm.arm64.neon.sqshrn.i32(i64, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.sqshrn.i32(i64, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone define i32 @sqshrun1s(i64 %A) nounwind 
{ ; CHECK-LABEL: sqshrun1s: ; CHECK: sqshrun {{s[0-9]+}}, d0, #1 - %tmp = call i32 @llvm.arm64.neon.sqshrun.i32(i64 %A, i32 1) + %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1) ret i32 %tmp } @@ -816,7 +816,7 @@ define <8 x i8> @sqshrun8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqshrun8b: ;CHECK: sqshrun.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -824,7 +824,7 @@ define <4 x i16> @sqshrun4h(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqshrun4h: ;CHECK: sqshrun.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } @@ -832,7 +832,7 @@ define <2 x i32> @sqshrun2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: sqshrun2s: ;CHECK: sqshrun.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1) ret <2 x i32> %tmp3 } @@ -841,7 +841,7 @@ define <16 x i8> @sqshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind { ;CHECK: sqshrun2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -851,7 +851,7 @@ define <8 x i16> @sqshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: sqshrun2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -861,20 +861,20 @@ define <4 x i32> @sqshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: sqshrun2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare i32 @llvm.arm64.neon.sqshrun.i32(i64, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.sqshrun.i32(i64, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone define i32 @sqrshrn1s(i64 %A) nounwind { ; CHECK-LABEL: sqrshrn1s: ; CHECK: sqrshrn {{s[0-9]+}}, d0, #1 - %tmp = call i32 @llvm.arm64.neon.sqrshrn.i32(i64 %A, i32 1) + %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1) ret i32 %tmp } @@ -882,7 +882,7 @@ define <8 x i8> @sqrshrn8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqrshrn8b: ;CHECK: 
sqrshrn.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -890,7 +890,7 @@ define <4 x i16> @sqrshrn4h(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqrshrn4h: ;CHECK: sqrshrn.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } @@ -898,7 +898,7 @@ define <2 x i32> @sqrshrn2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: sqrshrn2s: ;CHECK: sqrshrn.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1) ret <2 x i32> %tmp3 } @@ -907,7 +907,7 @@ define <16 x i8> @sqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind { ;CHECK: sqrshrn2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -917,7 +917,7 @@ define <8 x i16> @sqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: sqrshrn2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -927,20 +927,20 @@ define <4 x i32> @sqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: sqrshrn2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare i32 @llvm.arm64.neon.sqrshrn.i32(i64, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.sqrshrn.i32(i64, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone define i32 @sqrshrun1s(i64 %A) nounwind { ; CHECK-LABEL: sqrshrun1s: ; CHECK: sqrshrun {{s[0-9]+}}, d0, #1 - %tmp = call i32 @llvm.arm64.neon.sqrshrun.i32(i64 %A, i32 1) + %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1) ret i32 %tmp } @@ -948,7 +948,7 @@ define <8 x i8> @sqrshrun8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqrshrun8b: ;CHECK: sqrshrun.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -956,7 +956,7 @@ define <4 x i16> @sqrshrun4h(<4 x i32>* %A) 
nounwind { ;CHECK-LABEL: sqrshrun4h: ;CHECK: sqrshrun.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } @@ -964,7 +964,7 @@ define <2 x i32> @sqrshrun2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: sqrshrun2s: ;CHECK: sqrshrun.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1) ret <2 x i32> %tmp3 } @@ -973,7 +973,7 @@ define <16 x i8> @sqrshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind { ;CHECK: sqrshrun2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -983,7 +983,7 @@ define <8 x i16> @sqrshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: sqrshrun2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -993,20 +993,20 @@ define <4 x i32> @sqrshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: sqrshrun2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare i32 @llvm.arm64.neon.sqrshrun.i32(i64, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.sqrshrun.i32(i64, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone define i32 @uqrshrn1s(i64 %A) nounwind { ; CHECK-LABEL: uqrshrn1s: ; CHECK: uqrshrn {{s[0-9]+}}, d0, #1 - %tmp = call i32 @llvm.arm64.neon.uqrshrn.i32(i64 %A, i32 1) + %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1) ret i32 %tmp } @@ -1014,7 +1014,7 @@ define <8 x i8> @uqrshrn8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: uqrshrn8b: ;CHECK: uqrshrn.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -1022,7 +1022,7 @@ define <4 x i16> @uqrshrn4h(<4 x i32>* %A) nounwind { ;CHECK-LABEL: uqrshrn4h: ;CHECK: uqrshrn.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } 
@@ -1030,7 +1030,7 @@ define <2 x i32> @uqrshrn2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: uqrshrn2s: ;CHECK: uqrshrn.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1) ret <2 x i32> %tmp3 } @@ -1039,7 +1039,7 @@ define <16 x i8> @uqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind { ;CHECK: uqrshrn2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -1049,7 +1049,7 @@ define <8 x i16> @uqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: uqrshrn2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -1059,20 +1059,20 @@ define <4 x i32> @uqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: uqrshrn2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare i32 @llvm.arm64.neon.uqrshrn.i32(i64, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.uqrshrn.i32(i64, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone define i32 @uqshrn1s(i64 %A) nounwind { ; CHECK-LABEL: uqshrn1s: ; CHECK: uqshrn {{s[0-9]+}}, d0, #1 - %tmp = call i32 @llvm.arm64.neon.uqshrn.i32(i64 %A, i32 1) + %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1) ret i32 %tmp } @@ -1080,7 +1080,7 @@ define <8 x i8> @uqshrn8b(<8 x i16>* %A) nounwind { ;CHECK-LABEL: uqshrn8b: ;CHECK: uqshrn.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1) ret <8 x i8> %tmp3 } @@ -1088,7 +1088,7 @@ define <4 x i16> @uqshrn4h(<4 x i32>* %A) nounwind { ;CHECK-LABEL: uqshrn4h: ;CHECK: uqshrn.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1) ret <4 x i16> %tmp3 } @@ -1096,7 +1096,7 @@ define <2 x i32> @uqshrn2s(<2 x i64>* %A) nounwind { ;CHECK-LABEL: uqshrn2s: ;CHECK: uqshrn.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 
1) ret <2 x i32> %tmp3 } @@ -1105,7 +1105,7 @@ define <16 x i8> @uqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind { ;CHECK: uqshrn2.16b v0, {{v[0-9]+}}, #1 %out = load <8 x i8>* %ret %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1) %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> ret <16 x i8> %tmp4 } @@ -1115,7 +1115,7 @@ define <8 x i16> @uqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind { ;CHECK: uqshrn2.8h v0, {{v[0-9]+}}, #1 %out = load <4 x i16>* %ret %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1) %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> ret <8 x i16> %tmp4 } @@ -1125,15 +1125,15 @@ define <4 x i32> @uqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind { ;CHECK: uqshrn2.4s v0, {{v[0-9]+}}, #1 %out = load <2 x i32>* %ret %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1) %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> ret <4 x i32> %tmp4 } -declare i32 @llvm.arm64.neon.uqshrn.i32(i64, i32) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone +declare i32 @llvm.aarch64.neon.uqshrn.i32(i64, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone define <8 x i16> @ushll8h(<8 x i8>* %A) nounwind { ;CHECK-LABEL: ushll8h: @@ -1253,7 +1253,7 @@ define <8 x i8> @sqshli8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: sqshli8b: ;CHECK: sqshl.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) ret <8 x i8> %tmp3 } @@ -1261,7 +1261,7 @@ define <4 x i16> @sqshli4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: sqshli4h: ;CHECK: sqshl.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) ret <4 x i16> %tmp3 } @@ -1269,7 +1269,7 @@ define <2 x i32> @sqshli2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: sqshli2s: ;CHECK: sqshl.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) ret <2 x i32> %tmp3 } @@ -1277,7 +1277,7 @@ define <16 x i8> @sqshli16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: sqshli16b: ;CHECK: sqshl.16b v0, {{v[0-9]+}}, #1 %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) ret <16 x i8> %tmp3 } @@ -1285,7 +1285,7 @@ define <8 x i16> @sqshli8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sqshli8h: ;CHECK: sqshl.8h 
v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) ret <8 x i16> %tmp3 } @@ -1293,7 +1293,7 @@ define <4 x i32> @sqshli4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sqshli4s: ;CHECK: sqshl.4s v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) ret <4 x i32> %tmp3 } @@ -1301,7 +1301,7 @@ define <2 x i64> @sqshli2d(<2 x i64>* %A) nounwind { ;CHECK-LABEL: sqshli2d: ;CHECK: sqshl.2d v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) ret <2 x i64> %tmp3 } @@ -1309,7 +1309,16 @@ define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind { ;CHECK-LABEL: uqshli8b: ;CHECK: uqshl.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + ret <8 x i8> %tmp3 +} + +define <8 x i8> @uqshli8b_1(<8 x i8>* %A) nounwind { +;CHECK-LABEL: uqshli8b_1: +;CHECK: movi.8b [[REG:v[0-9]+]], #0x8 +;CHECK: uqshl.8b v0, v0, [[REG]] + %tmp1 = load <8 x i8>* %A + %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) ret <8 x i8> %tmp3 } @@ -1317,7 +1326,7 @@ define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind { ;CHECK-LABEL: uqshli4h: ;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) ret <4 x i16> %tmp3 } @@ -1325,7 +1334,7 @@ define <2 x i32> @uqshli2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: uqshli2s: ;CHECK: uqshl.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) ret <2 x i32> %tmp3 } @@ -1333,7 +1342,7 @@ define <16 x i8> @uqshli16b(<16 x i8>* %A) nounwind { ;CHECK-LABEL: uqshli16b: ;CHECK: uqshl.16b %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) ret <16 x i8> %tmp3 } @@ -1341,7 +1350,7 @@ define <8 x i16> @uqshli8h(<8 x i16>* %A) nounwind { ;CHECK-LABEL: uqshli8h: ;CHECK: uqshl.8h v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) ret <8 x i16> %tmp3 } @@ -1349,7 +1358,7 @@ define <4 x i32> @uqshli4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: uqshli4s: ;CHECK: uqshl.4s v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) ret <4 x i32> %tmp3 } @@ -1357,7 +1366,7 @@ define <2 x i64> @uqshli2d(<2 x i64>* %A) nounwind { ;CHECK-LABEL: uqshli2d: ;CHECK: uqshl.2d v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> 
%tmp1, <2 x i64> ) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) ret <2 x i64> %tmp3 } @@ -1365,7 +1374,7 @@ define <8 x i8> @ursra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: ursra8b: ;CHECK: ursra.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) %tmp4 = load <8 x i8>* %B %tmp5 = add <8 x i8> %tmp3, %tmp4 ret <8 x i8> %tmp5 @@ -1375,7 +1384,7 @@ define <4 x i16> @ursra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: ursra4h: ;CHECK: ursra.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) %tmp4 = load <4 x i16>* %B %tmp5 = add <4 x i16> %tmp3, %tmp4 ret <4 x i16> %tmp5 @@ -1385,7 +1394,7 @@ define <2 x i32> @ursra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK-LABEL: ursra2s: ;CHECK: ursra.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) %tmp4 = load <2 x i32>* %B %tmp5 = add <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 @@ -1395,7 +1404,7 @@ define <16 x i8> @ursra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: ursra16b: ;CHECK: ursra.16b v0, {{v[0-9]+}}, #1 %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) %tmp4 = load <16 x i8>* %B %tmp5 = add <16 x i8> %tmp3, %tmp4 ret <16 x i8> %tmp5 @@ -1405,7 +1414,7 @@ define <8 x i16> @ursra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: ursra8h: ;CHECK: ursra.8h v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) %tmp4 = load <8 x i16>* %B %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 @@ -1415,7 +1424,7 @@ define <4 x i32> @ursra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: ursra4s: ;CHECK: ursra.4s v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) %tmp4 = load <4 x i32>* %B %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 @@ -1425,7 +1434,7 @@ define <2 x i64> @ursra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK-LABEL: ursra2d: ;CHECK: ursra.2d v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) %tmp4 = load <2 x i64>* %B %tmp5 = add <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 @@ -1435,7 +1444,7 @@ define <8 x i8> @srsra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: srsra8b: ;CHECK: srsra.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> ) %tmp4 = load <8 x i8>* %B %tmp5 = add <8 x i8> %tmp3, %tmp4 ret <8 x i8> %tmp5 @@ -1445,7 +1454,7 @@ define <4 x i16> 
@srsra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK-LABEL: srsra4h: ;CHECK: srsra.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> ) %tmp4 = load <4 x i16>* %B %tmp5 = add <4 x i16> %tmp3, %tmp4 ret <4 x i16> %tmp5 @@ -1455,7 +1464,7 @@ define <2 x i32> @srsra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK-LABEL: srsra2s: ;CHECK: srsra.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> ) %tmp4 = load <2 x i32>* %B %tmp5 = add <2 x i32> %tmp3, %tmp4 ret <2 x i32> %tmp5 @@ -1465,7 +1474,7 @@ define <16 x i8> @srsra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: srsra16b: ;CHECK: srsra.16b v0, {{v[0-9]+}}, #1 %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> ) %tmp4 = load <16 x i8>* %B %tmp5 = add <16 x i8> %tmp3, %tmp4 ret <16 x i8> %tmp5 @@ -1475,7 +1484,7 @@ define <8 x i16> @srsra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: srsra8h: ;CHECK: srsra.8h v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> ) %tmp4 = load <8 x i16>* %B %tmp5 = add <8 x i16> %tmp3, %tmp4 ret <8 x i16> %tmp5 @@ -1485,7 +1494,7 @@ define <4 x i32> @srsra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: srsra4s: ;CHECK: srsra.4s v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> ) %tmp4 = load <4 x i32>* %B %tmp5 = add <4 x i32> %tmp3, %tmp4 ret <4 x i32> %tmp5 @@ -1495,7 +1504,7 @@ define <2 x i64> @srsra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK-LABEL: srsra2d: ;CHECK: srsra.2d v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A - %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> ) %tmp4 = load <2 x i64>* %B %tmp5 = add <2 x i64> %tmp3, %tmp4 ret <2 x i64> %tmp5 @@ -1831,7 +1840,7 @@ define <8 x i8> @sli8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: sli.8b v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1) ret <8 x i8> %tmp3 } @@ -1840,7 +1849,7 @@ define <4 x i16> @sli4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: sli.4h v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1) ret <4 x i16> %tmp3 } @@ -1849,7 +1858,7 @@ define <2 x i32> @sli2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: sli.2s v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1) + 
%tmp3 = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1) ret <2 x i32> %tmp3 } @@ -1858,7 +1867,7 @@ define <1 x i64> @sli1d(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: sli d0, {{d[0-9]+}}, #1 %tmp1 = load <1 x i64>* %A %tmp2 = load <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1) + %tmp3 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1) ret <1 x i64> %tmp3 } @@ -1867,7 +1876,7 @@ define <16 x i8> @sli16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: sli.16b v0, {{v[0-9]+}}, #1 %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1) + %tmp3 = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1) ret <16 x i8> %tmp3 } @@ -1876,7 +1885,7 @@ define <8 x i16> @sli8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: sli.8h v0, {{v[0-9]+}}, #1 %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1) + %tmp3 = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1) ret <8 x i16> %tmp3 } @@ -1885,7 +1894,7 @@ define <4 x i32> @sli4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: sli.4s v0, {{v[0-9]+}}, #1 %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1) ret <4 x i32> %tmp3 } @@ -1894,19 +1903,19 @@ define <2 x i64> @sli2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: sli.2d v0, {{v[0-9]+}}, #1 %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1) + %tmp3 = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone +declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone define <1 x i64> 
@ashr_v1i64(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: ashr_v1i64: diff --git a/test/CodeGen/ARM64/vshr.ll b/test/CodeGen/AArch64/arm64-vshr.ll similarity index 95% rename from test/CodeGen/ARM64/vshr.ll rename to test/CodeGen/AArch64/arm64-vshr.ll index 1da8f60acb68..21eb579f2522 100644 --- a/test/CodeGen/ARM64/vshr.ll +++ b/test/CodeGen/AArch64/arm64-vshr.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s +; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s define <8 x i16> @testShiftRightArith_v8i16(<8 x i16> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: testShiftRightArith_v8i16: diff --git a/test/CodeGen/ARM64/vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll similarity index 100% rename from test/CodeGen/ARM64/vshuffle.ll rename to test/CodeGen/AArch64/arm64-vshuffle.ll diff --git a/test/CodeGen/ARM64/vsqrt.ll b/test/CodeGen/AArch64/arm64-vsqrt.ll similarity index 51% rename from test/CodeGen/ARM64/vsqrt.ll rename to test/CodeGen/AArch64/arm64-vsqrt.ll index 094d7042a4de..02b7c7ec5d80 100644 --- a/test/CodeGen/ARM64/vsqrt.ll +++ b/test/CodeGen/AArch64/arm64-vsqrt.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <2 x float> @frecps_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: frecps_2s: ;CHECK: frecps.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -14,7 +14,7 @@ define <4 x float> @frecps_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: frecps.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -23,13 +23,13 @@ define <2 x double> @frecps_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: frecps.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind { @@ -37,7 +37,7 @@ define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: frsqrts.2s %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } 
@@ -46,7 +46,7 @@ define <4 x float> @frsqrts_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: frsqrts.4s %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -55,19 +55,19 @@ define <2 x double> @frsqrts_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ;CHECK: frsqrts.2d %tmp1 = load <2 x double>* %A %tmp2 = load <2 x double>* %B - %tmp3 = call <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) + %tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) ret <2 x double> %tmp3 } -declare <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x float> @frecpe_2s(<2 x float>* %A) nounwind { ;CHECK-LABEL: frecpe_2s: ;CHECK: frecpe.2s %tmp1 = load <2 x float>* %A - %tmp3 = call <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float> %tmp1) + %tmp3 = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %tmp1) ret <2 x float> %tmp3 } @@ -75,7 +75,7 @@ define <4 x float> @frecpe_4s(<4 x float>* %A) nounwind { ;CHECK-LABEL: frecpe_4s: ;CHECK: frecpe.4s %tmp1 = load <4 x float>* %A - %tmp3 = call <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float> %tmp1) + %tmp3 = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %tmp1) ret <4 x float> %tmp3 } @@ -83,7 +83,7 @@ define <2 x double> @frecpe_2d(<2 x double>* %A) nounwind { ;CHECK-LABEL: frecpe_2d: ;CHECK: frecpe.2d %tmp1 = load <2 x double>* %A - %tmp3 = call <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double> %tmp1) + %tmp3 = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %tmp1) ret <2 x double> %tmp3 } @@ -91,7 +91,7 @@ define float @frecpe_s(float* %A) nounwind { ;CHECK-LABEL: frecpe_s: ;CHECK: frecpe s0, {{s[0-9]+}} %tmp1 = load float* %A - %tmp3 = call float @llvm.arm64.neon.frecpe.f32(float %tmp1) + %tmp3 = call float @llvm.aarch64.neon.frecpe.f32(float %tmp1) ret float %tmp3 } @@ -99,21 +99,21 @@ define double @frecpe_d(double* %A) nounwind { ;CHECK-LABEL: frecpe_d: ;CHECK: frecpe d0, {{d[0-9]+}} %tmp1 = load double* %A - %tmp3 = call double @llvm.arm64.neon.frecpe.f64(double %tmp1) + %tmp3 = call double @llvm.aarch64.neon.frecpe.f64(double %tmp1) ret double %tmp3 } -declare <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double>) nounwind readnone -declare float @llvm.arm64.neon.frecpe.f32(float) nounwind readnone -declare double @llvm.arm64.neon.frecpe.f64(double) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double>) 
nounwind readnone +declare float @llvm.aarch64.neon.frecpe.f32(float) nounwind readnone +declare double @llvm.aarch64.neon.frecpe.f64(double) nounwind readnone define float @frecpx_s(float* %A) nounwind { ;CHECK-LABEL: frecpx_s: ;CHECK: frecpx s0, {{s[0-9]+}} %tmp1 = load float* %A - %tmp3 = call float @llvm.arm64.neon.frecpx.f32(float %tmp1) + %tmp3 = call float @llvm.aarch64.neon.frecpx.f32(float %tmp1) ret float %tmp3 } @@ -121,18 +121,18 @@ define double @frecpx_d(double* %A) nounwind { ;CHECK-LABEL: frecpx_d: ;CHECK: frecpx d0, {{d[0-9]+}} %tmp1 = load double* %A - %tmp3 = call double @llvm.arm64.neon.frecpx.f64(double %tmp1) + %tmp3 = call double @llvm.aarch64.neon.frecpx.f64(double %tmp1) ret double %tmp3 } -declare float @llvm.arm64.neon.frecpx.f32(float) nounwind readnone -declare double @llvm.arm64.neon.frecpx.f64(double) nounwind readnone +declare float @llvm.aarch64.neon.frecpx.f32(float) nounwind readnone +declare double @llvm.aarch64.neon.frecpx.f64(double) nounwind readnone define <2 x float> @frsqrte_2s(<2 x float>* %A) nounwind { ;CHECK-LABEL: frsqrte_2s: ;CHECK: frsqrte.2s %tmp1 = load <2 x float>* %A - %tmp3 = call <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float> %tmp1) + %tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %tmp1) ret <2 x float> %tmp3 } @@ -140,7 +140,7 @@ define <4 x float> @frsqrte_4s(<4 x float>* %A) nounwind { ;CHECK-LABEL: frsqrte_4s: ;CHECK: frsqrte.4s %tmp1 = load <4 x float>* %A - %tmp3 = call <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float> %tmp1) + %tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %tmp1) ret <4 x float> %tmp3 } @@ -148,7 +148,7 @@ define <2 x double> @frsqrte_2d(<2 x double>* %A) nounwind { ;CHECK-LABEL: frsqrte_2d: ;CHECK: frsqrte.2d %tmp1 = load <2 x double>* %A - %tmp3 = call <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double> %tmp1) + %tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %tmp1) ret <2 x double> %tmp3 } @@ -156,7 +156,7 @@ define float @frsqrte_s(float* %A) nounwind { ;CHECK-LABEL: frsqrte_s: ;CHECK: frsqrte s0, {{s[0-9]+}} %tmp1 = load float* %A - %tmp3 = call float @llvm.arm64.neon.frsqrte.f32(float %tmp1) + %tmp3 = call float @llvm.aarch64.neon.frsqrte.f32(float %tmp1) ret float %tmp3 } @@ -164,21 +164,21 @@ define double @frsqrte_d(double* %A) nounwind { ;CHECK-LABEL: frsqrte_d: ;CHECK: frsqrte d0, {{d[0-9]+}} %tmp1 = load double* %A - %tmp3 = call double @llvm.arm64.neon.frsqrte.f64(double %tmp1) + %tmp3 = call double @llvm.aarch64.neon.frsqrte.f64(double %tmp1) ret double %tmp3 } -declare <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone -declare float @llvm.arm64.neon.frsqrte.f32(float) nounwind readnone -declare double @llvm.arm64.neon.frsqrte.f64(double) nounwind readnone +declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone +declare <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone +declare float @llvm.aarch64.neon.frsqrte.f32(float) nounwind readnone +declare double @llvm.aarch64.neon.frsqrte.f64(double) nounwind readnone define <2 x i32> @urecpe_2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: urecpe_2s: ;CHECK: urecpe.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> 
@llvm.arm64.neon.urecpe.v2i32(<2 x i32> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp3 } @@ -186,18 +186,18 @@ define <4 x i32> @urecpe_4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: urecpe_4s: ;CHECK: urecpe.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %tmp1) ret <4 x i32> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone define <2 x i32> @ursqrte_2s(<2 x i32>* %A) nounwind { ;CHECK-LABEL: ursqrte_2s: ;CHECK: ursqrte.2s %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32> %tmp1) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp3 } @@ -205,18 +205,18 @@ define <4 x i32> @ursqrte_4s(<4 x i32>* %A) nounwind { ;CHECK-LABEL: ursqrte_4s: ;CHECK: ursqrte.4s %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32> %tmp1) + %tmp3 = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %tmp1) ret <4 x i32> %tmp3 } -declare <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone define float @f1(float %a, float %b) nounwind readnone optsize ssp { ; CHECK-LABEL: f1: ; CHECK: frsqrts s0, s0, s1 ; CHECK-NEXT: ret - %vrsqrtss.i = tail call float @llvm.arm64.neon.frsqrts.f32(float %a, float %b) nounwind + %vrsqrtss.i = tail call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) nounwind ret float %vrsqrtss.i } @@ -224,9 +224,9 @@ define double @f2(double %a, double %b) nounwind readnone optsize ssp { ; CHECK-LABEL: f2: ; CHECK: frsqrts d0, d0, d1 ; CHECK-NEXT: ret - %vrsqrtsd.i = tail call double @llvm.arm64.neon.frsqrts.f64(double %a, double %b) nounwind + %vrsqrtsd.i = tail call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) nounwind ret double %vrsqrtsd.i } -declare double @llvm.arm64.neon.frsqrts.f64(double, double) nounwind readnone -declare float @llvm.arm64.neon.frsqrts.f32(float, float) nounwind readnone +declare double @llvm.aarch64.neon.frsqrts.f64(double, double) nounwind readnone +declare float @llvm.aarch64.neon.frsqrts.f32(float, float) nounwind readnone diff --git a/test/CodeGen/ARM64/vsra.ll b/test/CodeGen/AArch64/arm64-vsra.ll similarity index 98% rename from test/CodeGen/ARM64/vsra.ll rename to test/CodeGen/AArch64/arm64-vsra.ll index 3611eb3cba6b..5e9cef3e7e28 100644 --- a/test/CodeGen/ARM64/vsra.ll +++ b/test/CodeGen/AArch64/arm64-vsra.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: vsras8: diff --git a/test/CodeGen/ARM64/vsub.ll b/test/CodeGen/AArch64/arm64-vsub.ll similarity index 85% rename from test/CodeGen/ARM64/vsub.ll rename to test/CodeGen/AArch64/arm64-vsub.ll index 5c7e84f46efc..c2c8755c0669 100644 --- a/test/CodeGen/ARM64/vsub.ll +++ 
b/test/CodeGen/AArch64/arm64-vsub.ll @@ -1,11 +1,11 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: subhn8b: ;CHECK: subhn.8b %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: subhn.4h %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: subhn.2s %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i32> %tmp3 } @@ -31,8 +31,8 @@ define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ;CHECK-LABEL: subhn2_16b: ;CHECK: subhn.8b ;CHECK-NEXT: subhn2.16b - %vsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind - %vsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> ret <16 x i8> %res } @@ -41,8 +41,8 @@ define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ;CHECK-LABEL: subhn2_8h: ;CHECK: subhn.4h ;CHECK-NEXT: subhn2.8h - %vsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind - %vsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> ret <8 x i16> %res } @@ -51,22 +51,22 @@ define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ;CHECK-LABEL: subhn2_4s: ;CHECK: subhn.2s ;CHECK-NEXT: subhn2.4s - %vsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind - %vsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> ret <4 x i32> %res } -declare <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> 
@llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: rsubhn8b: ;CHECK: rsubhn.8b %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i8> %tmp3 } @@ -75,7 +75,7 @@ define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: rsubhn.4h %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i16> %tmp3 } @@ -84,7 +84,7 @@ define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: rsubhn.2s %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i32> %tmp3 } @@ -92,8 +92,8 @@ define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { ;CHECK-LABEL: rsubhn2_16b: ;CHECK: rsubhn.8b ;CHECK-NEXT: rsubhn2.16b - %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind - %vrsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind + %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> ret <16 x i8> %res } @@ -102,8 +102,8 @@ define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { ;CHECK-LABEL: rsubhn2_8h: ;CHECK: rsubhn.4h ;CHECK-NEXT: rsubhn2.8h - %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind - %vrsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind + %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> ret <8 x i16> %res } @@ -112,15 +112,15 @@ define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { ;CHECK-LABEL: rsubhn2_4s: ;CHECK: rsubhn.2s ;CHECK-NEXT: rsubhn2.4s - %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind - %vrsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind + %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> ret <4 x i32> %res } -declare <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind 
readnone -declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: ssubl8h: diff --git a/test/CodeGen/ARM64/weak-reference.ll b/test/CodeGen/AArch64/arm64-weak-reference.ll similarity index 100% rename from test/CodeGen/ARM64/weak-reference.ll rename to test/CodeGen/AArch64/arm64-weak-reference.ll diff --git a/test/CodeGen/ARM64/xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll similarity index 99% rename from test/CodeGen/ARM64/xaluo.ll rename to test/CodeGen/AArch64/arm64-xaluo.ll index 6cffbdeef8a1..0c300de802b7 100644 --- a/test/CodeGen/ARM64/xaluo.ll +++ b/test/CodeGen/AArch64/arm64-xaluo.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s ; ; Get the actual value of the overflow bit. diff --git a/test/CodeGen/ARM64/zero-cycle-regmov.ll b/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll similarity index 100% rename from test/CodeGen/ARM64/zero-cycle-regmov.ll rename to test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll diff --git a/test/CodeGen/ARM64/zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll similarity index 100% rename from test/CodeGen/ARM64/zero-cycle-zeroing.ll rename to test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll diff --git a/test/CodeGen/ARM64/zext.ll b/test/CodeGen/AArch64/arm64-zext.ll similarity index 100% rename from test/CodeGen/ARM64/zext.ll rename to test/CodeGen/AArch64/arm64-zext.ll diff --git a/test/CodeGen/ARM64/zextload-unscaled.ll b/test/CodeGen/AArch64/arm64-zextload-unscaled.ll similarity index 100% rename from test/CodeGen/ARM64/zextload-unscaled.ll rename to test/CodeGen/AArch64/arm64-zextload-unscaled.ll diff --git a/test/CodeGen/ARM64/zip.ll b/test/CodeGen/AArch64/arm64-zip.ll similarity index 98% rename from test/CodeGen/ARM64/zip.ll rename to test/CodeGen/AArch64/arm64-zip.ll index d06a9f899ddf..304b28099432 100644 --- a/test/CodeGen/ARM64/zip.ll +++ b/test/CodeGen/AArch64/arm64-zip.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK-LABEL: vzipi8: diff --git a/test/CodeGen/AArch64/asm-large-immediate.ll b/test/CodeGen/AArch64/asm-large-immediate.ll new file mode 100644 index 000000000000..05e4dddc7a7f --- /dev/null +++ b/test/CodeGen/AArch64/asm-large-immediate.ll @@ -0,0 +1,10 @@ +; RUN: llc -march=aarch64 -no-integrated-as < %s | FileCheck %s + +define void @test() { +entry: +; CHECK: /* result: 68719476738 */ + tail call void asm sideeffect "/* result: ${0:c} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 ) +; CHECK: /* result: -68719476738 */ + tail call void asm sideeffect "/* result: ${0:n} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 ) + ret void +} diff --git a/test/CodeGen/AArch64/assertion-rc-mismatch.ll b/test/CodeGen/AArch64/assertion-rc-mismatch.ll index f09203f2211f..bcf206ec9bed 100644 --- a/test/CodeGen/AArch64/assertion-rc-mismatch.ll +++ b/test/CodeGen/AArch64/assertion-rc-mismatch.ll @@ -1,4 +1,3 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; 
RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s ; Test case related to . diff --git a/test/CodeGen/AArch64/atomic-ops-not-barriers.ll b/test/CodeGen/AArch64/atomic-ops-not-barriers.ll index fc4db9097aae..da095a0a42c5 100644 --- a/test/CodeGen/AArch64/atomic-ops-not-barriers.ll +++ b/test/CodeGen/AArch64/atomic-ops-not-barriers.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s define i32 @foo(i32* %var, i1 %cond) { ; CHECK-LABEL: foo: diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll index f8db05fd416e..26301b92f9fe 100644 --- a/test/CodeGen/AArch64/atomic-ops.ll +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created @@ -502,12 +500,10 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind { ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], sxtb -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt -; CHECK-ARM64-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]] -; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxtb -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le +; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]] +; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1 @@ -528,12 +524,10 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind { ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], sxth -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt -; CHECK-ARM64-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]] -; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxth -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le +; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]] +; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le ; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -555,11 +549,9 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind { ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
-; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0 -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -581,11 +573,9 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind { ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]] ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp x0, x[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt -; CHECK-ARM64-NEXT: cmp x[[OLD]], x0 -; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le +; CHECK-NEXT: cmp x[[OLD]], x0 +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -607,12 +597,10 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind { ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], sxtb -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt -; CHECK-ARM64-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]] -; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxtb -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]] +; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -634,12 +622,10 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind { ; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], sxth -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt -; CHECK-ARM64-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]] -; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxth -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]] +; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -661,11 +647,9 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind { ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0 -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -687,11 +671,9 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind { ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
-; CHECK-AARCH64-NEXT: cmp x0, x[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lt -; CHECK-ARM64-NEXT: cmp x[[OLD]], x0 -; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt +; CHECK-NEXT: cmp x[[OLD]], x0 +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -713,11 +695,9 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind { ; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], uxtb -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxtb -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls +; CHECK-NEXT: cmp w[[OLD]], w0, uxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -739,11 +719,9 @@ define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind { ; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], uxth -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxth -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls +; CHECK-NEXT: cmp w[[OLD]], w0, uxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -765,11 +743,9 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind { ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0 -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -791,11 +767,9 @@ define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind { ; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]] ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp x0, x[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi -; CHECK-ARM64-NEXT: cmp x[[OLD]], x0 -; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls +; CHECK-NEXT: cmp x[[OLD]], x0 +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -817,11 +791,9 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind { ; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], uxtb -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxtb -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-NEXT: cmp w[[OLD]], w0, uxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -843,11 +815,9 @@ define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind { ; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. 
-; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]], uxth -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxth -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-NEXT: cmp w[[OLD]], w0, uxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -869,11 +839,9 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind { ; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]] ; w0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp w0, w[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo -; CHECK-ARM64-NEXT: cmp w[[OLD]], w0 -; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -895,11 +863,9 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { ; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] ; x0 below is a reasonable guess but could change: it certainly comes into the ; function there. -; CHECK-AARCH64-NEXT: cmp x0, x[[OLD]] -; CHECK-AARCH64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lo -; CHECK-ARM64-NEXT: cmp x[[OLD]], x0 -; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi +; CHECK-NEXT: cmp x[[OLD]], x0 +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] @@ -912,7 +878,9 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i8: - %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire + %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire + %old = extractvalue { i8, i1 } %pair, 0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 @@ -923,8 +891,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { ; function there. ; CHECK-NEXT: cmp w[[OLD]], w0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] - ; As above, w1 is a reasonable guess. -; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] +; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb @@ -934,7 +901,9 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i16: - %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst + %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst + %old = extractvalue { i16, i1 } %pair, 0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 @@ -945,8 +914,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { ; function there. ; CHECK-NEXT: cmp w[[OLD]], w0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] - ; As above, w1 is a reasonable guess. 
-; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] +; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb @@ -956,7 +924,9 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i32: - %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic + %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic + %old = extractvalue { i32, i1 } %pair, 0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 @@ -967,8 +937,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { ; function there. ; CHECK-NEXT: cmp w[[OLD]], w0 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] - ; As above, w1 is a reasonable guess. -; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] +; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]] ; CHECK-NOT: dmb @@ -978,7 +947,9 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i64: - %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic + %pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic + %old = extractvalue { i64, i1 } %pair, 0 + ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll index c63610bccae5..62d41bcead6b 100644 --- a/test/CodeGen/AArch64/basic-pic.ll +++ b/test/CodeGen/AArch64/basic-pic.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s @var = global i32 0 diff --git a/test/CodeGen/AArch64/bitfield-insert-0.ll b/test/CodeGen/AArch64/bitfield-insert-0.ll index 9272e1edfb94..da0ed8af3126 100644 --- a/test/CodeGen/AArch64/bitfield-insert-0.ll +++ b/test/CodeGen/AArch64/bitfield-insert-0.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -disassemble - | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -disassemble - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -filetype=obj -o - %s | llvm-objdump -disassemble - | FileCheck %s ; The encoding of lsb -> immr in the CGed bitfield instructions was wrong at one ; point, in the edge case where lsb = 0. Just make sure. diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index b67aa0fa23f0..2369a55aa92d 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK ; First, a simple example from Clang. The registers could plausibly be ; different, but probably won't be. 
@@ -64,10 +63,8 @@ define void @test_whole64(i64* %existing, i64* %new) { define void @test_whole32_from64(i64* %existing, i64* %new) { ; CHECK-LABEL: test_whole32_from64: -; CHECK-AARCH64: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16 -; CHECK-AARCH64-NOT: and -; CHECK-ARM64: bfxil {{x[0-9]+}}, {{x[0-9]+}}, #0, #16 +; CHECK: bfxil {{x[0-9]+}}, {{x[0-9]+}}, #0, #16 ; CHECK: ret @@ -86,9 +83,8 @@ define void @test_whole32_from64(i64* %existing, i64* %new) { define void @test_32bit_masked(i32 *%existing, i32 *%new) { ; CHECK-LABEL: test_32bit_masked: -; CHECK-ARM64: and +; CHECK: and ; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4 -; CHECK-AARCH64: and {{w[0-9]+}}, [[INSERT]], #0xff %oldval = load volatile i32* %existing %oldval_keep = and i32 %oldval, 135 ; = 0x87 @@ -105,9 +101,8 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) { define void @test_64bit_masked(i64 *%existing, i64 *%new) { ; CHECK-LABEL: test_64bit_masked: -; CHECK-ARM64: and +; CHECK: and ; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8 -; CHECK-AARCH64: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000 %oldval = load volatile i64* %existing %oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000 @@ -126,9 +121,8 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) { define void @test_32bit_complexmask(i32 *%existing, i32 *%new) { ; CHECK-LABEL: test_32bit_complexmask: -; CHECK-ARM64: and +; CHECK: and ; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4 -; CHECK-AARCH64: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} %oldval = load volatile i32* %existing %oldval_keep = and i32 %oldval, 647 ; = 0x287 diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll index 92f6d74908bb..0e1265372bd8 100644 --- a/test/CodeGen/AArch64/bitfield.ll +++ b/test/CodeGen/AArch64/bitfield.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK @var32 = global i32 0 @var64 = global i64 0 @@ -24,8 +23,7 @@ define void @test_extendb(i8 %var) { %uxt64 = zext i8 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK-AARCH64: uxtb {{x[0-9]+}}, {{w[0-9]+}} -; CHECK-ARM64: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff ret void } @@ -49,8 +47,7 @@ define void @test_extendh(i16 %var) { %uxt64 = zext i16 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK-AARCH64: uxth {{x[0-9]+}}, {{w[0-9]+}} -; CHECK-ARM64: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff ret void } @@ -63,8 +60,7 @@ define void @test_extendw(i32 %var) { %uxt64 = zext i32 %var to i64 store volatile i64 %uxt64, i64* @var64 -; CHECK-AARCH64: ubfx {{w[0-9]+}}, {{w[0-9]+}}, #0, #32 -; CHECK-ARM64: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32 +; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32 ret void } diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll index c22ce1caf4d5..3a5dbdc945ca 100644 --- a/test/CodeGen/AArch64/blockaddress.ll +++ b/test/CodeGen/AArch64/blockaddress.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck 
--check-prefix=CHECK-LARGE %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -code-model=large -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s @addr = global i8* null diff --git a/test/CodeGen/AArch64/bool-loads.ll b/test/CodeGen/AArch64/bool-loads.ll index 37cc8e42f175..881aeaa15dd5 100644 --- a/test/CodeGen/AArch64/bool-loads.ll +++ b/test/CodeGen/AArch64/bool-loads.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s @var = global i1 0 diff --git a/test/CodeGen/AArch64/branch-relax-asm.ll b/test/CodeGen/AArch64/branch-relax-asm.ll new file mode 100644 index 000000000000..7409c84e6180 --- /dev/null +++ b/test/CodeGen/AArch64/branch-relax-asm.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=aarch64-apple-ios7.0 -disable-block-placement -aarch64-tbz-offset-bits=4 -o - %s | FileCheck %s +define i32 @test_asm_length(i32 %in) { +; CHECK-LABEL: test_asm_length: + + ; It would be more natural to use just one "tbnz %false" here, but if the + ; number of instructions in the asm is counted reasonably, that block is out + ; of the limited range we gave tbz. So branch relaxation has to invert the + ; condition. +; CHECK: tbz w0, #0, [[TRUE:LBB[0-9]+_[0-9]+]] +; CHECK: b [[FALSE:LBB[0-9]+_[0-9]+]] + +; CHECK: [[TRUE]]: +; CHECK: orr w0, wzr, #0x4 +; CHECK: nop +; CHECK: nop +; CHECK: nop +; CHECK: nop +; CHECK: nop +; CHECK: nop +; CHECK: ret + +; CHECK: [[FALSE]]: +; CHECK: ret + + %val = and i32 %in, 1 + %tst = icmp eq i32 %val, 0 + br i1 %tst, label %true, label %false + +true: + call void asm sideeffect "nop\0A\09nop\0A\09nop\0A\09nop\0A\09nop\0A\09nop", ""() + ret i32 4 + +false: + ret i32 0 +} diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll index 285c19ddee30..952404495ce5 100644 --- a/test/CodeGen/AArch64/breg.ll +++ b/test/CodeGen/AArch64/breg.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s @stored_label = global i8* null diff --git a/test/CodeGen/AArch64/callee-save.ll b/test/CodeGen/AArch64/callee-save.ll index 6a2832ceaadf..046e6ceac077 100644 --- a/test/CodeGen/AArch64/callee-save.ll +++ b/test/CodeGen/AArch64/callee-save.ll @@ -1,20 +1,14 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK-ARM64 +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s @var = global float 0.0 define void @foo() { ; CHECK-LABEL: foo: -; CHECK: stp d14, d15, [sp -; CHECK: stp d12, d13, [sp -; CHECK: stp d10, d11, [sp -; CHECK: stp d8, d9, [sp - -; CHECK-ARM64: stp d15, d14, [sp -; CHECK-ARM64: stp d13, d12, [sp -; CHECK-ARM64: stp d11, d10, [sp -; CHECK-ARM64: stp d9, d8, [sp +; CHECK: stp d15, d14, [sp +; CHECK: stp d13, d12, [sp +; CHECK: stp d11, d10, [sp +; 
CHECK: stp d9, d8, [sp ; Create lots of live variables to exhaust the supply of ; caller-saved registers @@ -84,14 +78,9 @@ define void @foo() { store volatile float %val31, float* @var store volatile float %val32, float* @var -; CHECK: ldp d8, d9, [sp -; CHECK: ldp d10, d11, [sp -; CHECK: ldp d12, d13, [sp -; CHECK: ldp d14, d15, [sp - -; CHECK-ARM64: ldp d9, d8, [sp -; CHECK-ARM64: ldp d11, d10, [sp -; CHECK-ARM64: ldp d13, d12, [sp -; CHECK-ARM64: ldp d15, d14, [sp +; CHECK: ldp d9, d8, [sp +; CHECK: ldp d11, d10, [sp +; CHECK: ldp d13, d12, [sp +; CHECK: ldp d15, d14, [sp ret void } diff --git a/test/CodeGen/AArch64/cmpxchg-idioms.ll b/test/CodeGen/AArch64/cmpxchg-idioms.ll new file mode 100644 index 000000000000..0c008c269794 --- /dev/null +++ b/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -0,0 +1,93 @@ +; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s + +define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) { +; CHECK-LABEL: test_return: + +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0] +; CHECK: cmp [[LOADED]], w1 +; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]] + +; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0] +; CHECK: cbnz [[STATUS]], [[LOOP]] + +; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: orr w0, wzr, #0x1 +; CHECK: ret + +; CHECK: [[FAILED]]: +; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: mov w0, wzr +; CHECK: ret + + %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + %conv = zext i1 %success to i32 + ret i32 %conv +} + +define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) { +; CHECK-LABEL: test_return_bool: + +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxrb [[LOADED:w[0-9]+]], [x0] +; CHECK: cmp [[LOADED]], w1, uxtb +; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]] + +; CHECK: stlxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0] +; CHECK: cbnz [[STATUS]], [[LOOP]] + +; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} + ; FIXME: DAG combine should be able to deal with this. 
+; CHECK: orr [[TMP:w[0-9]+]], wzr, #0x1 +; CHECK: eor w0, [[TMP]], #0x1 +; CHECK: ret + +; CHECK: [[FAILED]]: +; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: mov [[TMP:w[0-9]+]], wzr +; CHECK: eor w0, [[TMP]], #0x1 +; CHECK: ret + + %pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic + %success = extractvalue { i8, i1 } %pair, 1 + %failure = xor i1 %success, 1 + ret i1 %failure +} + +define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) { +; CHECK-LABEL: test_conditional: + +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0] +; CHECK: cmp [[LOADED]], w1 +; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]] + +; CHECK: stlxr [[STATUS:w[0-9]+]], w2, [x0] +; CHECK: cbnz [[STATUS]], [[LOOP]] + +; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: b _bar + +; CHECK: [[FAILED]]: +; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: b _baz + + %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + br i1 %success, label %true, label %false + +true: + tail call void @bar() #2 + br label %end + +false: + tail call void @baz() #2 + br label %end + +end: + ret void +} + +declare void @bar() +declare void @baz() diff --git a/test/CodeGen/AArch64/code-model-large-abs.ll b/test/CodeGen/AArch64/code-model-large-abs.ll index b2b1fa7a5728..ca92500855b4 100644 --- a/test/CodeGen/AArch64/code-model-large-abs.ll +++ b/test/CodeGen/AArch64/code-model-large-abs.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large < %s | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -code-model=large -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -o - %s | FileCheck %s @var8 = global i8 0 @var16 = global i16 0 diff --git a/test/CodeGen/AArch64/compare-branch.ll b/test/CodeGen/AArch64/compare-branch.ll index 31b9829d8a1f..a1a87cf51a1a 100644 --- a/test/CodeGen/AArch64/compare-branch.ll +++ b/test/CodeGen/AArch64/compare-branch.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s @var32 = global i32 0 @var64 = global i64 0 diff --git a/test/CodeGen/AArch64/compiler-ident.ll b/test/CodeGen/AArch64/compiler-ident.ll new file mode 100644 index 000000000000..035057194a06 --- /dev/null +++ b/test/CodeGen/AArch64/compiler-ident.ll @@ -0,0 +1,12 @@ +; RUN: llc -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s + +; ModuleID = 'compiler-ident.c' +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK: .ident "some LLVM version" + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"some LLVM version"} + diff --git a/test/CodeGen/AArch64/complex-copy-noneon.ll b/test/CodeGen/AArch64/complex-copy-noneon.ll index 137ea5f0ff51..4ae547856ecd 100644 --- a/test/CodeGen/AArch64/complex-copy-noneon.ll +++ b/test/CodeGen/AArch64/complex-copy-noneon.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s ; The DAG combiner decided to use a vector load/store for this struct copy ; previously. 
This probably shouldn't happen without NEON, but the most diff --git a/test/CodeGen/AArch64/complex-fp-to-int.ll b/test/CodeGen/AArch64/complex-fp-to-int.ll new file mode 100644 index 000000000000..13cf762c3d2e --- /dev/null +++ b/test/CodeGen/AArch64/complex-fp-to-int.ll @@ -0,0 +1,141 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <2 x i64> @test_v2f32_to_signed_v2i64(<2 x float> %in) { +; CHECK-LABEL: test_v2f32_to_signed_v2i64: +; CHECK: fcvtl [[VAL64:v[0-9]+]].2d, v0.2s +; CHECK: fcvtzs.2d v0, [[VAL64]] + + %val = fptosi <2 x float> %in to <2 x i64> + ret <2 x i64> %val +} + +define <2 x i64> @test_v2f32_to_unsigned_v2i64(<2 x float> %in) { +; CHECK-LABEL: test_v2f32_to_unsigned_v2i64: +; CHECK: fcvtl [[VAL64:v[0-9]+]].2d, v0.2s +; CHECK: fcvtzu.2d v0, [[VAL64]] + + %val = fptoui <2 x float> %in to <2 x i64> + ret <2 x i64> %val +} + +define <2 x i16> @test_v2f32_to_signed_v2i16(<2 x float> %in) { +; CHECK-LABEL: test_v2f32_to_signed_v2i16: +; CHECK: fcvtzs.2s v0, v0 + + %val = fptosi <2 x float> %in to <2 x i16> + ret <2 x i16> %val +} + +define <2 x i16> @test_v2f32_to_unsigned_v2i16(<2 x float> %in) { +; CHECK-LABEL: test_v2f32_to_unsigned_v2i16: +; CHECK: fcvtzs.2s v0, v0 + + %val = fptoui <2 x float> %in to <2 x i16> + ret <2 x i16> %val +} + +define <2 x i8> @test_v2f32_to_signed_v2i8(<2 x float> %in) { +; CHECK-LABEL: test_v2f32_to_signed_v2i8: +; CHECK: fcvtzs.2s v0, v0 + + %val = fptosi <2 x float> %in to <2 x i8> + ret <2 x i8> %val +} + +define <2 x i8> @test_v2f32_to_unsigned_v2i8(<2 x float> %in) { +; CHECK-LABEL: test_v2f32_to_unsigned_v2i8: +; CHECK: fcvtzs.2s v0, v0 + + %val = fptoui <2 x float> %in to <2 x i8> + ret <2 x i8> %val +} + +define <4 x i16> @test_v4f32_to_signed_v4i16(<4 x float> %in) { +; CHECK-LABEL: test_v4f32_to_signed_v4i16: +; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.4h v0, [[VAL64]] + + %val = fptosi <4 x float> %in to <4 x i16> + ret <4 x i16> %val +} + +define <4 x i16> @test_v4f32_to_unsigned_v4i16(<4 x float> %in) { +; CHECK-LABEL: test_v4f32_to_unsigned_v4i16: +; CHECK: fcvtzu.4s [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.4h v0, [[VAL64]] + + %val = fptoui <4 x float> %in to <4 x i16> + ret <4 x i16> %val +} + +define <4 x i8> @test_v4f32_to_signed_v4i8(<4 x float> %in) { +; CHECK-LABEL: test_v4f32_to_signed_v4i8: +; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.4h v0, [[VAL64]] + + %val = fptosi <4 x float> %in to <4 x i8> + ret <4 x i8> %val +} + +define <4 x i8> @test_v4f32_to_unsigned_v4i8(<4 x float> %in) { +; CHECK-LABEL: test_v4f32_to_unsigned_v4i8: +; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.4h v0, [[VAL64]] + + %val = fptoui <4 x float> %in to <4 x i8> + ret <4 x i8> %val +} + +define <2 x i32> @test_v2f64_to_signed_v2i32(<2 x double> %in) { +; CHECK-LABEL: test_v2f64_to_signed_v2i32: +; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.2s v0, [[VAL64]] + + %val = fptosi <2 x double> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x i32> @test_v2f64_to_unsigned_v2i32(<2 x double> %in) { +; CHECK-LABEL: test_v2f64_to_unsigned_v2i32: +; CHECK: fcvtzu.2d [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.2s v0, [[VAL64]] + + %val = fptoui <2 x double> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x i16> @test_v2f64_to_signed_v2i16(<2 x double> %in) { +; CHECK-LABEL: test_v2f64_to_signed_v2i16: +; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.2s v0, [[VAL64]] + + %val = fptosi <2 x double> %in to <2 x i16> + ret <2 x i16> %val +} + +define <2 x i16> 
@test_v2f64_to_unsigned_v2i16(<2 x double> %in) { +; CHECK-LABEL: test_v2f64_to_unsigned_v2i16: +; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.2s v0, [[VAL64]] + + %val = fptoui <2 x double> %in to <2 x i16> + ret <2 x i16> %val +} + +define <2 x i8> @test_v2f64_to_signed_v2i8(<2 x double> %in) { +; CHECK-LABEL: test_v2f64_to_signed_v2i8: +; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.2s v0, [[VAL64]] + + %val = fptosi <2 x double> %in to <2 x i8> + ret <2 x i8> %val +} + +define <2 x i8> @test_v2f64_to_unsigned_v2i8(<2 x double> %in) { +; CHECK-LABEL: test_v2f64_to_unsigned_v2i8: +; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0 +; CHECK: xtn.2s v0, [[VAL64]] + + %val = fptoui <2 x double> %in to <2 x i8> + ret <2 x i8> %val +} diff --git a/test/CodeGen/AArch64/complex-int-to-fp.ll b/test/CodeGen/AArch64/complex-int-to-fp.ll new file mode 100644 index 000000000000..5c943f95c355 --- /dev/null +++ b/test/CodeGen/AArch64/complex-int-to-fp.ll @@ -0,0 +1,164 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; CHECK: autogen_SD19655 +; CHECK: scvtf +; CHECK: ret +define void @autogen_SD19655(<2 x i64>* %addr, <2 x float>* %addrfloat) { + %T = load <2 x i64>* %addr + %F = sitofp <2 x i64> %T to <2 x float> + store <2 x float> %F, <2 x float>* %addrfloat + ret void +} + +define <2 x double> @test_signed_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v2i32_to_v2f64: +; CHECK: sshll.2d [[VAL64:v[0-9]+]], v0, #0 +; CHECK-NEXT: scvtf.2d v0, [[VAL64]] +; CHECK-NEXT: ret + %conv = sitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} + +define <2 x double> @test_unsigned_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v2i32_to_v2f64 +; CHECK: ushll.2d [[VAL64:v[0-9]+]], v0, #0 +; CHECK-NEXT: ucvtf.2d v0, [[VAL64]] +; CHECK-NEXT: ret + %conv = uitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} + +define <2 x double> @test_signed_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v2i16_to_v2f64: +; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16 +; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16 +; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0 +; CHECK: scvtf.2d v0, [[VAL64]] + + %conv = sitofp <2 x i16> %v to <2 x double> + ret <2 x double> %conv +} +define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v2i16_to_v2f64 +; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff +; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]] +; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0 +; CHECK: ucvtf.2d v0, [[VAL64]] + + %conv = uitofp <2 x i16> %v to <2 x double> + ret <2 x double> %conv +} + +define <2 x double> @test_signed_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v2i8_to_v2f64: +; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24 +; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24 +; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0 +; CHECK: scvtf.2d v0, [[VAL64]] + + %conv = sitofp <2 x i8> %v to <2 x double> + ret <2 x double> %conv +} +define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v2i8_to_v2f64 +; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff +; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]] +; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0 +; CHECK: ucvtf.2d v0, [[VAL64]] + + %conv = uitofp <2 x i8> %v to <2 x double> + ret <2 x double> %conv +} + +define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone 
{ +; CHECK-LABEL: test_signed_v2i64_to_v2f32: +; CHECK: scvtf.2d [[VAL64:v[0-9]+]], v0 +; CHECK: fcvtn v0.2s, [[VAL64]].2d + + %conv = sitofp <2 x i64> %v to <2 x float> + ret <2 x float> %conv +} +define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v2i64_to_v2f32 +; CHECK: ucvtf.2d [[VAL64:v[0-9]+]], v0 +; CHECK: fcvtn v0.2s, [[VAL64]].2d + + %conv = uitofp <2 x i64> %v to <2 x float> + ret <2 x float> %conv +} + +define <2 x float> @test_signed_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v2i16_to_v2f32: +; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16 +; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16 +; CHECK: scvtf.2s v0, [[VAL32]] + + %conv = sitofp <2 x i16> %v to <2 x float> + ret <2 x float> %conv +} +define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v2i16_to_v2f32 +; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff +; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]] +; CHECK: ucvtf.2s v0, [[VAL32]] + + %conv = uitofp <2 x i16> %v to <2 x float> + ret <2 x float> %conv +} + +define <2 x float> @test_signed_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v2i8_to_v2f32: +; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24 +; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24 +; CHECK: scvtf.2s v0, [[VAL32]] + + %conv = sitofp <2 x i8> %v to <2 x float> + ret <2 x float> %conv +} +define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v2i8_to_v2f32 +; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff +; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]] +; CHECK: ucvtf.2s v0, [[VAL32]] + + %conv = uitofp <2 x i8> %v to <2 x float> + ret <2 x float> %conv +} + +define <4 x float> @test_signed_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v4i16_to_v4f32: +; CHECK: sshll.4s [[VAL32:v[0-9]+]], v0, #0 +; CHECK: scvtf.4s v0, [[VAL32]] + + %conv = sitofp <4 x i16> %v to <4 x float> + ret <4 x float> %conv +} + +define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v4i16_to_v4f32 +; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0 +; CHECK: ucvtf.4s v0, [[VAL32]] + + %conv = uitofp <4 x i16> %v to <4 x float> + ret <4 x float> %conv +} + +define <4 x float> @test_signed_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_signed_v4i8_to_v4f32: +; CHECK: shl.4h [[TMP:v[0-9]+]], v0, #8 +; CHECK: sshr.4h [[VAL16:v[0-9]+]], [[TMP]], #8 +; CHECK: sshll.4s [[VAL32:v[0-9]+]], [[VAL16]], #0 +; CHECK: scvtf.4s v0, [[VAL32]] + + %conv = sitofp <4 x i8> %v to <4 x float> + ret <4 x float> %conv +} +define <4 x float> @test_unsigned_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone { +; CHECK-LABEL: test_unsigned_v4i8_to_v4f32 +; CHECK: bic.4h v0, #0xff, lsl #8 +; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0 +; CHECK: ucvtf.4s v0, [[VAL32]] + + %conv = uitofp <4 x i8> %v to <4 x float> + ret <4 x float> %conv +} diff --git a/test/CodeGen/AArch64/concatvector-bugs.ll b/test/CodeGen/AArch64/concatvector-bugs.ll deleted file mode 100644 index 8d167e42c72b..000000000000 --- a/test/CodeGen/AArch64/concatvector-bugs.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -; Bug: i8 type in FRP8 register but not registering with register class causes segmentation fault. -; Fix: Removed i8 type from FPR8 register class. - -; Not relevant to arm64. 
- -define void @test_concatvector_v8i8() { -entry.split: - br i1 undef, label %if.then, label %if.end - -if.then: ; preds = %entry.split - unreachable - -if.end: ; preds = %entry.split - br i1 undef, label %if.then9, label %if.end18 - -if.then9: ; preds = %if.end - unreachable - -if.end18: ; preds = %if.end - br label %for.body - -for.body: ; preds = %for.inc, %if.end18 - br i1 false, label %if.then30, label %for.inc - -if.then30: ; preds = %for.body - unreachable - -for.inc: ; preds = %for.body - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.inc - br label %for.body77 - -for.body77: ; preds = %for.body77, %for.end - br i1 undef, label %for.end106, label %for.body77 - -for.end106: ; preds = %for.body77 - br i1 undef, label %for.body130.us.us, label %stmt.for.body130.us.us - -stmt.for.body130.us.us: ; preds = %stmt.for.body130.us.us, %for.end106 - %_p_splat.us = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer - store <8 x i8> %_p_splat.us, <8 x i8>* undef, align 1 - br label %stmt.for.body130.us.us - -for.body130.us.us: ; preds = %for.body130.us.us, %for.end106 - br label %for.body130.us.us -} - -declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32) - -define <8 x i16> @test_splat(i32 %l) nounwind { -; CHECK-LABEL: test_splat: -; CHECK: ret - %lhs = insertelement <1 x i32> undef, i32 %l, i32 0 - %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11) - %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %vec -} - - -define <8 x i16> @test_notsplat(<8 x i16> %a, <8 x i16> %b, i32 %l) nounwind { -; CHECK-LABEL: test_notsplat: -; CHECK: ret -entry: - %lhs = insertelement <1 x i32> undef, i32 %l, i32 0 - %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11) - %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> - ret <8 x i16> %vec -} diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll index 2ee49a2f6edb..5f81cba66cbc 100644 --- a/test/CodeGen/AArch64/cond-sel.ll +++ b/test/CodeGen/AArch64/cond-sel.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var32 = global i32 0 @var64 = global i64 0 @@ -47,8 +45,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r ; CHECK-NOFP-NOT: fcmp %val2 = select i1 %tst2, i64 9, i64 15 store i64 %val2, i64* @var64 -; CHECK-AARCH64: movz x[[CONST15:[0-9]+]], #15 -; CHECK-ARM64: orr w[[CONST15:[0-9]+]], wzr, #0xf +; CHECK: orr w[[CONST15:[0-9]+]], wzr, #0xf ; CHECK: movz {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}} ; CHECK: csel [[MAYBETRUE:x[0-9]+]], x[[CONST9]], x[[CONST15]], eq ; CHECK: csel {{x[0-9]+}}, x[[CONST9]], [[MAYBETRUE]], vs diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll index 23c06be3a1dc..f0f36bd5cea5 100644 --- a/test/CodeGen/AArch64/cpus.ll +++ 
b/test/CodeGen/AArch64/cpus.ll @@ -1,9 +1,5 @@ ; This tests that llc accepts all valid AArch64 CPUs -; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll index cd9a863bd658..fbea4a6e5838 100644 --- a/test/CodeGen/AArch64/directcond.ll +++ b/test/CodeGen/AArch64/directcond.ll @@ -1,14 +1,10 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s --check-prefix=CHECK +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-NOFP %s define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) { ; CHECK-LABEL: test_select_i32: %val = select i1 %bit, i32 %a, i32 %b -; CHECK-AARCH64: movz [[ONE:w[0-9]+]], #1 -; CHECK-AARCH64: tst w0, [[ONE]] -; CHECK-ARM64: tst w0, #0x1 +; CHECK: tst w0, #0x1 ; CHECK-NEXT: csel w0, w1, w2, ne ret i32 %val @@ -17,9 +13,7 @@ define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) { define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) { ; CHECK-LABEL: test_select_i64: %val = select i1 %bit, i64 %a, i64 %b -; CHECK-AARCH64: movz [[ONE:w[0-9]+]], #1 -; CHECK-AARCH64: tst w0, [[ONE]] -; CHECK-ARM64: tst w0, #0x1 +; CHECK: tst w0, #0x1 ; CHECK-NEXT: csel x0, x1, x2, ne ret i64 %val @@ -28,9 +22,7 @@ define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) { define float @test_select_float(i1 %bit, float %a, float %b) { ; CHECK-LABEL: test_select_float: %val = select i1 %bit, float %a, float %b -; CHECK-AARCH64: movz [[ONE:w[0-9]+]], #1 -; CHECK-AARCH64: tst w0, [[ONE]] -; CHECK-ARM64: tst w0, #0x1 +; CHECK: tst w0, #0x1 ; CHECK-NEXT: fcsel s0, s0, s1, ne ; CHECK-NOFP-NOT: fcsel ret float %val @@ -39,9 +31,7 @@ define float @test_select_float(i1 %bit, float %a, float %b) { define double @test_select_double(i1 %bit, double %a, double %b) { ; CHECK-LABEL: test_select_double: %val = select i1 %bit, double %a, double %b -; CHECK-AARCH64: movz [[ONE:w[0-9]+]], #1 -; CHECK-AARCH64: tst w0, [[ONE]] -; CHECK-ARM64: tst w0, #0x1 +; CHECK: tst w0, #0x1 ; CHECK-NEXT: fcsel d0, d0, d1, ne ; CHECK-NOFP-NOT: fcsel diff --git a/test/CodeGen/AArch64/dp-3source.ll b/test/CodeGen/AArch64/dp-3source.ll index 433ce209a72d..22bd4a844e1a 100644 --- a/test/CodeGen/AArch64/dp-3source.ll +++ b/test/CodeGen/AArch64/dp-3source.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 
| FileCheck %s define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) { diff --git a/test/CodeGen/AArch64/dp1.ll b/test/CodeGen/AArch64/dp1.ll index 41ef1951997f..662b41588541 100644 --- a/test/CodeGen/AArch64/dp1.ll +++ b/test/CodeGen/AArch64/dp1.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s @var32 = global i32 0 @var64 = global i64 0 diff --git a/test/CodeGen/AArch64/dp2.ll b/test/CodeGen/AArch64/dp2.ll index 391418d75508..71b31696372a 100644 --- a/test/CodeGen/AArch64/dp2.ll +++ b/test/CodeGen/AArch64/dp2.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64 | FileCheck %s @var32_0 = global i32 0 diff --git a/test/CodeGen/AArch64/eliminate-trunc.ll b/test/CodeGen/AArch64/eliminate-trunc.ll index 314a94dda14e..ea86a084cb42 100644 --- a/test/CodeGen/AArch64/eliminate-trunc.ll +++ b/test/CodeGen/AArch64/eliminate-trunc.ll @@ -1,15 +1,11 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-apple-ios7.0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-ARM64 +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-apple-ios7.0 -mcpu=cyclone | FileCheck %s ; Check trunc i64 operation is translated as a subregister access ; eliminating an i32 induction varible. -; CHECK-AARCH64: add {{x[0-9]+}}, {{x[0-9]+}}, #1 -; CHECK-AARCH64-NOT: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK-AARCH64-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}}, uxtw -; CHECK-ARM64-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1 -; CHECK-ARM64: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK-ARM64-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 +; CHECK-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}} define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit) minsize { entry: %conv = zext i8 %limit to i32 diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll index 7c78f9a815e1..ce5c0f686615 100644 --- a/test/CodeGen/AArch64/extern-weak.ll +++ b/test/CodeGen/AArch64/extern-weak.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -o - < %s | FileCheck %s --check-prefix=CHECK-AARCH64 -; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s -; RUN: llc -mtriple=arm64-none-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK-ARM64 -; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s declare extern_weak i32 @var() @@ -9,13 +7,10 @@ define i32()* @foo() { ; The usual ADRP/ADD pair can't be used for a weak reference because it must ; evaluate to 0 if the symbol is undefined. We use a litpool entry. 
ret i32()* @var -; CHECK-AARCH64: .LCPI0_0: -; CHECK-AARCH64-NEXT: .xword var -; CHECK-AARCH64: ldr x0, [{{x[0-9]+}}, #:lo12:.LCPI0_0] -; CHECK-ARM64: adrp x[[ADDRHI:[0-9]+]], :got:var -; CHECK-ARM64: ldr x0, [x[[ADDRHI]], :got_lo12:var] +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var +; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var] ; In the large model, the usual relocations are absolute and can ; materialise 0. @@ -30,15 +25,11 @@ define i32()* @foo() { define i32* @bar() { %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5 -; CHECK-AARCH64: .LCPI1_0: -; CHECK-AARCH64-NEXT: .xword arr_var -; CHECK-AARCH64: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI1_0] -; CHECK-AARCH64: add x0, [[BASE]], #20 -; CHECK-ARM64: adrp x[[ADDRHI:[0-9]+]], :got:arr_var -; CHECK-ARM64: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var] -; CHECK-ARM64: add x0, [[BASE]], #20 +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:arr_var +; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var] +; CHECK: add x0, [[BASE]], #20 ret i32* %addr @@ -54,11 +45,9 @@ define i32* @bar() { define i32* @wibble() { ret i32* @defined_weak_var -; CHECK-AARCH64: adrp [[BASE:x[0-9]+]], defined_weak_var -; CHECK-AARCH64: add x0, [[BASE]], #:lo12:defined_weak_var -; CHECK-ARM64: adrp [[BASE:x[0-9]+]], defined_weak_var -; CHECK-ARM64: add x0, [[BASE]], :lo12:defined_weak_var +; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var +; CHECK: add x0, [[BASE]], :lo12:defined_weak_var ; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var diff --git a/test/CodeGen/AArch64/extract.ll b/test/CodeGen/AArch64/extract.ll index f066b59af2fd..1fc9387fecc0 100644 --- a/test/CodeGen/AArch64/extract.ll +++ b/test/CodeGen/AArch64/extract.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i64 @ror_i64(i64 %in) { diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll index 97410aa50265..a392619a768d 100644 --- a/test/CodeGen/AArch64/fastcc-reserved.ll +++ b/test/CodeGen/AArch64/fastcc-reserved.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64 ; This test is designed to be run in the situation where the ; call-frame is not reserved (hence disable-fp-elim), but where @@ -13,30 +12,22 @@ define fastcc void @foo(i32 %in) { %addr = alloca i8, i32 %in ; Normal frame setup stuff: -; CHECK: sub sp, sp, -; CHECK: stp x29, x30 -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64: mov x29, sp +; CHECK: stp x29, x30, [sp, #-16]! 
+; CHECK: mov x29, sp ; Reserve space for call-frame: ; CHECK: sub sp, sp, #16 -; CHECK-ARM64: sub sp, sp, #16 call fastcc void @will_pop([8 x i32] undef, i32 42) ; CHECK: bl will_pop -; CHECK-ARM64: bl will_pop ; Since @will_pop is fastcc with tailcallopt, it will put the stack ; back where it needs to be, we shouldn't duplicate that ; CHECK-NOT: sub sp, sp, #16 ; CHECK-NOT: add sp, sp, -; CHECK-ARM64-NOT: sub sp, sp, #16 -; CHECK-ARM64-NOT: add sp, sp, -; CHECK: ldp x29, x30 -; CHECK: add sp, sp, -; CHECK-ARM64: mov sp, x29 -; CHECK-ARM64: ldp x29, x30, [sp], #16 +; CHECK: mov sp, x29 +; CHECK: ldp x29, x30, [sp], #16 ret void } @@ -47,28 +38,21 @@ define void @foo1(i32 %in) { %addr = alloca i8, i32 %in ; Normal frame setup again -; CHECK: sub sp, sp, -; CHECK: stp x29, x30 -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64: mov x29, sp +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: mov x29, sp ; Reserve space for call-frame ; CHECK: sub sp, sp, #16 -; CHECK-ARM64: sub sp, sp, #16 call void @wont_pop([8 x i32] undef, i32 42) ; CHECK: bl wont_pop -; CHECK-ARM64: bl wont_pop ; This time we *do* need to unreserve the call-frame ; CHECK: add sp, sp, #16 -; CHECK-ARM64: add sp, sp, #16 ; Check for epilogue (primarily to make sure sp spotted above wasn't ; part of it). -; CHECK: ldp x29, x30 -; CHECK: add sp, sp, -; CHECK-ARM64: mov sp, x29 -; CHECK-ARM64: ldp x29, x30, [sp], #16 +; CHECK: mov sp, x29 +; CHECK: ldp x29, x30, [sp], #16 ret void } diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll index fb9b4ac57edd..9917fcd044fd 100644 --- a/test/CodeGen/AArch64/fastcc.ll +++ b/test/CodeGen/AArch64/fastcc.ll @@ -1,228 +1,144 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s ; Without tailcallopt fastcc still means the caller cleans up the ; stack, so try to make sure this is respected. define fastcc void @func_stack0() { ; CHECK-LABEL: func_stack0: -; CHECK: sub sp, sp, #48 - -; CHECK-ARM64-LABEL: func_stack0: -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64-NEXT: mov x29, sp -; CHECK-ARM64-NEXT: sub sp, sp, #32 +; CHECK: mov x29, sp +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-TAIL-LABEL: func_stack0: -; CHECK-TAIL: sub sp, sp, #48 - -; CHECK-ARM64-TAIL-LABEL: func_stack0: -; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]! -; CHECK-ARM64-TAIL-NEXT: mov x29, sp -; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32 +; CHECK-TAIL: stp x29, x30, [sp, #-16]! 
+; CHECK-TAIL-NEXT: mov x29, sp +; CHECK-TAIL-NEXT: sub sp, sp, #32 call fastcc void @func_stack8([8 x i32] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, -; CHECK-ARM64: bl func_stack8 -; CHECK-ARM64-NOT: sub sp, sp, - ; CHECK-TAIL: bl func_stack8 ; CHECK-TAIL: sub sp, sp, #16 -; CHECK-ARM64-TAIL: bl func_stack8 -; CHECK-ARM64-TAIL: sub sp, sp, #16 - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, -; CHECK-ARM64: bl func_stack32 -; CHECK-ARM64-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack32 ; CHECK-TAIL: sub sp, sp, #32 -; CHECK-ARM64-TAIL: bl func_stack32 -; CHECK-ARM64-TAIL: sub sp, sp, #32 - call fastcc void @func_stack0() ; CHECK: bl func_stack0 ; CHECK-NOT: sub sp, sp -; CHECK-ARM64: bl func_stack0 -; CHECK-ARM64-NOT: sub sp, sp ; CHECK-TAIL: bl func_stack0 ; CHECK-TAIL-NOT: sub sp, sp -; CHECK-ARM64-TAIL: bl func_stack0 -; CHECK-ARM64-TAIL-NOT: sub sp, sp - ret void -; CHECK: add sp, sp, #48 +; CHECK: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret -; CHECK-ARM64: mov sp, x29 -; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16 -; CHECK-ARM64-NEXT: ret -; CHECK-TAIL: add sp, sp, #48 +; CHECK-TAIL: mov sp, x29 +; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16 ; CHECK-TAIL-NEXT: ret - -; CHECK-ARM64-TAIL: mov sp, x29 -; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16 -; CHECK-ARM64-TAIL-NEXT: ret } define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-LABEL: func_stack8: -; CHECK: sub sp, sp, #48 +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #32 -; CHECK-ARM64-LABEL: func_stack8: -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64: mov x29, sp -; CHECK-ARM64: sub sp, sp, #32 ; CHECK-TAIL-LABEL: func_stack8: -; CHECK-TAIL: sub sp, sp, #48 - -; CHECK-ARM64-TAIL-LABEL: func_stack8: -; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]! -; CHECK-ARM64-TAIL: mov x29, sp -; CHECK-ARM64-TAIL: sub sp, sp, #32 +; CHECK-TAIL: stp x29, x30, [sp, #-16]! 
+; CHECK-TAIL: mov x29, sp +; CHECK-TAIL: sub sp, sp, #32 call fastcc void @func_stack8([8 x i32] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, -; CHECK-ARM64: bl func_stack8 -; CHECK-ARM64-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack8 ; CHECK-TAIL: sub sp, sp, #16 -; CHECK-ARM64-TAIL: bl func_stack8 -; CHECK-ARM64-TAIL: sub sp, sp, #16 - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, -; CHECK-ARM64: bl func_stack32 -; CHECK-ARM64-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack32 ; CHECK-TAIL: sub sp, sp, #32 -; CHECK-ARM64-TAIL: bl func_stack32 -; CHECK-ARM64-TAIL: sub sp, sp, #32 - call fastcc void @func_stack0() ; CHECK: bl func_stack0 ; CHECK-NOT: sub sp, sp -; CHECK-ARM64: bl func_stack0 -; CHECK-ARM64-NOT: sub sp, sp - ; CHECK-TAIL: bl func_stack0 ; CHECK-TAIL-NOT: sub sp, sp -; CHECK-ARM64-TAIL: bl func_stack0 -; CHECK-ARM64-TAIL-NOT: sub sp, sp - ret void -; CHECK: add sp, sp, #48 +; CHECK: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret -; CHECK-ARM64: mov sp, x29 -; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16 -; CHECK-ARM64-NEXT: ret -; CHECK-TAIL: add sp, sp, #64 +; CHECK-TAIL: mov sp, x29 +; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16 ; CHECK-TAIL-NEXT: ret - -; CHECK-ARM64-TAIL: mov sp, x29 -; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16 -; CHECK-ARM64-TAIL-NEXT: ret } define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32: -; CHECK: sub sp, sp, #48 - -; CHECK-ARM64-LABEL: func_stack32: -; CHECK-ARM64: mov x29, sp +; CHECK: mov x29, sp ; CHECK-TAIL-LABEL: func_stack32: -; CHECK-TAIL: sub sp, sp, #48 - -; CHECK-ARM64-TAIL-LABEL: func_stack32: -; CHECK-ARM64-TAIL: mov x29, sp +; CHECK-TAIL: mov x29, sp call fastcc void @func_stack8([8 x i32] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, -; CHECK-ARM64: bl func_stack8 -; CHECK-ARM64-NOT: sub sp, sp, - ; CHECK-TAIL: bl func_stack8 ; CHECK-TAIL: sub sp, sp, #16 -; CHECK-ARM64-TAIL: bl func_stack8 -; CHECK-ARM64-TAIL: sub sp, sp, #16 - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, -; CHECK-ARM64: bl func_stack32 -; CHECK-ARM64-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack32 ; CHECK-TAIL: sub sp, sp, #32 -; CHECK-ARM64-TAIL: bl func_stack32 -; CHECK-ARM64-TAIL: sub sp, sp, #32 - call fastcc void @func_stack0() ; CHECK: bl func_stack0 ; CHECK-NOT: sub sp, sp -; CHECK-ARM64: bl func_stack0 -; CHECK-ARM64-NOT: sub sp, sp ; CHECK-TAIL: bl func_stack0 ; CHECK-TAIL-NOT: sub sp, sp -; CHECK-ARM64-TAIL: bl func_stack0 -; CHECK-ARM64-TAIL-NOT: sub sp, sp - ret void -; CHECK: add sp, sp, #48 +; CHECK: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; CHECK-NEXT: ret -; CHECK-ARM64: mov sp, x29 -; CHECK-ARM64-NEXT: ldp x29, x30, [sp], #16 -; CHECK-ARM64-NEXT: ret - -; CHECK-TAIL: add sp, sp, #80 +; CHECK-TAIL: mov sp, x29 +; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16 ; CHECK-TAIL-NEXT: ret - -; CHECK-ARM64-TAIL: mov sp, x29 -; CHECK-ARM64-TAIL-NEXT: ldp x29, x30, [sp], #16 -; CHECK-ARM64-TAIL-NEXT: ret } diff --git a/test/CodeGen/AArch64/fcmp.ll b/test/CodeGen/AArch64/fcmp.ll index fe2c3260a8b0..3c74508bb12b 100644 --- a/test/CodeGen/AArch64/fcmp.ll +++ b/test/CodeGen/AArch64/fcmp.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s 
-mtriple=aarch64-none-linux-gnu | FileCheck %s declare void @bar(i32) diff --git a/test/CodeGen/AArch64/fcvt-fixed.ll b/test/CodeGen/AArch64/fcvt-fixed.ll index 5d7c83ebfb36..ccb3616b70bf 100644 --- a/test/CodeGen/AArch64/fcvt-fixed.ll +++ b/test/CodeGen/AArch64/fcvt-fixed.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 -O0 ; (The O0 test is to make sure FastISel still constrains its operands properly diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll index a85b02538ef5..d549c7e78421 100644 --- a/test/CodeGen/AArch64/fcvt-int.ll +++ b/test/CodeGen/AArch64/fcvt-int.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i32 @test_floattoi32(float %in) { diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll index cae6856d799e..77bbcddc4926 100644 --- a/test/CodeGen/AArch64/flags-multiuse.ll +++ b/test/CodeGen/AArch64/flags-multiuse.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s ; LLVM should be able to cope with multiple uses of the same flag-setting ; instruction at different points of a routine. Either by rematerializing the diff --git a/test/CodeGen/AArch64/floatdp_1source.ll b/test/CodeGen/AArch64/floatdp_1source.ll index 5d11d3f0e211..8c02787a2340 100644 --- a/test/CodeGen/AArch64/floatdp_1source.ll +++ b/test/CodeGen/AArch64/floatdp_1source.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s @varhalf = global half 0.0 diff --git a/test/CodeGen/AArch64/floatdp_2source.ll b/test/CodeGen/AArch64/floatdp_2source.ll index 0a0933e0e95e..262271784ec6 100644 --- a/test/CodeGen/AArch64/floatdp_2source.ll +++ b/test/CodeGen/AArch64/floatdp_2source.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu -mcpu=cyclone | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s @varfloat = global float 0.0 @vardouble = global double 0.0 diff --git a/test/CodeGen/AArch64/fp-cond-sel.ll b/test/CodeGen/AArch64/fp-cond-sel.ll index ed9f36d948ef..b4f4d77cd0bc 100644 --- a/test/CodeGen/AArch64/fp-cond-sel.ll +++ b/test/CodeGen/AArch64/fp-cond-sel.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK @varfloat = global float 0.0 @vardouble = global double 0.0 @@ -13,8 +12,7 @@ define 
void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { %tst1 = icmp ugt i32 %lhs32, %rhs32 %val1 = select i1 %tst1, float 0.0, float 1.0 store float %val1, float* @varfloat -; CHECK-AARCH64: ldr s[[FLT0:[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:.LCPI -; CHECK-ARM64: movi v[[FLT0:[0-9]+]].2d, #0 +; CHECK: movi v[[FLT0:[0-9]+]].2d, #0 ; CHECK: fmov s[[FLT1:[0-9]+]], #1.0 ; CHECK: fcsel {{s[0-9]+}}, s[[FLT0]], s[[FLT1]], hi @@ -22,7 +20,6 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { %tst2 = icmp sle i64 %lhs64, %rhs64 %val2 = select i1 %tst2, double 1.0, double 0.0 store double %val2, double* @vardouble -; CHECK-AARCH64: ldr d[[FLT0:[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:.LCPI ; FLT0 is reused from above on ARM64. ; CHECK: fmov d[[FLT1:[0-9]+]], #1.0 ; CHECK: fcsel {{d[0-9]+}}, d[[FLT1]], d[[FLT0]], le diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll index e6da7c8762b3..10f88fdbbe96 100644 --- a/test/CodeGen/AArch64/fp-dp3.ll +++ b/test/CodeGen/AArch64/fp-dp3.ll @@ -1,6 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -fp-contract=fast | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -check-prefix=CHECK-NOFAST declare float @llvm.fma.f32(float, float, float) diff --git a/test/CodeGen/AArch64/fp128-folding.ll b/test/CodeGen/AArch64/fp128-folding.ll index 91445e2c849f..892b19c5cf33 100644 --- a/test/CodeGen/AArch64/fp128-folding.ll +++ b/test/CodeGen/AArch64/fp128-folding.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s declare void @bar(i8*, i8*, i32*) ; SelectionDAG used to try to fold some fp128 operations using the ppc128 type, diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/fp128.ll deleted file mode 100644 index 56089e33e65d..000000000000 --- a/test/CodeGen/AArch64/fp128.ll +++ /dev/null @@ -1,282 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; arm64 has a separate copy of this test. 
-@lhs = global fp128 zeroinitializer -@rhs = global fp128 zeroinitializer - -define fp128 @test_add() { -; CHECK-LABEL: test_add: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - - %val = fadd fp128 %lhs, %rhs -; CHECK: bl __addtf3 - ret fp128 %val -} - -define fp128 @test_sub() { -; CHECK-LABEL: test_sub: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - - %val = fsub fp128 %lhs, %rhs -; CHECK: bl __subtf3 - ret fp128 %val -} - -define fp128 @test_mul() { -; CHECK-LABEL: test_mul: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - - %val = fmul fp128 %lhs, %rhs -; CHECK: bl __multf3 - ret fp128 %val -} - -define fp128 @test_div() { -; CHECK-LABEL: test_div: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - - %val = fdiv fp128 %lhs, %rhs -; CHECK: bl __divtf3 - ret fp128 %val -} - -@var32 = global i32 0 -@var64 = global i64 0 - -define void @test_fptosi() { -; CHECK-LABEL: test_fptosi: - %val = load fp128* @lhs - - %val32 = fptosi fp128 %val to i32 - store i32 %val32, i32* @var32 -; CHECK: bl __fixtfsi - - %val64 = fptosi fp128 %val to i64 - store i64 %val64, i64* @var64 -; CHECK: bl __fixtfdi - - ret void -} - -define void @test_fptoui() { -; CHECK-LABEL: test_fptoui: - %val = load fp128* @lhs - - %val32 = fptoui fp128 %val to i32 - store i32 %val32, i32* @var32 -; CHECK: bl __fixunstfsi - - %val64 = fptoui fp128 %val to i64 - store i64 %val64, i64* @var64 -; CHECK: bl __fixunstfdi - - ret void -} - -define void @test_sitofp() { -; CHECK-LABEL: test_sitofp: - - %src32 = load i32* @var32 - %val32 = sitofp i32 %src32 to fp128 - store volatile fp128 %val32, fp128* @lhs -; CHECK: bl __floatsitf - - %src64 = load i64* @var64 - %val64 = sitofp i64 %src64 to fp128 - store volatile fp128 %val64, fp128* @lhs -; CHECK: bl __floatditf - - ret void -} - -define void @test_uitofp() { -; CHECK-LABEL: test_uitofp: - - %src32 = load i32* @var32 - %val32 = uitofp i32 %src32 to fp128 - store volatile fp128 %val32, fp128* @lhs -; CHECK: bl __floatunsitf - - %src64 = load i64* @var64 - %val64 = uitofp i64 %src64 to fp128 - store volatile fp128 %val64, fp128* @lhs -; CHECK: bl __floatunditf - - ret void -} - -define i1 @test_setcc1() { -; CHECK-LABEL: test_setcc1: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - -; Technically, everything after the call to __letf2 is redundant, but we'll let -; LLVM have its fun for now. - %val = fcmp ole fp128 %lhs, %rhs -; CHECK: bl __letf2 -; CHECK: cmp w0, #0 -; CHECK: cset w0, le - - ret i1 %val -; CHECK: ret -} - -define i1 @test_setcc2() { -; CHECK-LABEL: test_setcc2: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - -; Technically, everything after the call to __letf2 is redundant, but we'll let -; LLVM have its fun for now. 
- %val = fcmp ugt fp128 %lhs, %rhs -; CHECK: bl __gttf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[GT:w[0-9]+]], gt - -; CHECK: bl __unordtf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[UNORDERED:w[0-9]+]], ne - -; CHECK: orr w0, [[UNORDERED]], [[GT]] - - ret i1 %val -; CHECK: ret -} - -define i32 @test_br_cc() { -; CHECK-LABEL: test_br_cc: - - %lhs = load fp128* @lhs - %rhs = load fp128* @rhs -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] -; CHECK: ldr q1, [{{x[0-9]+}}, {{#?}}:lo12:rhs] - - ; olt == !uge, which LLVM unfortunately "optimizes" this to. - %cond = fcmp olt fp128 %lhs, %rhs -; CHECK: bl __getf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[OGE:w[0-9]+]], ge - -; CHECK: bl __unordtf2 -; CHECK: cmp w0, #0 -; CHECK: cset [[UNORDERED:w[0-9]+]], ne - -; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]] -; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]] - br i1 %cond, label %iftrue, label %iffalse - -iftrue: - ret i32 42 -; CHECK-NEXT: BB# -; CHECK-NEXT: movz {{x0|w0}}, #42 -; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]] - -iffalse: - ret i32 29 -; CHECK: [[RET29]]: -; CHECK-NEXT: movz {{x0|w0}}, #29 -; CHECK-NEXT: [[REALRET]]: -; CHECK: ret -} - -define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) { -; CHECK-LABEL: test_select: - - %val = select i1 %cond, fp128 %lhs, fp128 %rhs - store fp128 %val, fp128* @lhs -; CHECK-AARCH64: cmp {{w[0-9]+}}, #0 -; CHECK-AARCH64: str q1, [sp] -; CHECK-ARM64: tst {{w[0-9]+}}, #0x1 -; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: BB# -; CHECK-AARCH64-NEXT: str q0, [sp] -; CHECK-ARM64-NEXT: orr v[[DEST:[0-9]+]].16b, v0.16b, v0.16b -; CHECK-NEXT: [[IFFALSE]]: -; CHECK-AARCH64-NEXT: ldr q[[DEST:[0-9]+]], [sp] -; CHECK: str q[[DEST]], [{{x[0-9]+}}, {{#?}}:lo12:lhs] - ret void -; CHECK: ret -} - -@varfloat = global float 0.0 -@vardouble = global double 0.0 - -define void @test_round() { -; CHECK-LABEL: test_round: - - %val = load fp128* @lhs - - %float = fptrunc fp128 %val to float - store float %float, float* @varfloat -; CHECK: bl __trunctfsf2 -; CHECK: str s0, [{{x[0-9]+}}, {{#?}}:lo12:varfloat] - - %double = fptrunc fp128 %val to double - store double %double, double* @vardouble -; CHECK: bl __trunctfdf2 -; CHECK: str d0, [{{x[0-9]+}}, {{#?}}:lo12:vardouble] - - ret void -} - -define void @test_extend() { -; CHECK-LABEL: test_extend: - - %val = load fp128* @lhs - - %float = load float* @varfloat - %fromfloat = fpext float %float to fp128 - store volatile fp128 %fromfloat, fp128* @lhs -; CHECK: bl __extendsftf2 -; CHECK: str q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] - - %double = load double* @vardouble - %fromdouble = fpext double %double to fp128 - store volatile fp128 %fromdouble, fp128* @lhs -; CHECK: bl __extenddftf2 -; CHECK: str q0, [{{x[0-9]+}}, {{#?}}:lo12:lhs] - - ret void -; CHECK: ret -} - -define fp128 @test_neg(fp128 %in) { -; CHECK: [[MINUS0:.LCPI[0-9]+_0]]: -; Make sure the weird hex constant below *is* -0.0 -; CHECK-NEXT: fp128 -0 - -; CHECK-LABEL: test_neg: - - ; Could in principle be optimized to fneg which we can't select, this makes - ; sure that doesn't happen. 
- %ret = fsub fp128 0xL00000000000000008000000000000000, %in -; CHECK-AARCH64: str q0, [sp, #-16] -; CHECK-AARCH64-NEXT: ldr q1, [sp], #16 -; CHECK-ARM64: orr v1.16b, v0.16b, v0.16b -; CHECK: ldr q0, [{{x[0-9]+}}, {{#?}}:lo12:[[MINUS0]]] -; CHECK: bl __subtf3 - - ret fp128 %ret -; CHECK: ret -} diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll index e5aafb545611..e59520c4dc95 100644 --- a/test/CodeGen/AArch64/fpimm.ll +++ b/test/CodeGen/AArch64/fpimm.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s @varf32 = global float 0.0 @varf64 = global double 0.0 diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll index 78fc13b37ead..85d95e21c9b7 100644 --- a/test/CodeGen/AArch64/frameaddr.ll +++ b/test/CodeGen/AArch64/frameaddr.ll @@ -1,4 +1,3 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i8* @t() nounwind { diff --git a/test/CodeGen/AArch64/free-zext.ll b/test/CodeGen/AArch64/free-zext.ll index 584ce2844da1..d69105eec381 100644 --- a/test/CodeGen/AArch64/free-zext.ll +++ b/test/CodeGen/AArch64/free-zext.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i64 @test_free_zext(i8* %a, i16* %b) { diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll index 5b3e6c89db6e..abb732ccf43a 100644 --- a/test/CodeGen/AArch64/func-argpassing.ll +++ b/test/CodeGen/AArch64/func-argpassing.ll @@ -1,12 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-AARCH64 --check-prefix=CHECK-LE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE-AARCH64 --check-prefix=CHECK-BE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s - -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM64 %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE --check-prefix=CHECK-ARM64-BE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } @@ -67,9 +60,7 @@ define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %st %val0 = load volatile i32* %addr0 ; Some weird move means x0 is used for one access -; CHECK-AARCH64: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16 -; CHECK-AARCH64: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12] -; CHECK-ARM64: ldr [[REG32:w[0-9]+]], [sp, #28] +; CHECK: ldr [[REG32:w[0-9]+]], [sp, #28] store i32 
%val0, i32* @var32 ; CHECK: str [[REG32]], [{{x[0-9]+}}, {{#?}}:lo12:var32] @@ -155,7 +146,6 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var %retval = load volatile i32* %stacked ret i32 %retval ; CHECK-LE: ldr w0, [sp, #16] -; CHECK-BE-AARCH64: ldr w0, [sp, #20] } define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, @@ -165,10 +155,8 @@ define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, store float %var8, float* @varfloat ; Beware as above: the offset would be different on big-endian ; machines if the first ldr were changed to use s-registers. -; CHECK-ARM64: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp] -; CHECK-AARCH64: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp] -; CHECK-ARM64: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat] -; CHECK-AARCH64: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat] +; CHECK: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp] +; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat] ret void } @@ -193,12 +181,10 @@ define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3, ; Nothing local on stack in current codegen, so first stack is 16 away ; CHECK-LE: add x[[REG:[0-9]+]], sp, #16 ; CHECK-LE: ldr {{x[0-9]+}}, [x[[REG]], #8] -; CHECK-BE-AARCH64: ldr {{x[0-9]+}}, [sp, #24] ; Important point is that we address sp+24 for second dword -; CHECK-AARCH64: ldr {{x[0-9]+}}, [sp, #16] -; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] ret void } @@ -218,6 +204,5 @@ define i16 @stacked_i16(i32 %val0, i32 %val1, i32 %val2, i32 %val3, i32 %val4, i32 %val5, i32 %val6, i32 %val7, i16 %stack1) { ; CHECK-LABEL: stacked_i16 -; CHECK-ARM64-BE: ldrh ret i16 %stack1 } diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll index 807bffe38ad0..422c5765ec48 100644 --- a/test/CodeGen/AArch64/func-calls.ll +++ b/test/CodeGen/AArch64/func-calls.ll @@ -1,12 +1,7 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-BE --check-prefix=CHECK-NOFP %s - -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM64-NONEON %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s %myStruct = type { i64 , i8, i32 } @@ -93,18 +88,14 @@ define void @check_stack_args() { ; Want to check that the final double 
is passed in registers and ; that varstruct is passed on the stack. Rather dependent on how a ; memcpy gets created, but the following works for now. -; CHECK-AARCH64: mov x[[SPREG:[0-9]+]], sp -; CHECK-AARCH64-DAG: str {{w[0-9]+}}, [x[[SPREG]]] -; CHECK-AARCH64-DAG: str {{w[0-9]+}}, [x[[SPREG]], #12] -; CHECK-AARCH64-DAG: fmov d0, -; CHECK-ARM64-DAG: str {{q[0-9]+}}, [sp] -; CHECK-ARM64-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 -; CHECK-ARM64: mov v0.16b, v[[FINAL_DOUBLE]].16b +; CHECK-DAG: str {{q[0-9]+}}, [sp] +; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 +; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b -; CHECK-ARM64-NONEON-DAG: str {{q[0-9]+}}, [sp] -; CHECK-ARM64-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 -; CHECK-ARM64-NONEON: fmov d0, d[[FINAL_DOUBLE]] +; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp] +; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 +; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]] ; CHECK: bl struct_on_stack ; CHECK-NOFP-NOT: fmov @@ -112,15 +103,12 @@ define void @check_stack_args() { call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0, float -2.0, float -8.0, float 16.0, float 1.0, float 64.0) -; CHECK-AARCH64: ldr s[[STACKEDREG:[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:.LCPI -; CHECK-AARCH64: mov x0, sp -; CHECK-AARCH64: str d[[STACKEDREG]], [x0] -; CHECK-ARM64: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16 -; CHECK-ARM64: str [[SIXTY_FOUR]], [sp] +; CHECK: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16 +; CHECK: str [[SIXTY_FOUR]], [sp] -; CHECK-ARM64-NONEON: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16 -; CHECK-ARM64-NONEON: str [[SIXTY_FOUR]], [sp] +; CHECK-NONEON: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16 +; CHECK-NONEON: str [[SIXTY_FOUR]], [sp] ; CHECK: bl stacked_fpu ret void @@ -142,11 +130,11 @@ define void @check_i128_align() { i32 42, i128 %val) ; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:var128] ; CHECK: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8] -; CHECK-AARCH64: mov x[[SPREG:[0-9]+]], sp -; CHECK-AARCH64: str [[I128HI]], [x[[SPREG]], #24] -; CHECK-AARCH64: str [[I128LO]], [x[[SPREG]], #16] -; CHECK-ARM64: stp [[I128LO]], [[I128HI]], [sp, #16] -; CHECK-ARM64-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16] +; CHECK: stp [[I128LO]], [[I128HI]], [sp, #16] + +; CHECK-NONEON: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, :lo12:var128] +; CHECK-NONEON: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8] +; CHECK-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16] ; CHECK: bl check_i128_stackalign call void @check_i128_regalign(i32 0, i128 42) diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll index 36b74e5a57c8..451b9d6741ee 100644 --- a/test/CodeGen/AArch64/global-alignment.ll +++ b/test/CodeGen/AArch64/global-alignment.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s @var32 = global [3 x i32] zeroinitializer @var64 = global [3 x i64] zeroinitializer diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll new file mode 100644 index 000000000000..68aba5ebe065 --- /dev/null +++ b/test/CodeGen/AArch64/global-merge-1.ll @@ -0,0 +1,26 @@ +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s + +; RUN: llc %s 
-mtriple=aarch64-linux-gnuabi -enable-global-merge -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s + +; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS +; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS + +@m = internal global i32 0, align 4 +@n = internal global i32 0, align 4 + +define void @f1(i32 %a1, i32 %a2) { +;CHECK-APPLE-IOS: adrp x8, __MergedGlobals@PAGE +;CHECK-APPLE-IOS-NOT: adrp +;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals@PAGEOFF + store i32 %a1, i32* @m, align 4 + store i32 %a2, i32* @n, align 4 + ret void +} + +;CHECK: .type _MergedGlobals,@object // @_MergedGlobals +;CHECK: .local _MergedGlobals +;CHECK: .comm _MergedGlobals,8,8 + +;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 ; @_MergedGlobals diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll new file mode 100644 index 000000000000..a7735667b359 --- /dev/null +++ b/test/CodeGen/AArch64/global-merge-2.ll @@ -0,0 +1,51 @@ +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS + +@x = global i32 0, align 4 +@y = global i32 0, align 4 +@z = global i32 0, align 4 + +define void @f1(i32 %a1, i32 %a2) { +;CHECK-APPLE-IOS-LABEL: _f1: +;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE +;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS-NOT: adrp + store i32 %a1, i32* @x, align 4 + store i32 %a2, i32* @y, align 4 + ret void +} + +define void @g1(i32 %a1, i32 %a2) { +;CHECK-APPLE-IOS-LABEL: _g1: +;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE +;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS-NOT: adrp + store i32 %a1, i32* @y, align 4 + store i32 %a2, i32* @z, align 4 + ret void +} + +;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x +;CHECK: .globl _MergedGlobals_x +;CHECK: .align 3 +;CHECK: _MergedGlobals_x: +;CHECK: .size _MergedGlobals_x, 12 + +;CHECK: .globl x +;CHECK: x = _MergedGlobals_x +;CHECK: .globl y +;CHECK: y = _MergedGlobals_x+4 +;CHECK: .globl z +;CHECK: z = _MergedGlobals_x+8 + +;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x +;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,3 + +;CHECK-APPLE-IOS: .globl _x +;CHECK-APPLE-IOS: _x = __MergedGlobals_x +;CHECK-APPLE-IOS: .globl _y +;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4 +;CHECK-APPLE-IOS: .globl _z +;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8 +;CHECK-APPLE-IOS: .subsections_via_symbols diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll new file mode 100644 index 000000000000..d455d40edcc2 --- /dev/null +++ b/test/CodeGen/AArch64/global-merge-3.ll @@ -0,0 +1,51 @@ +; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS + +@x = global 
[1000 x i32] zeroinitializer, align 1 +@y = global [1000 x i32] zeroinitializer, align 1 +@z = internal global i32 1, align 4 + +define void @f1(i32 %a1, i32 %a2, i32 %a3) { +;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE +;CHECK-APPLE-IOS-NOT: adrp +;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF +;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE +;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_y@PAGEOFF + %x3 = getelementptr inbounds [1000 x i32]* @x, i32 0, i64 3 + %y3 = getelementptr inbounds [1000 x i32]* @y, i32 0, i64 3 + store i32 %a1, i32* %x3, align 4 + store i32 %a2, i32* %y3, align 4 + store i32 %a3, i32* @z, align 4 + ret void +} + +;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x +;CHECK: .globl _MergedGlobals_x +;CHECK: .align 4 +;CHECK: _MergedGlobals_x: +;CHECK: .size _MergedGlobals_x, 4004 + +;CHECK: .type _MergedGlobals_y,@object // @_MergedGlobals_y +;CHECK: .globl _MergedGlobals_y +;CHECK: _MergedGlobals_y: +;CHECK: .size _MergedGlobals_y, 4000 + +;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x +;CHECK-APPLE-IOS: .align 4 +;CHECK-APPLE-IOS: __MergedGlobals_x: +;CHECK-APPLE-IOS: .long 1 +;CHECK-APPLE-IOS: .space 4000 + +;CHECK-APPLE-IOS: .globl __MergedGlobals_y ; @_MergedGlobals_y +;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_y,4000,4 + +;CHECK: .globl x +;CHECK: x = _MergedGlobals_x+4 +;CHECK: .globl y +;CHECK: y = _MergedGlobals_y + +;CHECK-APPLE-IOS:.globl _x +;CHECK-APPLE-IOS: _x = __MergedGlobals_x+4 +;CHECK-APPLE-IOS:.globl _y +;CHECK-APPLE-IOS: _y = __MergedGlobals_y diff --git a/test/Transforms/GlobalMerge/ARM64/arm64.ll b/test/CodeGen/AArch64/global-merge-4.ll similarity index 78% rename from test/Transforms/GlobalMerge/ARM64/arm64.ll rename to test/CodeGen/AArch64/global-merge-4.ll index eea474a74f13..a525ccd8dee3 100644 --- a/test/Transforms/GlobalMerge/ARM64/arm64.ll +++ b/test/CodeGen/AArch64/global-merge-4.ll @@ -1,23 +1,4 @@ -; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s -; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s - -; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2 -; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2 -; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2 -; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4 -; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2 -; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2 -; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2 - -; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4 -; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2 -; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2 -; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2 -; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4 +; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -o - | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" target triple = "arm64-apple-ios7.0.0" @@ -83,6 +64,10 @@ define internal i32* @returnFoo() #1 { ret i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0) } +;CHECK: .type _MergedGlobals,@object // @_MergedGlobals +;CHECK: .local _MergedGlobals +;CHECK: .comm _MergedGlobals,60,16 + attributes #0 = { nounwind ssp } attributes #1 = { 
nounwind readnone ssp } attributes #2 = { nounwind } diff --git a/test/CodeGen/AArch64/global-merge.ll b/test/CodeGen/AArch64/global-merge.ll new file mode 100644 index 000000000000..aed1dc4d1c7b --- /dev/null +++ b/test/CodeGen/AArch64/global-merge.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck --check-prefix=NO-MERGE %s +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 -global-merge-on-external=true | FileCheck --check-prefix=NO-MERGE %s + +; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE +; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 -global-merge-on-external=true | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE + +; FIXME: add O1/O2 test for aarch64-none-linux-gnu and aarch64-apple-ios + +@m = internal global i32 0, align 4 +@n = internal global i32 0, align 4 + +define void @f1(i32 %a1, i32 %a2) { +; CHECK-LABEL: f1: +; CHECK: adrp x{{[0-9]+}}, _MergedGlobals +; CHECK-NOT: adrp + +; CHECK-APPLE-IOS-LABEL: f1: +; CHECK-APPLE-IOS: adrp x{{[0-9]+}}, __MergedGlobals +; CHECK-APPLE-IOS-NOT: adrp + store i32 %a1, i32* @m, align 4 + store i32 %a2, i32* @n, align 4 + ret void +} + +; CHECK: .local _MergedGlobals +; CHECK: .comm _MergedGlobals,8,8 +; NO-MERGE-NOT: .local _MergedGlobals + +; CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 +; CHECK-APPLE-IOS-NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,8,3 diff --git a/test/CodeGen/AArch64/global_merge_1.ll b/test/CodeGen/AArch64/global_merge_1.ll deleted file mode 100644 index e0587d6b9041..000000000000 --- a/test/CodeGen/AArch64/global_merge_1.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s - -@m = internal global i32 0, align 4 -@n = internal global i32 0, align 4 - -define void @f1(i32 %a1, i32 %a2) { -; CHECK-LABEL: f1: -; CHECK: adrp x{{[0-9]+}}, _MergedGlobals -; CHECK-NOT: adrp - store i32 %a1, i32* @m, align 4 - store i32 %a2, i32* @n, align 4 - ret void -} - -; CHECK: .local _MergedGlobals -; CHECK: .comm _MergedGlobals,8,8 - diff --git a/test/CodeGen/AArch64/got-abuse.ll b/test/CodeGen/AArch64/got-abuse.ll index 216bfef7d5c5..7a02b104e777 100644 --- a/test/CodeGen/AArch64/got-abuse.ll +++ b/test/CodeGen/AArch64/got-abuse.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s -; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj -o - %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj -o - %s ; LLVM gives well-defined semantics to this horrible construct (though C says ; it's undefined). Regardless, we shouldn't crash. The important feature here is diff --git a/test/CodeGen/AArch64/i1-contents.ll b/test/CodeGen/AArch64/i1-contents.ll new file mode 100644 index 000000000000..7f133fc3ea83 --- /dev/null +++ b/test/CodeGen/AArch64/i1-contents.ll @@ -0,0 +1,55 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s +%big = type i32 + +@var = global %big 0 + +; AAPCS: low 8 bits of %in (== w0) will be either 0 or 1. Need to extend to +; 32-bits. 
+define void @consume_i1_arg(i1 %in) { +; CHECK-LABEL: consume_i1_arg: +; CHECK: and [[BOOL32:w[0-9]+]], w0, #{{0x1|0xff}} +; CHECK: str [[BOOL32]], [{{x[0-9]+}}, :lo12:var] + %val = zext i1 %in to %big + store %big %val, %big* @var + ret void +} + +; AAPCS: low 8 bits of %val1 (== w0) will be either 0 or 1. Need to extend to +; 32-bits (doesn't really matter if it's from 1 or 8 bits). +define void @consume_i1_ret() { +; CHECK-LABEL: consume_i1_ret: +; CHECK: bl produce_i1_ret +; CHECK: and [[BOOL32:w[0-9]+]], w0, #{{0x1|0xff}} +; CHECK: str [[BOOL32]], [{{x[0-9]+}}, :lo12:var] + %val1 = call i1 @produce_i1_ret() + %val = zext i1 %val1 to %big + store %big %val, %big* @var + ret void +} + +; AAPCS: low 8 bits of w0 must be either 0 or 1. Need to mask them off. +define i1 @produce_i1_ret() { +; CHECK-LABEL: produce_i1_ret: +; CHECK: ldr [[VAR32:w[0-9]+]], [{{x[0-9]+}}, :lo12:var] +; CHECK: and w0, [[VAR32]], #{{0x1|0xff}} + %val = load %big* @var + %val1 = trunc %big %val to i1 + ret i1 %val1 +} + +define void @produce_i1_arg() { +; CHECK-LABEL: produce_i1_arg: +; CHECK: ldr [[VAR32:w[0-9]+]], [{{x[0-9]+}}, :lo12:var] +; CHECK: and w0, [[VAR32]], #{{0x1|0xff}} +; CHECK: bl consume_i1_arg + %val = load %big* @var + %val1 = trunc %big %val to i1 + call void @consume_i1_arg(i1 %val1) + ret void +} + + +;define zeroext i1 @foo(i8 %in) { +; %val = trunc i8 %in to i1 +; ret i1 %val +;} diff --git a/test/CodeGen/AArch64/i128-align.ll b/test/CodeGen/AArch64/i128-align.ll index fb363a9591b1..a1b4d6f5a446 100644 --- a/test/CodeGen/AArch64/i128-align.ll +++ b/test/CodeGen/AArch64/i128-align.ll @@ -1,4 +1,3 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=arm64-apple-ios7.0 -verify-machineinstrs -o - %s | FileCheck %s %struct = type { i32, i128, i8 } diff --git a/test/CodeGen/AArch64/i128-shift.ll b/test/CodeGen/AArch64/i128-shift.ll deleted file mode 100644 index bfc9e3c09369..000000000000 --- a/test/CodeGen/AArch64/i128-shift.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; arm64 has its own version of this in long-shift.ll. We'll just use that. 
- -define i128 @test_i128_lsl(i128 %a, i32 %shift) { -; CHECK-LABEL: test_i128_lsl: - - %sh_prom = zext i32 %shift to i128 - %shl = shl i128 %a, %sh_prom - -; CHECK: movz [[SIXTYFOUR:x[0-9]+]], #64 -; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw -; CHECK-NEXT: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[REVSHAMT]] -; CHECK: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[SHAMT:x[0-9]+]] -; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]] -; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64 -; CHECK-NEXT: lsl [[TMP3:x[0-9]+]], [[LO]], [[EXTRASHAMT]] -; CHECK-NEXT: cmp [[EXTRASHAMT]], #0 -; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], [[TMP3]], [[FALSEVAL]], ge -; CHECK-NEXT: lsl [[TMP4:x[0-9]+]], [[LO]], [[SHAMT]] -; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], xzr, [[TMP4]], ge - - ret i128 %shl -} - -define i128 @test_i128_shr(i128 %a, i32 %shift) { -; CHECK-LABEL: test_i128_shr: - - %sh_prom = zext i32 %shift to i128 - %shr = lshr i128 %a, %sh_prom - -; CHECK: movz [[SIXTYFOUR]], #64 -; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw -; CHECK-NEXT: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[REVSHAMT]] -; CHECK: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[SHAMT:x[0-9]+]] -; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]] -; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64 -; CHECK-NEXT: lsr [[TRUEVAL:x[0-9]+]], [[HI]], [[EXTRASHAMT]] -; CHECK-NEXT: cmp [[EXTRASHAMT]], #0 -; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], [[TRUEVAL]], [[FALSEVAL]], ge -; CHECK-NEXT: lsr [[TMP3:x[0-9]+]], [[HI]], [[SHAMT]] -; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], xzr, [[TMP3]], ge - - ret i128 %shr -} diff --git a/test/CodeGen/AArch64/illegal-float-ops.ll b/test/CodeGen/AArch64/illegal-float-ops.ll index 49443d240986..9f7dd998bc21 100644 --- a/test/CodeGen/AArch64/illegal-float-ops.ll +++ b/test/CodeGen/AArch64/illegal-float-ops.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s @varfloat = global float 0.0 @vardouble = global double 0.0 diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll index 22b7cc5cf954..f47b490baebd 100644 --- a/test/CodeGen/AArch64/init-array.ll +++ b/test/CodeGen/AArch64/init-array.ll @@ -1,7 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s define internal void @_GLOBAL__I_a() section ".text.startup" { ret void diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll index 91921d5aa3b7..9d833d936c06 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll @@ -1,5 +1,4 @@ -; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s -; RUN: not llc -mtriple=arm64-none-linux-gnu -o - %s 
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s define void @foo() { ; Out of range immediate for I. diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll index cc4558fa54eb..6ffc05dcbde1 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll @@ -1,4 +1,3 @@ -; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s ; RUN: not llc -mtriple=arm64-apple-ios7.0 -o - %s define void @foo() { diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll index 820063392488..172601301993 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll @@ -1,5 +1,4 @@ -; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s -; RUN: not llc -mtriple=arm64-none-linux-gnu -o - %s +; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s define void @foo() { ; 32-bit bitpattern ending in 1101 can't be produced. diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll index e7b8173f6abe..3c2f60c1f837 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll @@ -1,4 +1,3 @@ -; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s ; RUN: not llc -mtriple=arm64-apple-ios7.0 -o - %s define void @foo() { diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll deleted file mode 100644 index 365453c5fec4..000000000000 --- a/test/CodeGen/AArch64/inline-asm-constraints.ll +++ /dev/null @@ -1,137 +0,0 @@ -;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -no-integrated-as < %s | FileCheck %s - -define i64 @test_inline_constraint_r(i64 %base, i32 %offset) { -; CHECK-LABEL: test_inline_constraint_r: - %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 %base, i32 %offset) -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw - ret i64 %val -} - -define i16 @test_small_reg(i16 %lhs, i16 %rhs) { -; CHECK-LABEL: test_small_reg: - %val = call i16 asm sideeffect "add $0, $1, $2, sxth", "=r,r,r"(i16 %lhs, i16 %rhs) -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth - ret i16 %val -} - -define i64 @test_inline_constraint_r_imm(i64 %base, i32 %offset) { -; CHECK-LABEL: test_inline_constraint_r_imm: - %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 4, i32 12) -; CHECK: movz [[FOUR:x[0-9]+]], #4 -; CHECK: movz [[TWELVE:w[0-9]+]], #12 -; CHECK: add {{x[0-9]+}}, [[FOUR]], [[TWELVE]], sxtw - ret i64 %val -} - -; m is permitted to have a base/offset form. We don't do that -; currently though. -define i32 @test_inline_constraint_m(i32 *%ptr) { -; CHECK-LABEL: test_inline_constraint_m: - %val = call i32 asm "ldr $0, $1", "=r,m"(i32 *%ptr) -; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] - ret i32 %val -} - -@arr = global [8 x i32] zeroinitializer - -; Q should *never* have base/offset form even if given the chance. 
-define i32 @test_inline_constraint_Q(i32 *%ptr) { -; CHECK-LABEL: test_inline_constraint_Q: - %val = call i32 asm "ldr $0, $1", "=r,Q"(i32* getelementptr([8 x i32]* @arr, i32 0, i32 1)) -; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] - ret i32 %val -} - -@dump = global fp128 zeroinitializer - -define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) { -; CHECK: test_inline_constraint_w: - call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64) - call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128) -; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - - ; Arguably semantically dodgy to output "vN", but it's what GCC does - ; so purely for compatibility we want vector registers to be output. - call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef) - call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt) - call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl) - call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad) -; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}} -; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - ret void -} - -define void @test_inline_constraint_I() { -; CHECK-LABEL: test_inline_constraint_I: - call void asm sideeffect "add x0, x0, $0", "I"(i32 0) - call void asm sideeffect "add x0, x0, $0", "I"(i64 4095) -; CHECK: add x0, x0, #0 -; CHECK: add x0, x0, #4095 - - ret void -} - -; Skip J because it's useless - -define void @test_inline_constraint_K() { -; CHECK-LABEL: test_inline_constraint_K: - call void asm sideeffect "and w0, w0, $0", "K"(i32 2863311530) ; = 0xaaaaaaaa - call void asm sideeffect "and w0, w0, $0", "K"(i32 65535) -; CHECK: and w0, w0, #-1431655766 -; CHECK: and w0, w0, #65535 - - ret void -} - -define void @test_inline_constraint_L() { -; CHECK-LABEL: test_inline_constraint_L: - call void asm sideeffect "and x0, x0, $0", "L"(i64 4294967296) ; = 0xaaaaaaaa - call void asm sideeffect "and x0, x0, $0", "L"(i64 65535) -; CHECK: and x0, x0, #4294967296 -; CHECK: and x0, x0, #65535 - - ret void -} - -; Skip M and N because we don't support MOV pseudo-instructions yet. 
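; [editor's illustrative sketch, not part of the patch] The "K" and "L"
; constraints exercised above only accept values encodable as AArch64 logical
; (bitmask) immediates ("K" for 32-bit operations, "L" for 64-bit), which is
; why patterns such as 0xaaaaaaaa, 0xffff and 0x100000000 are chosen:
;   and w0, w0, #0xaaaaaaaa     ; alternating bits: valid 32-bit bitmask
;   and x0, x0, #0x100000000    ; single set bit: valid 64-bit bitmask
; The surviving inline-asm-constraints-bad[KL]*.ll tests keep checking that
; non-encodable values are rejected.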
- -@var = global i32 0 - -define void @test_inline_constraint_S() { -; CHECK-LABEL: test_inline_constraint_S: - call void asm sideeffect "adrp x0, $0", "S"(i32* @var) - call void asm sideeffect "adrp x0, ${0:A}", "S"(i32* @var) - call void asm sideeffect "add x0, x0, ${0:L}", "S"(i32* @var) -; CHECK: adrp x0, var -; CHECK: adrp x0, var -; CHECK: add x0, x0, #:lo12:var - ret void -} - -define i32 @test_inline_constraint_S_label(i1 %in) { -; CHECK-LABEL: test_inline_constraint_S_label: - call void asm sideeffect "adr x0, $0", "S"(i8* blockaddress(@test_inline_constraint_S_label, %loc)) -; CHECK: adr x0, .Ltmp{{[0-9]+}} - br i1 %in, label %loc, label %loc2 -loc: - ret i32 0 -loc2: - ret i32 42 -} - -define void @test_inline_constraint_Y() { -; CHECK-LABEL: test_inline_constraint_Y: - call void asm sideeffect "fcmp s0, $0", "Y"(float 0.0) -; CHECK: fcmp s0, #0.0 - ret void -} - -define void @test_inline_constraint_Z() { -; CHECK-LABEL: test_inline_constraint_Z: - call void asm sideeffect "cmp w0, $0", "Z"(i32 0) -; CHECK: cmp w0, #0 - ret void -} diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll deleted file mode 100644 index cb66335b105b..000000000000 --- a/test/CodeGen/AArch64/inline-asm-modifiers.ll +++ /dev/null @@ -1,147 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s - -@var_simple = hidden global i32 0 -@var_got = global i32 0 -@var_tlsgd = thread_local global i32 0 -@var_tlsld = thread_local(localdynamic) global i32 0 -@var_tlsie = thread_local(initialexec) global i32 0 -@var_tlsle = thread_local(localexec) global i32 0 - -define void @test_inline_modifier_L() nounwind { -; CHECK-LABEL: test_inline_modifier_L: - call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_simple) - call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_got) - call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsgd) - call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsld) - call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_tlsie) - call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsle) -; CHECK: add x0, x0, #:lo12:var_simple -; CHECK: ldr x0, [x0, #:got_lo12:var_got] -; CHECK: add x0, x0, #:tlsdesc_lo12:var_tlsgd -; CHECK: add x0, x0, #:dtprel_lo12:var_tlsld -; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie] -; CHECK: add x0, x0, #:tprel_lo12:var_tlsle - - call void asm sideeffect "add x0, x0, ${0:L}", "Si,~{x0}"(i32 64) - call void asm sideeffect "ldr x0, [x0, ${0:L}]", "Si,~{x0}"(i32 64) -; CHECK: add x0, x0, #64 -; CHECK: ldr x0, [x0, #64] - - ret void -} - -define void @test_inline_modifier_G() nounwind { -; CHECK-LABEL: test_inline_modifier_G: - call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsld) - call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsle) -; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12 -; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12 - - call void asm sideeffect "add x0, x0, ${0:G}", "Si,~{x0}"(i32 42) -; CHECK: add x0, x0, #42 - ret void -} - -define void @test_inline_modifier_A() nounwind { -; CHECK-LABEL: test_inline_modifier_A: - call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_simple) - call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_got) - call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsgd) - call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* 
@var_tlsie) - ; N.b. All tprel and dtprel relocs are modified: lo12 or granules. -; CHECK: adrp x0, var_simple -; CHECK: adrp x0, :got:var_got -; CHECK: adrp x0, :tlsdesc:var_tlsgd -; CHECK: adrp x0, :gottprel:var_tlsie - - call void asm sideeffect "adrp x0, ${0:A}", "Si,~{x0}"(i32 40) -; CHECK: adrp x0, #40 - - ret void -} - -define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind { -; CHECK-LABEL: test_inline_modifier_wx: - call i32 asm sideeffect "add $0, $0, $0", "=r,0"(i32 %small) - call i32 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i32 %small) - call i32 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i32 %small) -; CHECK: //APP -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - - call i64 asm sideeffect "add $0, $0, $0", "=r,0"(i64 %big) - call i64 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i64 %big) - call i64 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i64 %big) -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - - call i32 asm sideeffect "add ${0:w}, ${1:w}, ${1:w}", "=r,r"(i32 0) - call i32 asm sideeffect "add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0) -; CHECK: add {{w[0-9]+}}, wzr, wzr -; CHECK: add {{x[0-9]+}}, xzr, xzr - - call i32 asm sideeffect "add ${0:w}, ${0:w}, ${1:w}", "=r,Ir,0"(i32 123, i32 %small) - call i64 asm sideeffect "add ${0:x}, ${0:x}, ${1:x}", "=r,Ir,0"(i32 456, i64 %big) -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #123 -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #456 - - ret void -} - -define void @test_inline_modifier_bhsdq() nounwind { -; CHECK-LABEL: test_inline_modifier_bhsdq: - call float asm sideeffect "ldr ${0:b}, [sp]", "=w"() - call float asm sideeffect "ldr ${0:h}, [sp]", "=w"() - call float asm sideeffect "ldr ${0:s}, [sp]", "=w"() - call float asm sideeffect "ldr ${0:d}, [sp]", "=w"() - call float asm sideeffect "ldr ${0:q}, [sp]", "=w"() -; CHECK: ldr b0, [sp] -; CHECK: ldr h0, [sp] -; CHECK: ldr s0, [sp] -; CHECK: ldr d0, [sp] -; CHECK: ldr q0, [sp] - - call double asm sideeffect "ldr ${0:b}, [sp]", "=w"() - call double asm sideeffect "ldr ${0:h}, [sp]", "=w"() - call double asm sideeffect "ldr ${0:s}, [sp]", "=w"() - call double asm sideeffect "ldr ${0:d}, [sp]", "=w"() - call double asm sideeffect "ldr ${0:q}, [sp]", "=w"() -; CHECK: ldr b0, [sp] -; CHECK: ldr h0, [sp] -; CHECK: ldr s0, [sp] -; CHECK: ldr d0, [sp] -; CHECK: ldr q0, [sp] - - call void asm sideeffect "fcmp b0, ${0:b}", "Yw"(float 0.0) - call void asm sideeffect "fcmp h0, ${0:h}", "Yw"(float 0.0) - call void asm sideeffect "fcmp s0, ${0:s}", "Yw"(float 0.0) - call void asm sideeffect "fcmp d0, ${0:d}", "Yw"(float 0.0) - call void asm sideeffect "fcmp q0, ${0:q}", "Yw"(float 0.0) -; CHECK: fcmp b0, #0 -; CHECK: fcmp h0, #0 -; CHECK: fcmp s0, #0 -; CHECK: fcmp d0, #0 -; CHECK: fcmp q0, #0 - - ret void -} - -define void @test_inline_modifier_c() nounwind { -; CHECK-LABEL: test_inline_modifier_c: - call void asm sideeffect "adr x0, ${0:c}", "i"(i32 3) -; CHECK: adr x0, 3 - - ret void -} - -define void @test_inline_modifier_a() nounwind { -; CHECK-LABEL: test_inline_modifier_a: - call void asm sideeffect "prfm pldl1keep, ${0:a}", "r"(i32* @var_simple) -; CHECK: adrp [[VARHI:x[0-9]+]], var_simple -; CHECK: add x[[VARADDR:[0-9]+]], [[VARHI]], #:lo12:var_simple -; CHECK: prfm pldl1keep, [x[[VARADDR]]] - ret void -} - diff --git 
a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll index 87a42ba60a45..69fbd9972b87 100644 --- a/test/CodeGen/AArch64/jump-table.ll +++ b/test/CodeGen/AArch64/jump-table.ll @@ -1,9 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic <%s | FileCheck --check-prefix=CHECK-PIC %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s -; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s +; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s define i32 @test_jumptable(i32 %in) { ; CHECK: test_jumptable diff --git a/test/CodeGen/AArch64/large-consts.ll b/test/CodeGen/AArch64/large-consts.ll index b1f98b9cf9ed..6bf85e829f61 100644 --- a/test/CodeGen/AArch64/large-consts.ll +++ b/test/CodeGen/AArch64/large-consts.ll @@ -1,19 +1,14 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s --check-prefix=CHECK-AARCH64 -; RUN: llc -mtriple=arm64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s --check-prefix=CHECK-ARM64 +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s ; Make sure the shift amount is encoded into the instructions by LLVM because ; it's not the linker's job to put it there. 
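; [editor's illustrative sketch, not part of the patch] Under the large code
; model a 64-bit address is built from four 16-bit pieces, and the shift (the
; hw field selecting lsl #0/#16/#32/#48) must already be present in the
; encoding emitted by the compiler; the linker only fills in the 16-bit
; immediate. For a hypothetical symbol `sym` the expected shape is:
;   movz x0, #:abs_g3:sym        ; bits [63:48], hw = 3
;   movk x0, #:abs_g2_nc:sym     ; bits [47:32], hw = 2
;   movk x0, #:abs_g1_nc:sym     ; bits [31:16], hw = 1
;   movk x0, #:abs_g0_nc:sym     ; bits [15:0],  hw = 0
; The 'A' bits in the CHECK encodings below are the linker-filled immediate;
; the fixed bits assert that the hw field is already set.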
define double @foo() { -; CHECK-AARCH64: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [A,A,0xe0'A',0xd2'A'] -; CHECK-AARCH64: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [A,A,0xc0'A',0xf2'A'] -; CHECK-AARCH64: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [A,A,0xa0'A',0xf2'A'] -; CHECK-AARCH64: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [A,A,0x80'A',0xf2'A'] -; CHECK-ARM64: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [0bAAA01000,A,0b111AAAAA,0xd2] -; CHECK-ARM64: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2] -; CHECK-ARM64: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2] -; CHECK-ARM64: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2] +; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [0bAAA01000,A,0b111AAAAA,0xd2] +; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2] +; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2] +; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2] ret double 3.14159 } diff --git a/test/CodeGen/AArch64/large-frame.ll b/test/CodeGen/AArch64/large-frame.ll deleted file mode 100644 index 79dc6487f1fc..000000000000 --- a/test/CodeGen/AArch64/large-frame.ll +++ /dev/null @@ -1,120 +0,0 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s -; arm64 has a separate copy: aarch64-large-frame.ll (codegen was too different). -declare void @use_addr(i8*) - -@addr = global i8* null - -define void @test_bigframe() { -; CHECK-LABEL: test_bigframe: -; CHECK: .cfi_startproc - - %var1 = alloca i8, i32 20000000 - %var2 = alloca i8, i32 16 - %var3 = alloca i8, i32 20000000 -; CHECK: sub sp, sp, #496 -; CHECK: .cfi_def_cfa sp, 496 -; CHECK: str x30, [sp, #488] - ; Total adjust is 39999536 -; CHECK: movz [[SUBCONST:x[0-9]+]], #22576 -; CHECK: movk [[SUBCONST]], #610, lsl #16 -; CHECK: sub sp, sp, [[SUBCONST]] -; CHECK: .cfi_def_cfa sp, 40000032 -; CHECK: .cfi_offset x30, -8 - - ; Total offset is 20000024 -; CHECK: movz [[VAR1OFFSET:x[0-9]+]], #11544 -; CHECK: movk [[VAR1OFFSET]], #305, lsl #16 -; CHECK: add {{x[0-9]+}}, sp, [[VAR1OFFSET]] - store volatile i8* %var1, i8** @addr - - %var1plus2 = getelementptr i8* %var1, i32 2 - store volatile i8* %var1plus2, i8** @addr - -; CHECK: movz [[VAR2OFFSET:x[0-9]+]], #11528 -; CHECK: movk [[VAR2OFFSET]], #305, lsl #16 -; CHECK: add {{x[0-9]+}}, sp, [[VAR2OFFSET]] - store volatile i8* %var2, i8** @addr - - %var2plus2 = getelementptr i8* %var2, i32 2 - store volatile i8* %var2plus2, i8** @addr - - store volatile i8* %var3, i8** @addr - - %var3plus2 = getelementptr i8* %var3, i32 2 - store volatile i8* %var3plus2, i8** @addr - -; CHECK: movz [[ADDCONST:x[0-9]+]], #22576 -; CHECK: movk [[ADDCONST]], #610, lsl #16 -; CHECK: add sp, sp, [[ADDCONST]] -; CHECK: .cfi_endproc - ret void -} - -define void @test_mediumframe() { -; CHECK-LABEL: test_mediumframe: - %var1 = alloca i8, i32 1000000 - %var2 = alloca i8, i32 16 - %var3 = alloca i8, i32 1000000 -; CHECK: sub sp, sp, #496 -; CHECK: str x30, [sp, #488] -; CHECK: sub sp, sp, #688 -; CHECK-NEXT: sub sp, sp, #488, lsl #12 - - store volatile i8* %var1, i8** @addr -; CHECK: add [[VAR1ADDR:x[0-9]+]], sp, #600 -; CHECK: add [[VAR1ADDR]], [[VAR1ADDR]], #244, lsl #12 - - %var1plus2 = getelementptr i8* %var1, i32 2 - store volatile i8* %var1plus2, i8** @addr -; CHECK: add [[VAR1PLUS2:x[0-9]+]], 
{{x[0-9]+}}, #2 - - store volatile i8* %var2, i8** @addr -; CHECK: add [[VAR2ADDR:x[0-9]+]], sp, #584 -; CHECK: add [[VAR2ADDR]], [[VAR2ADDR]], #244, lsl #12 - - %var2plus2 = getelementptr i8* %var2, i32 2 - store volatile i8* %var2plus2, i8** @addr -; CHECK: add [[VAR2PLUS2:x[0-9]+]], {{x[0-9]+}}, #2 - - store volatile i8* %var3, i8** @addr - - %var3plus2 = getelementptr i8* %var3, i32 2 - store volatile i8* %var3plus2, i8** @addr - -; CHECK: add sp, sp, #688 -; CHECK: add sp, sp, #488, lsl #12 -; CHECK: ldr x30, [sp, #488] -; CHECK: add sp, sp, #496 - ret void -} - - -@bigspace = global [8 x i64] zeroinitializer - -; If temporary registers are allocated for adjustment, they should *not* clobber -; argument registers. -define void @test_tempallocation([8 x i64] %val) nounwind { -; CHECK-LABEL: test_tempallocation: - %var = alloca i8, i32 1000000 -; CHECK: sub sp, sp, - -; Make sure the prologue is reasonably efficient -; CHECK-NEXT: stp x29, x30, [sp, -; CHECK-NEXT: stp x25, x26, [sp, -; CHECK-NEXT: stp x23, x24, [sp, -; CHECK-NEXT: stp x21, x22, [sp, -; CHECK-NEXT: stp x19, x20, [sp, - -; Make sure we don't trash an argument register -; CHECK-NOT: movz {{x[0-7],}} -; CHECK: sub sp, sp, - -; CHECK-NOT: movz {{x[0-7],}} - -; CHECK: bl use_addr - call void @use_addr(i8* %var) - - store [8 x i64] %val, [8 x i64]* @bigspace - ret void -; CHECK: ret -} diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll new file mode 100644 index 000000000000..e4f4295c8503 --- /dev/null +++ b/test/CodeGen/AArch64/ldst-opt.ll @@ -0,0 +1,767 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s + +; This file contains tests for the AArch64 load/store optimizer. + +%padding = type { i8*, i8*, i8*, i8* } +%s.word = type { i32, i32 } +%s.doubleword = type { i64, i32 } +%s.quadword = type { fp128, i32 } +%s.float = type { float, i32 } +%s.double = type { double, i32 } +%struct.word = type { %padding, %s.word } +%struct.doubleword = type { %padding, %s.doubleword } +%struct.quadword = type { %padding, %s.quadword } +%struct.float = type { %padding, %s.float } +%struct.double = type { %padding, %s.double } + +; Check the following transform: +; +; (ldr|str) X, [x0, #32] +; ... +; add x0, x0, #32 +; -> +; (ldr|str) X, [x0, #32]! +; +; with X being either w1, x1, s0, d0 or q0. + +declare void @bar_word(%s.word*, i32) + +define void @load-pre-indexed-word(%struct.word* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-word +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1, i32 0 + %add = load i32* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %add) + ret void +} + +define void @store-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-word +; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1, i32 0 + store i32 %val, i32* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1 + tail call void @bar_word(%s.word* %c, i32 %val) + ret void +} + +declare void @bar_doubleword(%s.doubleword*, i64) + +define void @load-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-doubleword +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
+entry: + %a = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1, i32 0 + %add = load i64* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1 + tail call void @bar_doubleword(%s.doubleword* %c, i64 %add) + ret void +} + +define void @store-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-doubleword +; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1, i32 0 + store i64 %val, i64* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1 + tail call void @bar_doubleword(%s.doubleword* %c, i64 %val) + ret void +} + +declare void @bar_quadword(%s.quadword*, fp128) + +define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-quadword +; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1, i32 0 + %add = load fp128* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1 + tail call void @bar_quadword(%s.quadword* %c, fp128 %add) + ret void +} + +define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-quadword +; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1, i32 0 + store fp128 %val, fp128* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1 + tail call void @bar_quadword(%s.quadword* %c, fp128 %val) + ret void +} + +declare void @bar_float(%s.float*, float) + +define void @load-pre-indexed-float(%struct.float* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-float +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1, i32 0 + %add = load float* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1 + tail call void @bar_float(%s.float* %c, float %add) + ret void +} + +define void @store-pre-indexed-float(%struct.float* %ptr, float %val) nounwind { +; CHECK-LABEL: store-pre-indexed-float +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1, i32 0 + store float %val, float* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1 + tail call void @bar_float(%s.float* %c, float %val) + ret void +} + +declare void @bar_double(%s.double*, double) + +define void @load-pre-indexed-double(%struct.double* %ptr) nounwind { +; CHECK-LABEL: load-pre-indexed-double +; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #32]! +entry: + %a = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1, i32 0 + %add = load double* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1 + tail call void @bar_double(%s.double* %c, double %add) + ret void +} + +define void @store-pre-indexed-double(%struct.double* %ptr, double %val) nounwind { +; CHECK-LABEL: store-pre-indexed-double +; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
+entry: + %a = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1, i32 0 + store double %val, double* %a, align 4 + br label %bar +bar: + %c = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1 + tail call void @bar_double(%s.double* %c, double %val) + ret void +} + +; Check the following transform: +; +; add x8, x8, #16 +; ... +; ldr X, [x8] +; -> +; ldr X, [x8, #16]! +; +; with X being either w0, x0, s0, d0 or q0. + +%pre.struct.i32 = type { i32, i32, i32} +%pre.struct.i64 = type { i32, i64, i64} +%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>} +%pre.struct.float = type { i32, float, float} +%pre.struct.double = type { i32, double, double} + +define i32 @load-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond, + %pre.struct.i32* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-word2 +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #4]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i32** %this + %gep1 = getelementptr inbounds %pre.struct.i32* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i32* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load i32* %retptr + ret i32 %ret +} + +define i64 @load-pre-indexed-doubleword2(%pre.struct.i64** %this, i1 %cond, + %pre.struct.i64* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-doubleword2 +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #8]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i64** %this + %gep1 = getelementptr inbounds %pre.struct.i64* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i64* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load i64* %retptr + ret i64 %ret +} + +define <2 x i64> @load-pre-indexed-quadword2(%pre.struct.i128** %this, i1 %cond, + %pre.struct.i128* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-quadword2 +; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #16]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i128** %this + %gep1 = getelementptr inbounds %pre.struct.i128* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i128* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load <2 x i64>* %retptr + ret <2 x i64> %ret +} + +define float @load-pre-indexed-float2(%pre.struct.float** %this, i1 %cond, + %pre.struct.float* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-float2 +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #4]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.float** %this + %gep1 = getelementptr inbounds %pre.struct.float* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.float* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load float* %retptr + ret float %ret +} + +define double @load-pre-indexed-double2(%pre.struct.double** %this, i1 %cond, + %pre.struct.double* %load2) nounwind { +; CHECK-LABEL: load-pre-indexed-double2 +; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #8]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.double** %this + %gep1 = getelementptr inbounds %pre.struct.double* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.double* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ] + %ret = load double* %retptr + ret double %ret +} + +; Check the following transform: +; +; add x8, x8, #16 +; ... +; str X, [x8] +; -> +; str X, [x8, #16]! +; +; with X being either w0, x0, s0, d0 or q0. + +define void @store-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond, + %pre.struct.i32* %load2, + i32 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-word2 +; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i32** %this + %gep1 = getelementptr inbounds %pre.struct.i32* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i32* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ] + store i32 %val, i32* %retptr + ret void +} + +define void @store-pre-indexed-doubleword2(%pre.struct.i64** %this, i1 %cond, + %pre.struct.i64* %load2, + i64 %val) nounwind { +; CHECK-LABEL: store-pre-indexed-doubleword2 +; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #8]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i64** %this + %gep1 = getelementptr inbounds %pre.struct.i64* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i64* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ] + store i64 %val, i64* %retptr + ret void +} + +define void @store-pre-indexed-quadword2(%pre.struct.i128** %this, i1 %cond, + %pre.struct.i128* %load2, + <2 x i64> %val) nounwind { +; CHECK-LABEL: store-pre-indexed-quadword2 +; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #16]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.i128** %this + %gep1 = getelementptr inbounds %pre.struct.i128* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.i128* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ] + store <2 x i64> %val, <2 x i64>* %retptr + ret void +} + +define void @store-pre-indexed-float2(%pre.struct.float** %this, i1 %cond, + %pre.struct.float* %load2, + float %val) nounwind { +; CHECK-LABEL: store-pre-indexed-float2 +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #4]! + br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.float** %this + %gep1 = getelementptr inbounds %pre.struct.float* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.float* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ] + store float %val, float* %retptr + ret void +} + +define void @store-pre-indexed-double2(%pre.struct.double** %this, i1 %cond, + %pre.struct.double* %load2, + double %val) nounwind { +; CHECK-LABEL: store-pre-indexed-double2 +; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #8]! 
+ br i1 %cond, label %if.then, label %if.end +if.then: + %load1 = load %pre.struct.double** %this + %gep1 = getelementptr inbounds %pre.struct.double* %load1, i64 0, i32 1 + br label %return +if.end: + %gep2 = getelementptr inbounds %pre.struct.double* %load2, i64 0, i32 2 + br label %return +return: + %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ] + store double %val, double* %retptr + ret void +} + +; Check the following transform: +; +; ldr X, [x20] +; ... +; add x20, x20, #32 +; -> +; ldr X, [x20], #32 +; +; with X being either w0, x0, s0, d0 or q0. + +define void @load-post-indexed-word(i32* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-word +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #16 +entry: + %gep1 = getelementptr i32* %array, i64 2 + br label %body + +body: + %iv2 = phi i32* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i32* %iv2, i64 -1 + %load = load i32* %gep2 + call void @use-word(i32 %load) + %load2 = load i32* %iv2 + call void @use-word(i32 %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i32* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @load-post-indexed-doubleword(i64* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-doubleword +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #32 +entry: + %gep1 = getelementptr i64* %array, i64 2 + br label %body + +body: + %iv2 = phi i64* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i64* %iv2, i64 -1 + %load = load i64* %gep2 + call void @use-doubleword(i64 %load) + %load2 = load i64* %iv2 + call void @use-doubleword(i64 %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i64* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @load-post-indexed-quadword(<2 x i64>* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-quadword +; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #64 +entry: + %gep1 = getelementptr <2 x i64>* %array, i64 2 + br label %body + +body: + %iv2 = phi <2 x i64>* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr <2 x i64>* %iv2, i64 -1 + %load = load <2 x i64>* %gep2 + call void @use-quadword(<2 x i64> %load) + %load2 = load <2 x i64>* %iv2 + call void @use-quadword(<2 x i64> %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr <2 x i64>* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @load-post-indexed-float(float* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-float +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #16 +entry: + %gep1 = getelementptr float* %array, i64 2 + br label %body + +body: + %iv2 = phi float* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr float* %iv2, i64 -1 + %load = load float* %gep2 + call void @use-float(float %load) + %load2 = load float* %iv2 + call void @use-float(float %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr float* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @load-post-indexed-double(double* %array, i64 %count) nounwind { +; CHECK-LABEL: load-post-indexed-double +; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], 
#32 +entry: + %gep1 = getelementptr double* %array, i64 2 + br label %body + +body: + %iv2 = phi double* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr double* %iv2, i64 -1 + %load = load double* %gep2 + call void @use-double(double %load) + %load2 = load double* %iv2 + call void @use-double(double %load2) + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr double* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +; Check the following transform: +; +; str X, [x20] +; ... +; add x20, x20, #32 +; -> +; str X, [x20], #32 +; +; with X being either w0, x0, s0, d0 or q0. + +define void @store-post-indexed-word(i32* %array, i64 %count, i32 %val) nounwind { +; CHECK-LABEL: store-post-indexed-word +; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #16 +entry: + %gep1 = getelementptr i32* %array, i64 2 + br label %body + +body: + %iv2 = phi i32* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i32* %iv2, i64 -1 + %load = load i32* %gep2 + call void @use-word(i32 %load) + store i32 %val, i32* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i32* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @store-post-indexed-doubleword(i64* %array, i64 %count, i64 %val) nounwind { +; CHECK-LABEL: store-post-indexed-doubleword +; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #32 +entry: + %gep1 = getelementptr i64* %array, i64 2 + br label %body + +body: + %iv2 = phi i64* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr i64* %iv2, i64 -1 + %load = load i64* %gep2 + call void @use-doubleword(i64 %load) + store i64 %val, i64* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr i64* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @store-post-indexed-quadword(<2 x i64>* %array, i64 %count, <2 x i64> %val) nounwind { +; CHECK-LABEL: store-post-indexed-quadword +; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #64 +entry: + %gep1 = getelementptr <2 x i64>* %array, i64 2 + br label %body + +body: + %iv2 = phi <2 x i64>* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr <2 x i64>* %iv2, i64 -1 + %load = load <2 x i64>* %gep2 + call void @use-quadword(<2 x i64> %load) + store <2 x i64> %val, <2 x i64>* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr <2 x i64>* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @store-post-indexed-float(float* %array, i64 %count, float %val) nounwind { +; CHECK-LABEL: store-post-indexed-float +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #16 +entry: + %gep1 = getelementptr float* %array, i64 2 + br label %body + +body: + %iv2 = phi float* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr float* %iv2, i64 -1 + %load = load float* %gep2 + call void @use-float(float %load) + store float %val, float* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr float* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +define void @store-post-indexed-double(double* %array, i64 %count, double %val) nounwind { +; CHECK-LABEL: 
store-post-indexed-double +; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #32 +entry: + %gep1 = getelementptr double* %array, i64 2 + br label %body + +body: + %iv2 = phi double* [ %gep3, %body ], [ %gep1, %entry ] + %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ] + %gep2 = getelementptr double* %iv2, i64 -1 + %load = load double* %gep2 + call void @use-double(double %load) + store double %val, double* %iv2 + %iv.next = add i64 %iv, -4 + %gep3 = getelementptr double* %iv2, i64 4 + %cond = icmp eq i64 %iv.next, 0 + br i1 %cond, label %exit, label %body + +exit: + ret void +} + +declare void @use-word(i32) +declare void @use-doubleword(i64) +declare void @use-quadword(<2 x i64>) +declare void @use-float(float) +declare void @use-double(double) + +; Check the following transform: +; +; (ldr|str) X, [x20] +; ... +; sub x20, x20, #16 +; -> +; (ldr|str) X, [x20], #-16 +; +; with X being either w0, x0, s0, d0 or q0. + +define void @post-indexed-sub-word(i32* %a, i32* %b, i64 %count) nounwind { +; CHECK-LABEL: post-indexed-sub-word +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #-8 +; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #-8 + br label %for.body +for.body: + %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ] + %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ] + %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ] + %gep1 = getelementptr i32* %phi1, i64 -1 + %load1 = load i32* %gep1 + %gep2 = getelementptr i32* %phi2, i64 -1 + store i32 %load1, i32* %gep2 + %load2 = load i32* %phi1 + store i32 %load2, i32* %phi2 + %dec.i = add nsw i64 %i, -1 + %gep3 = getelementptr i32* %phi2, i64 -2 + %gep4 = getelementptr i32* %phi1, i64 -2 + %cond = icmp sgt i64 %dec.i, 0 + br i1 %cond, label %for.body, label %end +end: + ret void +} + +define void @post-indexed-sub-doubleword(i64* %a, i64* %b, i64 %count) nounwind { +; CHECK-LABEL: post-indexed-sub-doubleword +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #-16 +; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #-16 + br label %for.body +for.body: + %phi1 = phi i64* [ %gep4, %for.body ], [ %b, %0 ] + %phi2 = phi i64* [ %gep3, %for.body ], [ %a, %0 ] + %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ] + %gep1 = getelementptr i64* %phi1, i64 -1 + %load1 = load i64* %gep1 + %gep2 = getelementptr i64* %phi2, i64 -1 + store i64 %load1, i64* %gep2 + %load2 = load i64* %phi1 + store i64 %load2, i64* %phi2 + %dec.i = add nsw i64 %i, -1 + %gep3 = getelementptr i64* %phi2, i64 -2 + %gep4 = getelementptr i64* %phi1, i64 -2 + %cond = icmp sgt i64 %dec.i, 0 + br i1 %cond, label %for.body, label %end +end: + ret void +} + +define void @post-indexed-sub-quadword(<2 x i64>* %a, <2 x i64>* %b, i64 %count) nounwind { +; CHECK-LABEL: post-indexed-sub-quadword +; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #-32 +; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #-32 + br label %for.body +for.body: + %phi1 = phi <2 x i64>* [ %gep4, %for.body ], [ %b, %0 ] + %phi2 = phi <2 x i64>* [ %gep3, %for.body ], [ %a, %0 ] + %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ] + %gep1 = getelementptr <2 x i64>* %phi1, i64 -1 + %load1 = load <2 x i64>* %gep1 + %gep2 = getelementptr <2 x i64>* %phi2, i64 -1 + store <2 x i64> %load1, <2 x i64>* %gep2 + %load2 = load <2 x i64>* %phi1 + store <2 x i64> %load2, <2 x i64>* %phi2 + %dec.i = add nsw i64 %i, -1 + %gep3 = getelementptr <2 x i64>* %phi2, i64 -2 + %gep4 = getelementptr <2 x i64>* %phi1, i64 -2 + %cond = icmp sgt i64 %dec.i, 0 + br i1 %cond, label %for.body, label %end +end: + ret void +} + +define void @post-indexed-sub-float(float* %a, float* %b, i64 %count) nounwind { +; 
CHECK-LABEL: post-indexed-sub-float +; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #-8 +; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #-8 + br label %for.body +for.body: + %phi1 = phi float* [ %gep4, %for.body ], [ %b, %0 ] + %phi2 = phi float* [ %gep3, %for.body ], [ %a, %0 ] + %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ] + %gep1 = getelementptr float* %phi1, i64 -1 + %load1 = load float* %gep1 + %gep2 = getelementptr float* %phi2, i64 -1 + store float %load1, float* %gep2 + %load2 = load float* %phi1 + store float %load2, float* %phi2 + %dec.i = add nsw i64 %i, -1 + %gep3 = getelementptr float* %phi2, i64 -2 + %gep4 = getelementptr float* %phi1, i64 -2 + %cond = icmp sgt i64 %dec.i, 0 + br i1 %cond, label %for.body, label %end +end: + ret void +} + +define void @post-indexed-sub-double(double* %a, double* %b, i64 %count) nounwind { +; CHECK-LABEL: post-indexed-sub-double +; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], #-16 +; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #-16 + br label %for.body +for.body: + %phi1 = phi double* [ %gep4, %for.body ], [ %b, %0 ] + %phi2 = phi double* [ %gep3, %for.body ], [ %a, %0 ] + %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ] + %gep1 = getelementptr double* %phi1, i64 -1 + %load1 = load double* %gep1 + %gep2 = getelementptr double* %phi2, i64 -1 + store double %load1, double* %gep2 + %load2 = load double* %phi1 + store double %load2, double* %phi2 + %dec.i = add nsw i64 %i, -1 + %gep3 = getelementptr double* %phi2, i64 -2 + %gep4 = getelementptr double* %phi1, i64 -2 + %cond = icmp sgt i64 %dec.i, 0 + br i1 %cond, label %for.body, label %end +end: + ret void +} diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll index 2b42d8ec0830..e2fa08bcce69 100644 --- a/test/CodeGen/AArch64/ldst-regoffset.ll +++ b/test/CodeGen/AArch64/ldst-regoffset.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var_8bit = global i8 0 @var_16bit = global i16 0 diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll index 36944ba9a8a8..1de8443d9ed2 100644 --- a/test/CodeGen/AArch64/ldst-unscaledimm.ll +++ b/test/CodeGen/AArch64/ldst-unscaledimm.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var_8bit = global i8 0 @var_16bit = global i16 0 diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll index b3359b34f06a..e171d22b6c7c 100644 --- a/test/CodeGen/AArch64/ldst-unsignedimm.ll +++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck 
%s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var_8bit = global i8 0 @var_16bit = global i16 0 diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg index c6f83453ac20..125995cebf11 100644 --- a/test/CodeGen/AArch64/lit.local.cfg +++ b/test/CodeGen/AArch64/lit.local.cfg @@ -1,4 +1,10 @@ -targets = set(config.root.targets_to_build.split()) -if 'AArch64' not in targets or 'ARM64' not in targets: +import re + +config.suffixes = ['.ll'] + +if not 'AArch64' in config.root.targets: config.unsupported = True +# For now we don't test arm64-win32. +if re.search(r'cygwin|mingw32|win32', config.target_triple): + config.unsupported = True diff --git a/test/CodeGen/AArch64/literal_pools_float.ll b/test/CodeGen/AArch64/literal_pools_float.ll index 769a68bebc9c..e53b8b62c6f3 100644 --- a/test/CodeGen/AArch64/literal_pools_float.ll +++ b/test/CodeGen/AArch64/literal_pools_float.ll @@ -1,11 +1,7 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -code-model=large -mcpu=cyclone | FileCheck --check-prefix=CHECK-LARGE %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -mcpu=cyclone | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -code-model=large -mcpu=cyclone | FileCheck --check-prefix=CHECK-LARGE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s @varfloat = global float 0.0 @vardouble = global double 0.0 diff --git a/test/CodeGen/AArch64/literal_pools_int.ll b/test/CodeGen/AArch64/literal_pools_int.ll deleted file mode 100644 index 33a73d58bb85..000000000000 --- a/test/CodeGen/AArch64/literal_pools_int.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s -; arm64 does not use literal pools for integers so there is nothing to check. 
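; [editor's illustrative sketch, not part of the patch] The deleted checks
; below expected each integer mask to be loaded from a .LCPIx_y literal pool
; via adrp+ldr (or movz/movk of the pool address in the large model). The
; merged backend materializes such constants directly in registers instead,
; roughly (register choice is an assumption; the first mask below is
; 123456785 = 0x075bcd11):
;   movz w8, #0xcd11               ; low 16 bits of the mask
;   movk w8, #0x75b, lsl #16       ; high 16 bits
; so there is no integer constant pool left to check.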
- -@var32 = global i32 0 -@var64 = global i64 0 - -define void @foo() { -; CHECK-LABEL: foo: - %val32 = load i32* @var32 - %val64 = load i64* @var64 - - %val32_lit32 = and i32 %val32, 123456785 - store volatile i32 %val32_lit32, i32* @var32 -; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]] -; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]] - -; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] -; CHECK-LARGE: ldr {{w[0-9]+}}, [x[[LITADDR]]] - - %val64_lit32 = and i64 %val64, 305402420 - store volatile i64 %val64_lit32, i64* @var64 -; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]] -; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]] - -; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] -; CHECK-LARGE: ldr {{w[0-9]+}}, [x[[LITADDR]]] - - %val64_lit32signed = and i64 %val64, -12345678 - store volatile i64 %val64_lit32signed, i64* @var64 -; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]] -; CHECK: ldrsw {{x[0-9]+}}, [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]] - -; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] -; CHECK-LARGE: ldrsw {{x[0-9]+}}, [x[[LITADDR]]] - - %val64_lit64 = and i64 %val64, 1234567898765432 - store volatile i64 %val64_lit64, i64* @var64 -; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]] -; CHECK: ldr {{x[0-9]+}}, [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]] - -; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] -; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] -; CHECK-LARGE: ldr {{x[0-9]+}}, [x[[LITADDR]]] - - ret void -} diff --git a/test/CodeGen/AArch64/local_vars.ll b/test/CodeGen/AArch64/local_vars.ll index 1a76d5bcc1fe..2f5b9f2adb48 100644 --- a/test/CodeGen/AArch64/local_vars.ll +++ b/test/CodeGen/AArch64/local_vars.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-AARCH64 %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-ARM64 %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-ARM64 %s ; Make sure a reasonably sane prologue and epilogue are ; generated. 
This test is not robust in the face of an frame-handling diff --git a/test/CodeGen/AArch64/logical-imm.ll b/test/CodeGen/AArch64/logical-imm.ll index 3ae63ad16f67..a5e4a9956de7 100644 --- a/test/CodeGen/AArch64/logical-imm.ll +++ b/test/CodeGen/AArch64/logical-imm.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s @var32 = global i32 0 diff --git a/test/CodeGen/AArch64/logical_shifted_reg.ll b/test/CodeGen/AArch64/logical_shifted_reg.ll index 49b253bcfde6..b249d72e0f90 100644 --- a/test/CodeGen/AArch64/logical_shifted_reg.ll +++ b/test/CodeGen/AArch64/logical_shifted_reg.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var1_32 = global i32 0 @var2_32 = global i32 0 diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll index 3359616fa8d2..276c54d2cc4e 100644 --- a/test/CodeGen/AArch64/mature-mc-support.ll +++ b/test/CodeGen/AArch64/mature-mc-support.ll @@ -1,16 +1,10 @@ ; Test that inline assembly is parsed by the MC layer when MC support is mature ; (even when the output is assembly). -; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t1 -; RUN: FileCheck %s < %t1 - -; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t2 -; RUN: FileCheck %s < %t2 - -; RUN: not llc -mtriple=arm64-pc-linux < %s > /dev/null 2> %t3 +; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t3 ; RUN: FileCheck %s < %t3 -; RUN: not llc -mtriple=arm64-pc-linux -filetype=obj < %s > /dev/null 2> %t4 +; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t4 ; RUN: FileCheck %s < %t4 module asm " .this_directive_is_very_unlikely_to_exist" diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll index 876eb52df62b..93c181271755 100644 --- a/test/CodeGen/AArch64/movw-consts.ll +++ b/test/CodeGen/AArch64/movw-consts.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK define i64 @test0() { ; CHECK-LABEL: test0: @@ -10,50 +9,43 @@ define i64 @test0() { define i64 @test1() { ; CHECK-LABEL: test1: -; CHECK-AARCH64: movz x0, #1 -; CHECK-ARM64: orr w0, wzr, #0x1 +; CHECK: orr w0, wzr, #0x1 ret i64 1 } define i64 @test2() { ; CHECK-LABEL: test2: -; CHECK-AARCH64: movz x0, #65535 -; CHECK-ARM64: orr w0, wzr, #0xffff +; CHECK: orr w0, wzr, #0xffff ret i64 65535 } define i64 @test3() { ; CHECK-LABEL: test3: -; CHECK-AARCH64: movz x0, #1, lsl #16 -; CHECK-ARM64: orr w0, wzr, #0x10000 +; CHECK: orr w0, wzr, #0x10000 ret i64 65536 } define i64 @test4() { ; CHECK-LABEL: test4: -; CHECK-AARCH64: movz x0, #65535, lsl #16 -; CHECK-ARM64: orr w0, wzr, #0xffff0000 +; CHECK: orr w0, wzr, #0xffff0000 ret i64 4294901760 } define i64 @test5() { ; CHECK-LABEL: test5: -; CHECK-AARCH64: movz x0, #1, lsl #32 -; CHECK-ARM64: orr x0, xzr, #0x100000000 +; CHECK: orr x0, xzr, #0x100000000 ret i64 4294967296 } 
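; [editor's illustrative sketch, not part of the patch] The rewritten checks
; in this file all rely on the same trick: each constant is a valid AArch64
; logical (bitmask) immediate, so a single ORR against the zero register
; materializes it without MOVZ, and writing the w-form also clears the upper
; 32 bits of the 64-bit result, e.g.:
;   orr w0, wzr, #0xffff           ; i64 65535
;   orr x0, xzr, #0x100000000      ; i64 4294967296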
define i64 @test6() { ; CHECK-LABEL: test6: -; CHECK-AARCH64: movz x0, #65535, lsl #32 -; CHECK-ARM64: orr x0, xzr, #0xffff00000000 +; CHECK: orr x0, xzr, #0xffff00000000 ret i64 281470681743360 } define i64 @test7() { ; CHECK-LABEL: test7: -; CHECK-AARCH64: movz x0, #1, lsl #48 -; CHECK-ARM64: orr x0, xzr, #0x1000000000000 +; CHECK: orr x0, xzr, #0x1000000000000 ret i64 281474976710656 } @@ -83,40 +75,35 @@ define i64 @test10() { define void @test11() { ; CHECK-LABEL: test11: -; CHECK-AARCH64: mov {{w[0-9]+}}, wzr -; CHECK-ARM64: str wzr +; CHECK: str wzr store i32 0, i32* @var32 ret void } define void @test12() { ; CHECK-LABEL: test12: -; CHECK-AARCH64: movz {{w[0-9]+}}, #1 -; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0x1 +; CHECK: orr {{w[0-9]+}}, wzr, #0x1 store i32 1, i32* @var32 ret void } define void @test13() { ; CHECK-LABEL: test13: -; CHECK-AARCH64: movz {{w[0-9]+}}, #65535 -; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0xffff +; CHECK: orr {{w[0-9]+}}, wzr, #0xffff store i32 65535, i32* @var32 ret void } define void @test14() { ; CHECK-LABEL: test14: -; CHECK-AARCH64: movz {{w[0-9]+}}, #1, lsl #16 -; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0x10000 +; CHECK: orr {{w[0-9]+}}, wzr, #0x10000 store i32 65536, i32* @var32 ret void } define void @test15() { ; CHECK-LABEL: test15: -; CHECK-AARCH64: movz {{w[0-9]+}}, #65535, lsl #16 -; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0xffff0000 +; CHECK: orr {{w[0-9]+}}, wzr, #0xffff0000 store i32 4294901760, i32* @var32 ret void } @@ -132,7 +119,6 @@ define i64 @test17() { ; CHECK-LABEL: test17: ; Mustn't MOVN w0 here. -; CHECK-AARCH64: movn x0, #2 -; CHECK-ARM64: orr x0, xzr, #0xfffffffffffffffd +; CHECK: orr x0, xzr, #0xfffffffffffffffd ret i64 -3 } diff --git a/test/CodeGen/AArch64/movw-shift-encoding.ll b/test/CodeGen/AArch64/movw-shift-encoding.ll index 8a0da4cb9324..178fccce333b 100644 --- a/test/CodeGen/AArch64/movw-shift-encoding.ll +++ b/test/CodeGen/AArch64/movw-shift-encoding.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s --check-prefix=CHECK-AARCH64 -; RUN: llc -mtriple=arm64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s --check-prefix=CHECK-ARM64 +; RUN: llc -mtriple=aarch64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s @var = global i32 0 @@ -8,13 +7,9 @@ define i32* @get_var() { ret i32* @var -; CHECK-AARCH64: movz x0, #:abs_g3:var // encoding: [A,A,0xe0'A',0xd2'A'] -; CHECK-AARCH64: movk x0, #:abs_g2_nc:var // encoding: [A,A,0xc0'A',0xf2'A'] -; CHECK-AARCH64: movk x0, #:abs_g1_nc:var // encoding: [A,A,0xa0'A',0xf2'A'] -; CHECK-AARCH64: movk x0, #:abs_g0_nc:var // encoding: [A,A,0x80'A',0xf2'A'] -; CHECK-ARM64: movz x0, #:abs_g3:var // encoding: [0bAAA00000,A,0b111AAAAA,0xd2] -; CHECK-ARM64: movk x0, #:abs_g2_nc:var // encoding: [0bAAA00000,A,0b110AAAAA,0xf2] -; CHECK-ARM64: movk x0, #:abs_g1_nc:var // encoding: [0bAAA00000,A,0b101AAAAA,0xf2] -; CHECK-ARM64: movk x0, #:abs_g0_nc:var // encoding: [0bAAA00000,A,0b100AAAAA,0xf2] +; CHECK: movz x0, #:abs_g3:var // encoding: [0bAAA00000,A,0b111AAAAA,0xd2] +; CHECK: movk x0, #:abs_g2_nc:var // encoding: [0bAAA00000,A,0b110AAAAA,0xf2] +; CHECK: movk x0, #:abs_g1_nc:var // encoding: [0bAAA00000,A,0b101AAAAA,0xf2] +; CHECK: movk x0, #:abs_g0_nc:var // encoding: [0bAAA00000,A,0b100AAAAA,0xf2] } diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll index 3b027f2d4f10..0689fbdcc078 100644 --- a/test/CodeGen/AArch64/mul-lohi.ll +++ b/test/CodeGen/AArch64/mul-lohi.ll @@ -1,5 +1,3 @@ 
-; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
 ; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
 ; RUN: llc -mtriple=arm64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s

diff --git a/test/CodeGen/AArch64/named-reg-alloc.ll b/test/CodeGen/AArch64/named-reg-alloc.ll
deleted file mode 100644
index 31d72f6be0ef..000000000000
--- a/test/CodeGen/AArch64/named-reg-alloc.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: not llc < %s -mtriple=aarch64-linux-gnueabi 2>&1 | FileCheck %s
-; arm64 has separate copy of this test
-
-define i32 @get_stack() nounwind {
-entry:
-; FIXME: Include an allocatable-specific error message
-; CHECK: Invalid register name global variable
- %sp = call i32 @llvm.read_register.i32(metadata !0)
- ret i32 %sp
-}
-
-declare i32 @llvm.read_register.i32(metadata) nounwind
-
-!0 = metadata !{metadata !"x5\00"}
diff --git a/test/CodeGen/AArch64/named-reg-notareg.ll b/test/CodeGen/AArch64/named-reg-notareg.ll
deleted file mode 100644
index 66d013137cf4..000000000000
--- a/test/CodeGen/AArch64/named-reg-notareg.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: not llc < %s -mtriple=aarch64-linux-gnueabi 2>&1 | FileCheck %s
-; arm64 has separate copy of this test
-
-define i32 @get_stack() nounwind {
-entry:
-; CHECK: Invalid register name global variable
- %sp = call i32 @llvm.read_register.i32(metadata !0)
- ret i32 %sp
-}
-
-declare i32 @llvm.read_register.i32(metadata) nounwind
-
-!0 = metadata !{metadata !"notareg\00"}
diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/neon-2velem-high.ll
deleted file mode 100644
index ebdb5b7132d8..000000000000
--- a/test/CodeGen/AArch64/neon-2velem-high.ll
+++ /dev/null
@@ -1,331 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-; arm64 has copied test in its directory due to differing intrinsics.
-declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) - -declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) - -declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) - -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) - -declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) - -define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) { -; CHECK: test_vmull_high_n_s16: -; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 - %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - ret <4 x i32> %vmull15.i.i -} - -define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) { -; CHECK: test_vmull_high_n_s32: -; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 - %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - ret <2 x i64> %vmull9.i.i -} - -define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) { -; CHECK: test_vmull_high_n_u16: -; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 - %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - ret <4 x i32> %vmull15.i.i -} - -define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) { -; CHECK: test_vmull_high_n_u32: -; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 - %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - ret <2 x i64> %vmull9.i.i -} - -define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) { -; CHECK: test_vqdmull_high_n_s16: -; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 
0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 - %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - ret <4 x i32> %vqdmull15.i.i -} - -define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) { -; CHECK: test_vqdmull_high_n_s32: -; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 - %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - ret <2 x i64> %vqdmull9.i.i -} - -define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { -; CHECK: test_vmlal_high_n_s16: -; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %add.i.i = add <4 x i32> %vmull2.i.i.i, %a - ret <4 x i32> %add.i.i -} - -define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { -; CHECK: test_vmlal_high_n_s32: -; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %add.i.i = add <2 x i64> %vmull2.i.i.i, %a - ret <2 x i64> %add.i.i -} - -define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) { -; CHECK: test_vmlal_high_n_u16: -; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %add.i.i = add <4 x i32> %vmull2.i.i.i, %a - ret <4 x i32> %add.i.i -} - -define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) { -; CHECK: test_vmlal_high_n_u32: -; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %add.i.i = add <2 x i64> %vmull2.i.i.i, %a - ret <2 x i64> %add.i.i -} - -define <4 x 
i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { -; CHECK: test_vqdmlal_high_n_s16: -; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i) - ret <4 x i32> %vqdmlal17.i.i -} - -define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { -; CHECK: test_vqdmlal_high_n_s32: -; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i) - ret <2 x i64> %vqdmlal11.i.i -} - -define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { -; CHECK: test_vmlsl_high_n_s16: -; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i - ret <4 x i32> %sub.i.i -} - -define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { -; CHECK: test_vmlsl_high_n_s32: -; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i - ret <2 x i64> %sub.i.i -} - -define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) { -; CHECK: test_vmlsl_high_n_u16: -; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i - ret <4 x i32> %sub.i.i -} - -define <2 x i64> @test_vmlsl_high_n_u32(<2 x 
i64> %a, <4 x i32> %b, i32 %c) { -; CHECK: test_vmlsl_high_n_u32: -; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i - ret <2 x i64> %sub.i.i -} - -define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { -; CHECK: test_vqdmlsl_high_n_s16: -; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 - %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 - %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 - %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 - %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) - %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i) - ret <4 x i32> %vqdmlsl17.i.i -} - -define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { -; CHECK: test_vqdmlsl_high_n_s32: -; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 - %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 - %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) - %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i) - ret <2 x i64> %vqdmlsl11.i.i -} - -define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) { -; CHECK: test_vmul_n_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -entry: - %vecinit.i = insertelement <2 x float> undef, float %b, i32 0 - %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1 - %mul.i = fmul <2 x float> %vecinit1.i, %a - ret <2 x float> %mul.i -} - -define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) { -; CHECK: test_vmulq_n_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -entry: - %vecinit.i = insertelement <4 x float> undef, float %b, i32 0 - %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1 - %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2 - %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3 - %mul.i = fmul <4 x float> %vecinit3.i, %a - ret <4 x float> %mul.i -} - -define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) { -; CHECK: test_vmulq_n_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -entry: - %vecinit.i = insertelement <2 x double> undef, double %b, i32 0 - %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1 - %mul.i = fmul <2 x double> %vecinit1.i, %a - ret <2 x double> %mul.i -} - -define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) { -; CHECK: test_vfma_n_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 - %vecinit1.i = 
insertelement <2 x float> %vecinit.i, float %n, i32 1 - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) { -; CHECK: test_vfmaq_n_f32: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 - %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 - %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2 - %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3 - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) { -; CHECK: test_vfms_n_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 - %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 - %0 = fsub <2 x float> , %b - %1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a) - ret <2 x float> %1 -} - -define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) { -; CHECK: test_vfmsq_n_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -entry: - %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 - %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 - %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2 - %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3 - %0 = fsub <4 x float> , %b - %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a) - ret <4 x float> %1 -} diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/neon-2velem.ll deleted file mode 100644 index b9d0e84f16c2..000000000000 --- a/test/CodeGen/AArch64/neon-2velem.ll +++ /dev/null @@ -1,2854 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; arm64 has copied test in its directory due to differing intrinsics. 
- -declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) - -declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) - -declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) - -declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) - -declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) - -declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) - -declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) - -declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) - -declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) - -declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) - -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) - -declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmla_lane_s16: -; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %b - %add = add <4 x i16> %mul, %a - ret <4 x i16> %add -} - -define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlaq_lane_s16: -; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %b - %add = add <8 x i16> %mul, %a - ret <8 x i16> %add -} - -define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmla_lane_s32: -; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %b - %add = add <2 x i32> %mul, %a - ret <2 x i32> %add -} - -define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlaq_lane_s32: -; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %b - %add = add <4 x i32> %mul, %a - ret <4 x i32> %add -} - -define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmla_laneq_s16: -; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %b - %add = add <4 x i16> %mul, %a - ret <4 x i16> %add -} - -define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlaq_laneq_s16: -; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; 
CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %b - %add = add <8 x i16> %mul, %a - ret <8 x i16> %add -} - -define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmla_laneq_s32: -; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %b - %add = add <2 x i32> %mul, %a - ret <2 x i32> %add -} - -define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlaq_laneq_s32: -; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %b - %add = add <4 x i32> %mul, %a - ret <4 x i32> %add -} - -define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmls_lane_s16: -; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %b - %sub = sub <4 x i16> %a, %mul - ret <4 x i16> %sub -} - -define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsq_lane_s16: -; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %b - %sub = sub <8 x i16> %a, %mul - ret <8 x i16> %sub -} - -define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmls_lane_s32: -; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %b - %sub = sub <2 x i32> %a, %mul - ret <2 x i32> %sub -} - -define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsq_lane_s32: -; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %b - %sub = sub <4 x i32> %a, %mul - ret <4 x i32> %sub -} - -define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmls_laneq_s16: -; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %b - %sub = sub <4 x i16> %a, %mul - ret <4 x i16> %sub -} - -define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsq_laneq_s16: -; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %b - %sub = sub <8 x i16> %a, %mul - ret <8 x i16> %sub -} - -define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmls_laneq_s32: -; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %b - %sub = sub <2 x i32> %a, %mul - ret <2 x i32> %sub -} - -define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x 
i32> %v) { -; CHECK: test_vmlsq_laneq_s32: -; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %b - %sub = sub <4 x i32> %a, %mul - ret <4 x i32> %sub -} - -define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmul_lane_s16: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmulq_lane_s16: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmul_lane_s32: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmulq_lane_s32: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmul_lane_u16: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmulq_lane_u16: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmul_lane_u32: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmulq_lane_u32: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmul_laneq_s16: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmulq_laneq_s16: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> 
%v) { -; CHECK: test_vmul_laneq_s32: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmulq_laneq_s32: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmul_laneq_u16: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmulq_laneq_u16: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmul_laneq_u32: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmulq_laneq_u32: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { -; CHECK: test_vfma_lane_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) - -define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { -; CHECK: test_vfmaq_lane_f32: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) - -define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { -; CHECK: test_vfma_laneq_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { -; CHECK: test_vfmaq_laneq_f32: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> - %0 = tail call <4 x float> 
@llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { -; CHECK: test_vfms_lane_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %sub = fsub <2 x float> , %v - %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { -; CHECK: test_vfmsq_lane_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %sub = fsub <2 x float> , %v - %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { -; CHECK: test_vfms_laneq_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %sub = fsub <4 x float> , %v - %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { -; CHECK: test_vfmsq_laneq_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %sub = fsub <4 x float> , %v - %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { -; CHECK: test_vfmaq_lane_f64: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) - ret <2 x double> %0 -} - -declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) - -define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { -; CHECK: test_vfmaq_laneq_f64: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> - %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) - ret <2 x double> %0 -} - -define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { -; CHECK: test_vfmsq_lane_f64: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %sub = fsub <1 x double> , %v - %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) - ret <2 x double> %0 -} - -define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { -; CHECK: test_vfmsq_laneq_f64: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -entry: - %sub = fsub <2 x double> , %v - %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x 
i32> - %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) - ret <2 x double> %0 -} - -define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) { -; CHECK-LABEL: test_vfmas_laneq_f32 -; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %extract = extractelement <4 x float> %v, i32 3 - %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) - ret float %0 -} - -declare float @llvm.fma.f32(float, float, float) - -define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) { -; CHECK-LABEL: test_vfmsd_lane_f64 -; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %extract.rhs = extractelement <1 x double> %v, i32 0 - %extract = fsub double -0.000000e+00, %extract.rhs - %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a) - ret double %0 -} - -declare double @llvm.fma.f64(double, double, double) - -define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { -; CHECK: test_vfmss_laneq_f32 -; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %extract.rhs = extractelement <4 x float> %v, i32 3 - %extract = fsub float -0.000000e+00, %extract.rhs - %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) - ret float %0 -} - -define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) { -; CHECK-LABEL: test_vfmsd_laneq_f64 -; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -entry: - %extract.rhs = extractelement <2 x double> %v, i32 1 - %extract = fsub double -0.000000e+00, %extract.rhs - %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a) - ret double %0 -} - -define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_lane_s16: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_lane_s32: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_laneq_s16: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_laneq_s32: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> 
@test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_high_lane_s16: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_high_lane_s32: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_high_laneq_s16: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_high_laneq_s32: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_lane_s16: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_lane_s32: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_laneq_s16: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_laneq_s32: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; 
CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_high_lane_s16: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_high_lane_s32: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_high_laneq_s16: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_high_laneq_s32: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_lane_u16: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_lane_u32: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_laneq_u16: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, 
<4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_laneq_u32: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_high_lane_u16: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_high_lane_u32: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_high_laneq_u16: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_high_laneq_u32: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_lane_u16: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_lane_u32: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { 
-; CHECK: test_vmlsl_laneq_u16: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_laneq_u32: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_high_lane_u16: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_high_lane_u32: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_high_laneq_u16: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_high_laneq_u32: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_lane_s16: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_lane_s32: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> 
@llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_lane_u16: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_lane_u32: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_high_lane_s16: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_high_lane_s32: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_high_lane_u16: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_high_lane_u32: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_laneq_s16: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_laneq_s32: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x 
i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_laneq_u16: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_laneq_u32: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_high_laneq_s16: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_high_laneq_s32: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_high_laneq_u16: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_high_laneq_u32: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlal_lane_s16: -; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) - ret <4 x i32> %vqdmlal4.i -} - -define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlal_lane_s32: -; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> 
%shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) - ret <2 x i64> %vqdmlal4.i -} - -define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlal_high_lane_s16: -; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) - ret <4 x i32> %vqdmlal4.i -} - -define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlal_high_lane_s32: -; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) - ret <2 x i64> %vqdmlal4.i -} - -define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlsl_lane_s16: -; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) - ret <4 x i32> %vqdmlsl4.i -} - -define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlsl_lane_s32: -; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) - ret <2 x i64> %vqdmlsl4.i -} - -define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlsl_high_lane_s16: -; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) - ret <4 x i32> %vqdmlsl4.i -} - -define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlsl_high_lane_s32: -; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) - ret <2 x i64> 
%vqdmlsl4.i -} - -define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmull_lane_s16: -; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmull_lane_s32: -; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vqdmull_laneq_s16: -; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vqdmull_laneq_s32: -; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmull_high_lane_s16: -; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmull_high_lane_s32: -; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vqdmull_high_laneq_s16: -; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vqdmull_high_laneq_s32: -; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} 
- -define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmulh_lane_s16: -; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i16> %vqdmulh2.i -} - -define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmulhq_lane_s16: -; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) - ret <8 x i16> %vqdmulh2.i -} - -define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmulh_lane_s32: -; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i32> %vqdmulh2.i -} - -define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmulhq_lane_s32: -; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) - ret <4 x i32> %vqdmulh2.i -} - -define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqrdmulh_lane_s16: -; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> - %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i16> %vqrdmulh2.i -} - -define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqrdmulhq_lane_s16: -; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> - %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) - ret <8 x i16> %vqrdmulh2.i -} - -define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqrdmulh_lane_s32: -; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> - %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i32> %vqrdmulh2.i -} - -define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqrdmulhq_lane_s32: -; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> - %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) - ret <4 x i32> %vqrdmulh2.i -} - -define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { -; CHECK: test_vmul_lane_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> - %mul = fmul <2 x float> %shuffle, %a - ret <2 x float> %mul -} - -define <1 
x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) { -; CHECK: test_vmul_lane_f64: -; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %0 = bitcast <1 x double> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to double - %extract = extractelement <1 x double> %v, i32 0 - %2 = fmul double %1, %extract - %3 = insertelement <1 x double> undef, double %2, i32 0 - ret <1 x double> %3 -} - -define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { -; CHECK: test_vmulq_lane_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> - %mul = fmul <4 x float> %shuffle, %a - ret <4 x float> %mul -} - -define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { -; CHECK: test_vmulq_lane_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer - %mul = fmul <2 x double> %shuffle, %a - ret <2 x double> %mul -} - -define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { -; CHECK: test_vmul_laneq_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> - %mul = fmul <2 x float> %shuffle, %a - ret <2 x float> %mul -} - -define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) { -; CHECK: test_vmul_laneq_f64: -; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -entry: - %0 = bitcast <1 x double> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to double - %extract = extractelement <2 x double> %v, i32 1 - %2 = fmul double %1, %extract - %3 = insertelement <1 x double> undef, double %2, i32 0 - ret <1 x double> %3 -} - -define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { -; CHECK: test_vmulq_laneq_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> - %mul = fmul <4 x float> %shuffle, %a - ret <4 x float> %mul -} - -define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { -; CHECK: test_vmulq_laneq_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> - %mul = fmul <2 x double> %shuffle, %a - ret <2 x double> %mul -} - -define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { -; CHECK: test_vmulx_lane_f32: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> - %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) - ret <2 x float> %vmulx2.i -} - -define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { -; CHECK: test_vmulxq_lane_f32: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> - %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) - ret <4 x float> %vmulx2.i -} - -define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { -; CHECK: test_vmulxq_lane_f64: -; CHECK: mulx {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) - ret <2 x double> %vmulx2.i -} - -define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { -; CHECK: test_vmulx_laneq_f32: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> - %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) - ret <2 x float> %vmulx2.i -} - -define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { -; CHECK: test_vmulxq_laneq_f32: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> - %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) - ret <4 x float> %vmulx2.i -} - -define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { -; CHECK: test_vmulxq_laneq_f64: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> - %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) - ret <2 x double> %vmulx2.i -} - -define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmla_lane_s16_0: -; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %b - %add = add <4 x i16> %mul, %a - ret <4 x i16> %add -} - -define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlaq_lane_s16_0: -; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %b - %add = add <8 x i16> %mul, %a - ret <8 x i16> %add -} - -define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmla_lane_s32_0: -; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %b - %add = add <2 x i32> %mul, %a - ret <2 x i32> %add -} - -define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlaq_lane_s32_0: -; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %b - %add = add <4 x i32> %mul, %a - ret <4 x i32> %add -} - -define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmla_laneq_s16_0: -; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %b - %add = add <4 x i16> %mul, %a - ret <4 x i16> %add -} - -define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: 
test_vmlaq_laneq_s16_0: -; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %b - %add = add <8 x i16> %mul, %a - ret <8 x i16> %add -} - -define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmla_laneq_s32_0: -; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %b - %add = add <2 x i32> %mul, %a - ret <2 x i32> %add -} - -define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlaq_laneq_s32_0: -; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %b - %add = add <4 x i32> %mul, %a - ret <4 x i32> %add -} - -define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmls_lane_s16_0: -; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %b - %sub = sub <4 x i16> %a, %mul - ret <4 x i16> %sub -} - -define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsq_lane_s16_0: -; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %b - %sub = sub <8 x i16> %a, %mul - ret <8 x i16> %sub -} - -define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmls_lane_s32_0: -; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %b - %sub = sub <2 x i32> %a, %mul - ret <2 x i32> %sub -} - -define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsq_lane_s32_0: -; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %b - %sub = sub <4 x i32> %a, %mul - ret <4 x i32> %sub -} - -define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmls_laneq_s16_0: -; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %b - %sub = sub <4 x i16> %a, %mul - ret <4 x i16> %sub -} - -define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsq_laneq_s16_0: -; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %b - %sub = sub <8 x i16> %a, %mul - ret <8 x i16> %sub -} - -define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmls_laneq_s32_0: -; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, 
{{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %b - %sub = sub <2 x i32> %a, %mul - ret <2 x i32> %sub -} - -define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsq_laneq_s32_0: -; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %b - %sub = sub <4 x i32> %a, %mul - ret <4 x i32> %sub -} - -define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmul_lane_s16_0: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmulq_lane_s16_0: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmul_lane_s32_0: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmulq_lane_s32_0: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmul_lane_u16_0: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmulq_lane_u16_0: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmul_lane_u32_0: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmulq_lane_u32_0: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmul_laneq_s16_0: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> 
%v, <8 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmulq_laneq_s16_0: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmul_laneq_s32_0: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmulq_laneq_s32_0: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmul_laneq_u16_0: -; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i16> %shuffle, %a - ret <4 x i16> %mul -} - -define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmulq_laneq_u16_0: -; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer - %mul = mul <8 x i16> %shuffle, %a - ret <8 x i16> %mul -} - -define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmul_laneq_u32_0: -; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %mul = mul <2 x i32> %shuffle, %a - ret <2 x i32> %mul -} - -define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmulq_laneq_u32_0: -; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer - %mul = mul <4 x i32> %shuffle, %a - ret <4 x i32> %mul -} - -define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { -; CHECK: test_vfma_lane_f32_0: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { -; CHECK: test_vfmaq_lane_f32_0: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { -; CHECK: test_vfma_laneq_f32_0: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret 
-entry: - %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { -; CHECK: test_vfmaq_laneq_f32_0: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { -; CHECK: test_vfms_lane_f32_0: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %sub = fsub <2 x float> , %v - %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { -; CHECK: test_vfmsq_lane_f32_0: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %sub = fsub <2 x float> , %v - %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { -; CHECK: test_vfms_laneq_f32_0: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %sub = fsub <4 x float> , %v - %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) - ret <2 x float> %0 -} - -define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { -; CHECK: test_vfmsq_laneq_f32_0: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %sub = fsub <4 x float> , %v - %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer - %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) - ret <4 x float> %0 -} - -define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { -; CHECK: test_vfmaq_laneq_f64_0: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) - ret <2 x double> %0 -} - -define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { -; CHECK: test_vfmsq_laneq_f64_0: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %sub = fsub <2 x double> , %v - %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer - %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) - ret <2 x double> %0 -} - -define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_lane_s16_0: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_lane_s32_0: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_laneq_s16_0: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_laneq_s32_0: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_high_lane_s16_0: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_high_lane_s32_0: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_high_laneq_s16_0: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_high_laneq_s32_0: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, 
<2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_lane_s16_0: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_lane_s32_0: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_laneq_s16_0: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_laneq_s32_0: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_high_lane_s16_0: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_high_lane_s32_0: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_high_laneq_s16_0: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call 
<4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_high_laneq_s32_0: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_lane_u16_0: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_lane_u32_0: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_laneq_u16_0: -; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_laneq_u32_0: -; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlal_high_lane_u16_0: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlal_high_lane_u32_0: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a 
- ret <2 x i64> %add -} - -define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlal_high_laneq_u16_0: -; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %add = add <4 x i32> %vmull2.i, %a - ret <4 x i32> %add -} - -define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlal_high_laneq_u32_0: -; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %add = add <2 x i64> %vmull2.i, %a - ret <2 x i64> %add -} - -define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_lane_u16_0: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vmlsl_lane_u32_0: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_laneq_u16_0: -; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_laneq_u32_0: -; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vmlsl_high_lane_u16_0: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { 
-; CHECK: test_vmlsl_high_lane_u32_0: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { -; CHECK: test_vmlsl_high_laneq_u16_0: -; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %sub = sub <4 x i32> %a, %vmull2.i - ret <4 x i32> %sub -} - -define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { -; CHECK: test_vmlsl_high_laneq_u32_0: -; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %sub = sub <2 x i64> %a, %vmull2.i - ret <2 x i64> %sub -} - -define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_lane_s16_0: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_lane_s32_0: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_lane_u16_0: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_lane_u32_0: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_high_lane_s16_0: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> 
@llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_high_lane_s32_0: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vmull_high_lane_u16_0: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vmull_high_lane_u32_0: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_laneq_s16_0: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_laneq_s32_0: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_laneq_u16_0: -; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_laneq_u32_0: -; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_high_laneq_s16_0: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, 
<4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_high_laneq_s32_0: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vmull_high_laneq_u16_0: -; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vmull_high_laneq_u32_0: -; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vmull2.i -} - -define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlal_lane_s16_0: -; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) - ret <4 x i32> %vqdmlal4.i -} - -define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlal_lane_s32_0: -; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) - ret <2 x i64> %vqdmlal4.i -} - -define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlal_high_lane_s16_0: -; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) - ret <4 x i32> %vqdmlal4.i -} - -define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlal_high_lane_s32_0: -; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret 
-entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) - ret <2 x i64> %vqdmlal4.i -} - -define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlsl_lane_s16_0: -; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) - ret <4 x i32> %vqdmlsl4.i -} - -define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlsl_lane_s32_0: -; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) - ret <2 x i64> %vqdmlsl4.i -} - -define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { -; CHECK: test_vqdmlsl_high_lane_s16_0: -; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) - ret <4 x i32> %vqdmlsl4.i -} - -define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { -; CHECK: test_vqdmlsl_high_lane_s32_0: -; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) - ret <2 x i64> %vqdmlsl4.i -} - -define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmull_lane_s16_0: -; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmull_lane_s32_0: -; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 
x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK: test_vqdmull_laneq_s16_0: -; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK: test_vqdmull_laneq_s32_0: -; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmull_high_lane_s16_0: -; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmull_high_lane_s32_0: -; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { -; CHECK: test_vqdmull_high_laneq_s16_0: -; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { -; CHECK: test_vqdmull_high_laneq_s32_0: -; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmulh_lane_s16_0: -; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i16> %vqdmulh2.i -} - -define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqdmulhq_lane_s16_0: -; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> 
zeroinitializer - %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) - ret <8 x i16> %vqdmulh2.i -} - -define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmulh_lane_s32_0: -; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i32> %vqdmulh2.i -} - -define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqdmulhq_lane_s32_0: -; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) - ret <4 x i32> %vqdmulh2.i -} - -define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqrdmulh_lane_s16_0: -; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer - %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) - ret <4 x i16> %vqrdmulh2.i -} - -define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { -; CHECK: test_vqrdmulhq_lane_s16_0: -; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer - %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) - ret <8 x i16> %vqrdmulh2.i -} - -define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqrdmulh_lane_s32_0: -; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer - %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) - ret <2 x i32> %vqrdmulh2.i -} - -define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { -; CHECK: test_vqrdmulhq_lane_s32_0: -; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer - %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) - ret <4 x i32> %vqrdmulh2.i -} - -define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) { -; CHECK: test_vmul_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer - %mul = fmul <2 x float> %shuffle, %a - ret <2 x float> %mul -} - -define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) { -; CHECK: test_vmulq_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer - %mul = fmul <4 x float> %shuffle, %a - ret <4 x float> %mul -} - -define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { -; CHECK: test_vmul_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, 
{{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer - %mul = fmul <2 x float> %shuffle, %a - ret <2 x float> %mul -} - -define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) { -; CHECK: test_vmul_laneq_f64_0: -; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %0 = bitcast <1 x double> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to double - %extract = extractelement <2 x double> %v, i32 0 - %2 = fmul double %1, %extract - %3 = insertelement <1 x double> undef, double %2, i32 0 - ret <1 x double> %3 -} - -define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { -; CHECK: test_vmulq_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer - %mul = fmul <4 x float> %shuffle, %a - ret <4 x float> %mul -} - -define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { -; CHECK: test_vmulq_laneq_f64_0: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer - %mul = fmul <2 x double> %shuffle, %a - ret <2 x double> %mul -} - -define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { -; CHECK: test_vmulx_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) - ret <2 x float> %vmulx2.i -} - -define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { -; CHECK: test_vmulxq_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer - %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) - ret <4 x float> %vmulx2.i -} - -define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { -; CHECK: test_vmulxq_lane_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) - ret <2 x double> %vmulx2.i -} - -define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { -; CHECK: test_vmulx_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) - ret <2 x float> %vmulx2.i -} - -define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { -; CHECK: test_vmulxq_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer - %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) - ret <4 x float> %vmulx2.i -} - -define <2 x double> 
@test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { -; CHECK: test_vmulxq_laneq_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -entry: - %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer - %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) - ret <2 x double> %vmulx2.i -} - diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/neon-3vdiff.ll deleted file mode 100644 index dbe2a726b902..000000000000 --- a/test/CodeGen/AArch64/neon-3vdiff.ll +++ /dev/null @@ -1,1834 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has its own copy of this test in its directory. - -declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) - -declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) - -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) - -declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) - -declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) - -declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) - -declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) - -declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) - -declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) - -declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) - -declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) - -declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) - -declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) - -declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) - -declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) - -declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) - -declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) - -declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) - -declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vaddl_s8: -; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = sext <8 x i8> %a to <8 x i16> - %vmovl.i2.i = sext <8 x i8> %b to <8 x i16> - %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vaddl_s16: -; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = sext <4 x i16> %a to <4 x i32> - %vmovl.i2.i = sext <4 x i16> %b to <4 x i32> - %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vaddl_s32: -; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = sext <2 x i32> %a to <2 x i64> - %vmovl.i2.i = sext <2 x i32> %b to <2 x i64> - %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i - ret <2 x i64> %add.i -} - -define <8 
x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vaddl_u8: -; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = zext <8 x i8> %a to <8 x i16> - %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> - %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vaddl_u16: -; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = zext <4 x i16> %a to <4 x i32> - %vmovl.i2.i = zext <4 x i16> %b to <4 x i32> - %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vaddl_u32: -; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = zext <2 x i32> %a to <2 x i64> - %vmovl.i2.i = zext <2 x i32> %b to <2 x i64> - %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vaddl_high_s8: -; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> - %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16> - %add.i = add <8 x i16> %0, %1 - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vaddl_high_s16: -; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> - %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32> - %add.i = add <4 x i32> %0, %1 - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vaddl_high_s32: -; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> - %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64> - %add.i = add <2 x i64> %0, %1 - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vaddl_high_u8: -; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> - %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16> - %add.i = add <8 x i16> %0, %1 - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vaddl_high_u16: -; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> - %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32> - %add.i = add <4 x i32> %0, %1 - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vaddl_high_u32: -; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: 
- %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> - %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64> - %add.i = add <2 x i64> %0, %1 - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) { -; CHECK: test_vaddw_s8: -; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = sext <8 x i8> %b to <8 x i16> - %add.i = add <8 x i16> %vmovl.i.i, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) { -; CHECK: test_vaddw_s16: -; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = sext <4 x i16> %b to <4 x i32> - %add.i = add <4 x i32> %vmovl.i.i, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) { -; CHECK: test_vaddw_s32: -; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = sext <2 x i32> %b to <2 x i64> - %add.i = add <2 x i64> %vmovl.i.i, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) { -; CHECK: test_vaddw_u8: -; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = zext <8 x i8> %b to <8 x i16> - %add.i = add <8 x i16> %vmovl.i.i, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) { -; CHECK: test_vaddw_u16: -; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = zext <4 x i16> %b to <4 x i32> - %add.i = add <4 x i32> %vmovl.i.i, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) { -; CHECK: test_vaddw_u32: -; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = zext <2 x i32> %b to <2 x i64> - %add.i = add <2 x i64> %vmovl.i.i, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) { -; CHECK: test_vaddw_high_s8: -; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> - %add.i = add <8 x i16> %0, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) { -; CHECK: test_vaddw_high_s16: -; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> - %add.i = add <4 x i32> %0, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) { -; CHECK: test_vaddw_high_s32: -; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> - %add.i = add <2 x i64> %0, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) { -; CHECK: test_vaddw_high_u8: -; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> - %add.i = add <8 x i16> %0, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) { -; CHECK: test_vaddw_high_u16: -; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h -entry: - 
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> - %add.i = add <4 x i32> %0, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) { -; CHECK: test_vaddw_high_u32: -; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> - %add.i = add <2 x i64> %0, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsubl_s8: -; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = sext <8 x i8> %a to <8 x i16> - %vmovl.i2.i = sext <8 x i8> %b to <8 x i16> - %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsubl_s16: -; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = sext <4 x i16> %a to <4 x i32> - %vmovl.i2.i = sext <4 x i16> %b to <4 x i32> - %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vsubl_s32: -; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = sext <2 x i32> %a to <2 x i64> - %vmovl.i2.i = sext <2 x i32> %b to <2 x i64> - %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsubl_u8: -; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = zext <8 x i8> %a to <8 x i16> - %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> - %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsubl_u16: -; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = zext <4 x i16> %a to <4 x i32> - %vmovl.i2.i = zext <4 x i16> %b to <4 x i32> - %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vsubl_u32: -; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = zext <2 x i32> %a to <2 x i64> - %vmovl.i2.i = zext <2 x i32> %b to <2 x i64> - %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsubl_high_s8: -; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> - %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16> - %sub.i = sub <8 x i16> %0, %1 - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsubl_high_s16: -; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> - %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32> - %sub.i = sub <4 x i32> %0, %1 - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: 
test_vsubl_high_s32: -; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> - %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64> - %sub.i = sub <2 x i64> %0, %1 - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsubl_high_u8: -; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> - %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16> - %sub.i = sub <8 x i16> %0, %1 - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsubl_high_u16: -; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> - %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32> - %sub.i = sub <4 x i32> %0, %1 - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsubl_high_u32: -; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> - %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64> - %sub.i = sub <2 x i64> %0, %1 - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) { -; CHECK: test_vsubw_s8: -; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = sext <8 x i8> %b to <8 x i16> - %sub.i = sub <8 x i16> %a, %vmovl.i.i - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) { -; CHECK: test_vsubw_s16: -; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = sext <4 x i16> %b to <4 x i32> - %sub.i = sub <4 x i32> %a, %vmovl.i.i - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) { -; CHECK: test_vsubw_s32: -; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = sext <2 x i32> %b to <2 x i64> - %sub.i = sub <2 x i64> %a, %vmovl.i.i - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) { -; CHECK: test_vsubw_u8: -; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b -entry: - %vmovl.i.i = zext <8 x i8> %b to <8 x i16> - %sub.i = sub <8 x i16> %a, %vmovl.i.i - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) { -; CHECK: test_vsubw_u16: -; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h -entry: - %vmovl.i.i = zext <4 x i16> %b to <4 x i32> - %sub.i = sub <4 x i32> %a, %vmovl.i.i - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) { -; CHECK: test_vsubw_u32: -; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s -entry: - %vmovl.i.i = zext <2 x i32> %b to <2 x i64> - %sub.i = sub <2 x i64> %a, %vmovl.i.i - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) 
{ -; CHECK: test_vsubw_high_s8: -; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> - %sub.i = sub <8 x i16> %a, %0 - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) { -; CHECK: test_vsubw_high_s16: -; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> - %sub.i = sub <4 x i32> %a, %0 - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) { -; CHECK: test_vsubw_high_s32: -; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> - %sub.i = sub <2 x i64> %a, %0 - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) { -; CHECK: test_vsubw_high_u8: -; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b -entry: - %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> - %sub.i = sub <8 x i16> %a, %0 - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) { -; CHECK: test_vsubw_high_u16: -; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h -entry: - %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> - %sub.i = sub <4 x i32> %a, %0 - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) { -; CHECK: test_vsubw_high_u32: -; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s -entry: - %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> - %sub.i = sub <2 x i64> %a, %0 - ret <2 x i64> %sub.i -} - -define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vaddhn_s16: -; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vaddhn.i = add <8 x i16> %a, %b - %vaddhn1.i = lshr <8 x i16> %vaddhn.i, - %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8> - ret <8 x i8> %vaddhn2.i -} - -define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vaddhn_s32: -; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vaddhn.i = add <4 x i32> %a, %b - %vaddhn1.i = lshr <4 x i32> %vaddhn.i, - %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16> - ret <4 x i16> %vaddhn2.i -} - -define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vaddhn_s64: -; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vaddhn.i = add <2 x i64> %a, %b - %vaddhn1.i = lshr <2 x i64> %vaddhn.i, - %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32> - ret <2 x i32> %vaddhn2.i -} - -define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vaddhn_u16: -; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vaddhn.i = add <8 x i16> %a, %b - %vaddhn1.i = lshr <8 x i16> %vaddhn.i, - %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8> - ret <8 x i8> %vaddhn2.i -} - -define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vaddhn_u32: -; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vaddhn.i = add <4 x i32> 
%a, %b - %vaddhn1.i = lshr <4 x i32> %vaddhn.i, - %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16> - ret <4 x i16> %vaddhn2.i -} - -define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vaddhn_u64: -; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vaddhn.i = add <2 x i64> %a, %b - %vaddhn1.i = lshr <2 x i64> %vaddhn.i, - %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32> - ret <2 x i32> %vaddhn2.i -} - -define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vaddhn_high_s16: -; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vaddhn.i.i = add <8 x i16> %a, %b - %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, - %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8> - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vaddhn_high_s32: -; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vaddhn.i.i = add <4 x i32> %a, %b - %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, - %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16> - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vaddhn_high_s64: -; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vaddhn.i.i = add <2 x i64> %a, %b - %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, - %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32> - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vaddhn_high_u16: -; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vaddhn.i.i = add <8 x i16> %a, %b - %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, - %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8> - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vaddhn_high_u32: -; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vaddhn.i.i = add <4 x i32> %a, %b - %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, - %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16> - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vaddhn_high_u64: -; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vaddhn.i.i = add <2 x i64> %a, %b - %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, - 
%vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32> - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vraddhn_s16: -; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) - ret <8 x i8> %vraddhn2.i -} - -define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vraddhn_s32: -; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) - ret <4 x i16> %vraddhn2.i -} - -define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vraddhn_s64: -; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) - ret <2 x i32> %vraddhn2.i -} - -define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vraddhn_u16: -; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) - ret <8 x i8> %vraddhn2.i -} - -define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vraddhn_u32: -; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) - ret <4 x i16> %vraddhn2.i -} - -define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vraddhn_u64: -; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) - ret <2 x i32> %vraddhn2.i -} - -define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vraddhn_high_s16: -; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vraddhn_high_s32: -; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vraddhn_high_s64: -; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> 
%shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vraddhn_high_u16: -; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vraddhn_high_u32: -; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vraddhn_high_u64: -; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsubhn_s16: -; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vsubhn.i = sub <8 x i16> %a, %b - %vsubhn1.i = lshr <8 x i16> %vsubhn.i, - %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8> - ret <8 x i8> %vsubhn2.i -} - -define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsubhn_s32: -; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vsubhn.i = sub <4 x i32> %a, %b - %vsubhn1.i = lshr <4 x i32> %vsubhn.i, - %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16> - ret <4 x i16> %vsubhn2.i -} - -define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsubhn_s64: -; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vsubhn.i = sub <2 x i64> %a, %b - %vsubhn1.i = lshr <2 x i64> %vsubhn.i, - %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32> - ret <2 x i32> %vsubhn2.i -} - -define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsubhn_u16: -; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vsubhn.i = sub <8 x i16> %a, %b - %vsubhn1.i = lshr <8 x i16> %vsubhn.i, - %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8> - ret <8 x i8> %vsubhn2.i -} - -define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsubhn_u32: -; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vsubhn.i = sub <4 x i32> %a, %b - %vsubhn1.i = lshr <4 x i32> %vsubhn.i, - %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16> - ret <4 x i16> %vsubhn2.i -} - -define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsubhn_u64: -; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vsubhn.i = sub <2 x i64> %a, %b - %vsubhn1.i = lshr <2 x i64> %vsubhn.i, - %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32> - ret <2 x i32> 
%vsubhn2.i -} - -define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsubhn_high_s16: -; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vsubhn.i.i = sub <8 x i16> %a, %b - %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, - %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8> - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsubhn_high_s32: -; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vsubhn.i.i = sub <4 x i32> %a, %b - %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, - %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16> - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsubhn_high_s64: -; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vsubhn.i.i = sub <2 x i64> %a, %b - %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, - %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32> - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsubhn_high_u16: -; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vsubhn.i.i = sub <8 x i16> %a, %b - %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, - %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8> - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsubhn_high_u32: -; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vsubhn.i.i = sub <4 x i32> %a, %b - %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, - %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16> - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsubhn_high_u64: -; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vsubhn.i.i = sub <2 x i64> %a, %b - %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, - %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32> - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vrsubhn_s16: -; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8h -entry: - %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) - ret <8 x i8> %vrsubhn2.i -} - -define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vrsubhn_s32: -; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) - ret <4 x i16> %vrsubhn2.i -} - -define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vrsubhn_s64: -; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) - ret <2 x i32> %vrsubhn2.i -} - -define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vrsubhn_u16: -; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) - ret <8 x i8> %vrsubhn2.i -} - -define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vrsubhn_u32: -; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) - ret <4 x i16> %vrsubhn2.i -} - -define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vrsubhn_u64: -; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) - ret <2 x i32> %vrsubhn2.i -} - -define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vrsubhn_high_s16: -; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vrsubhn_high_s32: -; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vrsubhn_high_s64: -; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vrsubhn_high_u16: -; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) - %0 = bitcast <8 x i8> %r to <1 x i64> - %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64> - 
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vrsubhn_high_u32: -; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) - %0 = bitcast <4 x i16> %r to <1 x i64> - %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> - ret <8 x i16> %2 -} - -define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vrsubhn_high_u64: -; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) - %0 = bitcast <2 x i32> %r to <1 x i64> - %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64> - %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> - %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vabdl_s8: -; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) - %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16> - ret <8 x i16> %vmovl.i.i -} - -define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vabdl_s16: -; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) - %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32> - ret <4 x i32> %vmovl.i.i -} - -define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vabdl_s32: -; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) - %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64> - ret <2 x i64> %vmovl.i.i -} - -define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vabdl_u8: -; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) - %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16> - ret <8 x i16> %vmovl.i.i -} - -define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vabdl_u16: -; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) - %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32> - ret <4 x i32> %vmovl.i.i -} - -define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vabdl_u32: -; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) - %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64> - ret <2 x i64> %vmovl.i.i -} - -define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vabal_s8: -; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) - %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> - %add.i = add <8 x i16> %vmovl.i.i.i, %a - ret <8 x 
i16> %add.i -} - -define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vabal_s16: -; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) - %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> - %add.i = add <4 x i32> %vmovl.i.i.i, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vabal_s32: -; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) - %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> - %add.i = add <2 x i64> %vmovl.i.i.i, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vabal_u8: -; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) - %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> - %add.i = add <8 x i16> %vmovl.i.i.i, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vabal_u16: -; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) - %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> - %add.i = add <4 x i32> %vmovl.i.i.i, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vabal_u32: -; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) - %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> - %add.i = add <2 x i64> %vmovl.i.i.i, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vabdl_high_s8: -; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> - ret <8 x i16> %vmovl.i.i.i -} - -define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vabdl_high_s16: -; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> - ret <4 x i32> %vmovl.i.i.i -} - -define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vabdl_high_s32: -; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> - ret <2 x i64> %vmovl.i.i.i -} - -define <8 x 
i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vabdl_high_u8: -; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> - ret <8 x i16> %vmovl.i.i.i -} - -define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vabdl_high_u16: -; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> - ret <4 x i32> %vmovl.i.i.i -} - -define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vabdl_high_u32: -; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> - ret <2 x i64> %vmovl.i.i.i -} - -define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vabal_high_s8: -; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16> - %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a - ret <8 x i16> %add.i.i -} - -define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vabal_high_s16: -; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32> - %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a - ret <4 x i32> %add.i.i -} - -define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vabal_high_s32: -; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64> - %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a - ret <2 x i64> %add.i.i -} - -define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vabal_high_u8: -; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - 
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16> - %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a - ret <8 x i16> %add.i.i -} - -define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vabal_high_u16: -; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32> - %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a - ret <4 x i32> %add.i.i -} - -define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vabal_high_u32: -; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64> - %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a - ret <2 x i64> %add.i.i -} - -define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vmull_s8: -; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) - ret <8 x i16> %vmull.i -} - -define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vmull_s16: -; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vmull_s32: -; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b) - ret <2 x i64> %vmull2.i -} - -define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vmull_u8: -; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) - ret <8 x i16> %vmull.i -} - -define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vmull_u16: -; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b) - ret <4 x i32> %vmull2.i -} - -define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vmull_u32: -; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b) - ret <2 x i64> %vmull2.i -} - -define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vmull_high_s8: -; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> 
%shuffle.i.i, <8 x i8> %shuffle.i3.i) - ret <8 x i16> %vmull.i.i -} - -define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vmull_high_s16: -; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - ret <4 x i32> %vmull2.i.i -} - -define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vmull_high_s32: -; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - ret <2 x i64> %vmull2.i.i -} - -define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vmull_high_u8: -; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - ret <8 x i16> %vmull.i.i -} - -define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vmull_high_u16: -; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - ret <4 x i32> %vmull2.i.i -} - -define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vmull_high_u32: -; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - ret <2 x i64> %vmull2.i.i -} - -define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vmlal_s8: -; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) - %add.i = add <8 x i16> %vmull.i.i, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vmlal_s16: -; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) - %add.i = add <4 x i32> %vmull2.i.i, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vmlal_s32: -; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) - %add.i = add <2 x i64> %vmull2.i.i, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vmlal_u8: -; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - 
%vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) - %add.i = add <8 x i16> %vmull.i.i, %a - ret <8 x i16> %add.i -} - -define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vmlal_u16: -; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) - %add.i = add <4 x i32> %vmull2.i.i, %a - ret <4 x i32> %add.i -} - -define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vmlal_u32: -; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) - %add.i = add <2 x i64> %vmull2.i.i, %a - ret <2 x i64> %add.i -} - -define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vmlal_high_s8: -; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %add.i.i = add <8 x i16> %vmull.i.i.i, %a - ret <8 x i16> %add.i.i -} - -define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vmlal_high_s16: -; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %add.i.i = add <4 x i32> %vmull2.i.i.i, %a - ret <4 x i32> %add.i.i -} - -define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vmlal_high_s32: -; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %add.i.i = add <2 x i64> %vmull2.i.i.i, %a - ret <2 x i64> %add.i.i -} - -define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vmlal_high_u8: -; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %add.i.i = add <8 x i16> %vmull.i.i.i, %a - ret <8 x i16> %add.i.i -} - -define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vmlal_high_u16: -; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %add.i.i = add <4 x i32> %vmull2.i.i.i, %a - ret <4 x i32> %add.i.i -} - -define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vmlal_high_u32: -; CHECK: umlal2 
{{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %add.i.i = add <2 x i64> %vmull2.i.i.i, %a - ret <2 x i64> %add.i.i -} - -define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vmlsl_s8: -; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) - %sub.i = sub <8 x i16> %a, %vmull.i.i - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vmlsl_s16: -; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) - %sub.i = sub <4 x i32> %a, %vmull2.i.i - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vmlsl_s32: -; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) - %sub.i = sub <2 x i64> %a, %vmull2.i.i - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vmlsl_u8: -; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) - %sub.i = sub <8 x i16> %a, %vmull.i.i - ret <8 x i16> %sub.i -} - -define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vmlsl_u16: -; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) - %sub.i = sub <4 x i32> %a, %vmull2.i.i - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vmlsl_u32: -; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) - %sub.i = sub <2 x i64> %a, %vmull2.i.i - ret <2 x i64> %sub.i -} - -define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vmlsl_high_s8: -; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i - ret <8 x i16> %sub.i.i -} - -define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vmlsl_high_s16: -; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i - ret <4 x i32> %sub.i.i -} - -define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vmlsl_high_s32: -; CHECK: 
smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i - ret <2 x i64> %sub.i.i -} - -define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vmlsl_high_u8: -; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> - %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i - ret <8 x i16> %sub.i.i -} - -define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vmlsl_high_u16: -; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i - ret <4 x i32> %sub.i.i -} - -define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vmlsl_high_u32: -; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i - ret <2 x i64> %sub.i.i -} - -define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vqdmull_s16: -; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) - ret <4 x i32> %vqdmull2.i -} - -define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vqdmull_s32: -; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) - ret <2 x i64> %vqdmull2.i -} - -define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vqdmlal_s16: -; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) - %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) - ret <4 x i32> %vqdmlal4.i -} - -define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vqdmlal_s32: -; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) - %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) - ret <2 x i64> %vqdmlal4.i -} - -define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { -; CHECK: test_vqdmlsl_s16: -; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -entry: - %vqdmlsl2.i = 
tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) - %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) - ret <4 x i32> %vqdmlsl4.i -} - -define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { -; CHECK: test_vqdmlsl_s32: -; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) - %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) - ret <2 x i64> %vqdmlsl4.i -} - -define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vqdmull_high_s16: -; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - ret <4 x i32> %vqdmull2.i.i -} - -define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vqdmull_high_s32: -; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - ret <2 x i64> %vqdmull2.i.i -} - -define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vqdmlal_high_s16: -; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i) - ret <4 x i32> %vqdmlal4.i.i -} - -define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vqdmlal_high_s32: -; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i) - ret <2 x i64> %vqdmlal4.i.i -} - -define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { -; CHECK: test_vqdmlsl_high_s16: -; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h -entry: - %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> - %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> - %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) - %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i) - ret <4 x i32> %vqdmlsl4.i.i -} - -define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { -; CHECK: test_vqdmlsl_high_s32: -; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %shuffle.i.i = 
shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> - %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> - %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) - %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i) - ret <2 x i64> %vqdmlsl4.i.i -} - -define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vmull_p8: -; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) - ret <8 x i16> %vmull.i -} - -define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vmull_high_p8: -; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> - %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) - ret <8 x i16> %vmull.i.i -} - -define i128 @test_vmull_p64(i64 %a, i64 %b) #4 { -; CHECK: test_vmull_p64 -; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d -entry: - %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1 - %vmull3.i = bitcast <16 x i8> %vmull2.i to i128 - ret i128 %vmull3.i -} - -define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 { -; CHECK: test_vmull_high_p64 -; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -entry: - %0 = extractelement <2 x i64> %a, i32 1 - %1 = extractelement <2 x i64> %b, i32 1 - %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0 - %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0 - %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1 - %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128 - ret i128 %vmull3.i.i -} - -declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5 - - diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll deleted file mode 100644 index 1fe52565afe1..000000000000 --- a/test/CodeGen/AArch64/neon-aba-abd.ll +++ /dev/null @@ -1,237 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 has copied test in its own directory (different intrinsic names). 
- -declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uabd_v8i8: - %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uabd v0.8b, v0.8b, v1.8b - ret <8 x i8> %abd -} - -define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uaba_v8i8: - %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) - %aba = add <8 x i8> %lhs, %abd -; CHECK: uaba v0.8b, v0.8b, v1.8b - ret <8 x i8> %aba -} - -define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_sabd_v8i8: - %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sabd v0.8b, v0.8b, v1.8b - ret <8 x i8> %abd -} - -define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_saba_v8i8: - %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) - %aba = add <8 x i8> %lhs, %abd -; CHECK: saba v0.8b, v0.8b, v1.8b - ret <8 x i8> %aba -} - -declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uabd_v16i8: - %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uabd v0.16b, v0.16b, v1.16b - ret <16 x i8> %abd -} - -define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uaba_v16i8: - %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) - %aba = add <16 x i8> %lhs, %abd -; CHECK: uaba v0.16b, v0.16b, v1.16b - ret <16 x i8> %aba -} - -define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sabd_v16i8: - %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sabd v0.16b, v0.16b, v1.16b - ret <16 x i8> %abd -} - -define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_saba_v16i8: - %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) - %aba = add <16 x i8> %lhs, %abd -; CHECK: saba v0.16b, v0.16b, v1.16b - ret <16 x i8> %aba -} - -declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uabd_v4i16: - %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uabd v0.4h, v0.4h, v1.4h - ret <4 x i16> %abd -} - -define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uaba_v4i16: - %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) - %aba = add <4 x i16> %lhs, %abd -; CHECK: uaba v0.4h, v0.4h, v1.4h - ret <4 x i16> %aba -} - -define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sabd_v4i16: - %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sabd v0.4h, v0.4h, v1.4h - ret <4 x i16> %abd -} - -define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_saba_v4i16: - %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) - %aba = add <4 x i16> %lhs, %abd -; CHECK: saba v0.4h, v0.4h, v1.4h - ret <4 x i16> %aba -} - -declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> 
@llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uabd_v8i16: - %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uabd v0.8h, v0.8h, v1.8h - ret <8 x i16> %abd -} - -define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uaba_v8i16: - %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) - %aba = add <8 x i16> %lhs, %abd -; CHECK: uaba v0.8h, v0.8h, v1.8h - ret <8 x i16> %aba -} - -define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sabd_v8i16: - %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sabd v0.8h, v0.8h, v1.8h - ret <8 x i16> %abd -} - -define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_saba_v8i16: - %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) - %aba = add <8 x i16> %lhs, %abd -; CHECK: saba v0.8h, v0.8h, v1.8h - ret <8 x i16> %aba -} - -declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uabd_v2i32: - %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uabd v0.2s, v0.2s, v1.2s - ret <2 x i32> %abd -} - -define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uaba_v2i32: - %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) - %aba = add <2 x i32> %lhs, %abd -; CHECK: uaba v0.2s, v0.2s, v1.2s - ret <2 x i32> %aba -} - -define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sabd_v2i32: - %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sabd v0.2s, v0.2s, v1.2s - ret <2 x i32> %abd -} - -define <2 x i32> @test_sabd_v2i32_const() { -; CHECK: test_sabd_v2i32_const: -; CHECK: movi d1, #0xffffffff0000 -; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s - %1 = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32( - <2 x i32> , - <2 x i32> ) - ret <2 x i32> %1 -} - -define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_saba_v2i32: - %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) - %aba = add <2 x i32> %lhs, %abd -; CHECK: saba v0.2s, v0.2s, v1.2s - ret <2 x i32> %aba -} - -declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uabd_v4i32: - %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uabd v0.4s, v0.4s, v1.4s - ret <4 x i32> %abd -} - -define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uaba_v4i32: - %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) - %aba = add <4 x i32> %lhs, %abd -; CHECK: uaba v0.4s, v0.4s, v1.4s - ret <4 x i32> %aba -} - -define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sabd_v4i32: - %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sabd v0.4s, v0.4s, v1.4s - ret <4 x i32> %abd -} - -define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_saba_v4i32: - %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> 
%lhs, <4 x i32> %rhs) - %aba = add <4 x i32> %lhs, %abd -; CHECK: saba v0.4s, v0.4s, v1.4s - ret <4 x i32> %aba -} - -declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>) - -define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fabd_v2f32: - %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fabd v0.2s, v0.2s, v1.2s - ret <2 x float> %abd -} - -declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) - -define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fabd_v4f32: - %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fabd v0.4s, v0.4s, v1.4s - ret <4 x float> %abd -} - -declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>) - -define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fabd_v2f64: - %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fabd v0.2d, v0.2d, v1.2d - ret <2 x double> %abd -} diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll deleted file mode 100644 index 98444d29a01a..000000000000 --- a/test/CodeGen/AArch64/neon-across.ll +++ /dev/null @@ -1,473 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has copied test in its own directory. - -declare float @llvm.aarch64.neon.vminnmv(<4 x float>) - -declare float @llvm.aarch64.neon.vmaxnmv(<4 x float>) - -declare float @llvm.aarch64.neon.vminv(<4 x float>) - -declare float @llvm.aarch64.neon.vmaxv(<4 x float>) - -declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>) - -declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8>) - -declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8>) - -declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32>) - -declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8>) - -declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32>) - -declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8>) - -declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8>) - -declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8>) - -declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32>) - -declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8>) - -declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32>) - -declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8>) - -declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8>) - -declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16>) - -declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8>) - -declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32>) - -declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16>) - -declare <1 x 
i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8>) - -declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32>) - -declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16>) - -declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8>) - -declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16>) - -declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8>) - -declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16>) - -declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8>) - -define i16 @test_vaddlv_s8(<8 x i8> %a) { -; CHECK: test_vaddlv_s8: -; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i16> %saddlv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vaddlv_s16(<4 x i16> %a) { -; CHECK: test_vaddlv_s16: -; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i32> %saddlv.i, i32 0 - ret i32 %0 -} - -define i16 @test_vaddlv_u8(<8 x i8> %a) { -; CHECK: test_vaddlv_u8: -; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i16> %uaddlv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vaddlv_u16(<4 x i16> %a) { -; CHECK: test_vaddlv_u16: -; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i32> %uaddlv.i, i32 0 - ret i32 %0 -} - -define i16 @test_vaddlvq_s8(<16 x i8> %a) { -; CHECK: test_vaddlvq_s8: -; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i16> %saddlv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vaddlvq_s16(<8 x i16> %a) { -; CHECK: test_vaddlvq_s16: -; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i32> %saddlv.i, i32 0 - ret i32 %0 -} - -define i64 @test_vaddlvq_s32(<4 x i32> %a) { -; CHECK: test_vaddlvq_s32: -; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %saddlv.i = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i64> %saddlv.i, i32 0 - ret i64 %0 -} - -define i16 @test_vaddlvq_u8(<16 x i8> %a) { -; CHECK: test_vaddlvq_u8: -; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i16> %uaddlv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vaddlvq_u16(<8 x i16> %a) { -; CHECK: test_vaddlvq_u16: -; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i32> %uaddlv.i, i32 0 - ret i32 %0 -} - -define i64 @test_vaddlvq_u32(<4 x i32> %a) { -; CHECK: test_vaddlvq_u32: -; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %uaddlv.i = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i64> %uaddlv.i, i32 0 - ret i64 %0 -} - -define i8 @test_vmaxv_s8(<8 x i8> %a) { -; CHECK: test_vmaxv_s8: -; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i8> 
%smaxv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vmaxv_s16(<4 x i16> %a) { -; CHECK: test_vmaxv_s16: -; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i16> %smaxv.i, i32 0 - ret i16 %0 -} - -define i8 @test_vmaxv_u8(<8 x i8> %a) { -; CHECK: test_vmaxv_u8: -; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i8> %umaxv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vmaxv_u16(<4 x i16> %a) { -; CHECK: test_vmaxv_u16: -; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i16> %umaxv.i, i32 0 - ret i16 %0 -} - -define i8 @test_vmaxvq_s8(<16 x i8> %a) { -; CHECK: test_vmaxvq_s8: -; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i8> %smaxv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vmaxvq_s16(<8 x i16> %a) { -; CHECK: test_vmaxvq_s16: -; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i16> %smaxv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vmaxvq_s32(<4 x i32> %a) { -; CHECK: test_vmaxvq_s32: -; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %smaxv.i = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i32> %smaxv.i, i32 0 - ret i32 %0 -} - -define i8 @test_vmaxvq_u8(<16 x i8> %a) { -; CHECK: test_vmaxvq_u8: -; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i8> %umaxv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vmaxvq_u16(<8 x i16> %a) { -; CHECK: test_vmaxvq_u16: -; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i16> %umaxv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vmaxvq_u32(<4 x i32> %a) { -; CHECK: test_vmaxvq_u32: -; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %umaxv.i = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i32> %umaxv.i, i32 0 - ret i32 %0 -} - -define i8 @test_vminv_s8(<8 x i8> %a) { -; CHECK: test_vminv_s8: -; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i8> %sminv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vminv_s16(<4 x i16> %a) { -; CHECK: test_vminv_s16: -; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i16> %sminv.i, i32 0 - ret i16 %0 -} - -define i8 @test_vminv_u8(<8 x i8> %a) { -; CHECK: test_vminv_u8: -; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i8> %uminv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vminv_u16(<4 x i16> %a) { -; CHECK: test_vminv_u16: -; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i16> %uminv.i, i32 0 - ret i16 %0 -} - -define i8 
@test_vminvq_s8(<16 x i8> %a) { -; CHECK: test_vminvq_s8: -; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i8> %sminv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vminvq_s16(<8 x i16> %a) { -; CHECK: test_vminvq_s16: -; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i16> %sminv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vminvq_s32(<4 x i32> %a) { -; CHECK: test_vminvq_s32: -; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %sminv.i = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i32> %sminv.i, i32 0 - ret i32 %0 -} - -define i8 @test_vminvq_u8(<16 x i8> %a) { -; CHECK: test_vminvq_u8: -; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i8> %uminv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vminvq_u16(<8 x i16> %a) { -; CHECK: test_vminvq_u16: -; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i16> %uminv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vminvq_u32(<4 x i32> %a) { -; CHECK: test_vminvq_u32: -; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %uminv.i = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i32> %uminv.i, i32 0 - ret i32 %0 -} - -define i8 @test_vaddv_s8(<8 x i8> %a) { -; CHECK: test_vaddv_s8: -; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i8> %vaddv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vaddv_s16(<4 x i16> %a) { -; CHECK: test_vaddv_s16: -; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i16> %vaddv.i, i32 0 - ret i16 %0 -} - -define i8 @test_vaddv_u8(<8 x i8> %a) { -; CHECK: test_vaddv_u8: -; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b -entry: - %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a) - %0 = extractelement <1 x i8> %vaddv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vaddv_u16(<4 x i16> %a) { -; CHECK: test_vaddv_u16: -; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h -entry: - %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a) - %0 = extractelement <1 x i16> %vaddv.i, i32 0 - ret i16 %0 -} - -define i8 @test_vaddvq_s8(<16 x i8> %a) { -; CHECK: test_vaddvq_s8: -; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i8> %vaddv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vaddvq_s16(<8 x i16> %a) { -; CHECK: test_vaddvq_s16: -; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i16> %vaddv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vaddvq_s32(<4 x i32> %a) { -; CHECK: test_vaddvq_s32: -; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i32> %vaddv.i, i32 0 - ret i32 %0 -} - -define i8 @test_vaddvq_u8(<16 x i8> %a) { -; CHECK: 
test_vaddvq_u8: -; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b -entry: - %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a) - %0 = extractelement <1 x i8> %vaddv.i, i32 0 - ret i8 %0 -} - -define i16 @test_vaddvq_u16(<8 x i16> %a) { -; CHECK: test_vaddvq_u16: -; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h -entry: - %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a) - %0 = extractelement <1 x i16> %vaddv.i, i32 0 - ret i16 %0 -} - -define i32 @test_vaddvq_u32(<4 x i32> %a) { -; CHECK: test_vaddvq_u32: -; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a) - %0 = extractelement <1 x i32> %vaddv.i, i32 0 - ret i32 %0 -} - -define float @test_vmaxvq_f32(<4 x float> %a) { -; CHECK: test_vmaxvq_f32: -; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %0 = call float @llvm.aarch64.neon.vmaxv(<4 x float> %a) - ret float %0 -} - -define float @test_vminvq_f32(<4 x float> %a) { -; CHECK: test_vminvq_f32: -; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %0 = call float @llvm.aarch64.neon.vminv(<4 x float> %a) - ret float %0 -} - -define float @test_vmaxnmvq_f32(<4 x float> %a) { -; CHECK: test_vmaxnmvq_f32: -; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %0 = call float @llvm.aarch64.neon.vmaxnmv(<4 x float> %a) - ret float %0 -} - -define float @test_vminnmvq_f32(<4 x float> %a) { -; CHECK: test_vminnmvq_f32: -; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s -entry: - %0 = call float @llvm.aarch64.neon.vminnmv(<4 x float> %a) - ret float %0 -} - diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll deleted file mode 100644 index d304094adb40..000000000000 --- a/test/CodeGen/AArch64/neon-add-pairwise.ll +++ /dev/null @@ -1,102 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 has a copy of this test in its own directory. - -declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. 
-; CHECK: test_addp_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: addp v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_addp_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: addp v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_addp_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: addp v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_addp_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: addp v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_addp_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: addp v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_addp_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: addp v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - - -declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_addp_v2i64: - %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: addp v0.2d, v0.2d, v1.2d - ret <2 x i64> %val -} - -declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_faddp_v2f32: - %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: faddp v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_faddp_v4f32: - %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: faddp v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_faddp_v2f64: - %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: faddp v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -define i32 @test_vaddv.v2i32(<2 x i32> %a) { -; CHECK-LABEL: test_vaddv.v2i32 -; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %1 = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i32> %1, i32 0 - ret i32 %2 -} - -declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32>) diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll deleted file mode 100644 index eebad4df106e..000000000000 --- 
a/test/CodeGen/AArch64/neon-add-sub.ll +++ /dev/null @@ -1,280 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has its own copy of this test - -define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %tmp3 = add <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - %tmp3 = add <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %tmp3 = add <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - %tmp3 = add <8 x i16> %A, %B; - ret <8 x i16> %tmp3 -} - -define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = add <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = add <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %tmp3 = add <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - -define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = fadd <2 x float> %A, %B; - ret <2 x float> %tmp3 -} - -define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = fadd <4 x float> %A, %B; - ret <4 x float> %tmp3 -} -define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %tmp3 = fadd <2 x double> %A, %B; - ret <2 x double> %tmp3 -} - -define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %tmp3 = sub <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - %tmp3 = sub <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %tmp3 = sub <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - %tmp3 = sub <8 x i16> %A, %B; - ret <8 x i16> %tmp3 -} - -define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = sub <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = sub <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %tmp3 = sub <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - -define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = fsub <2 x float> %A, %B; - ret <2 x float> %tmp3 -} - -define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fsub {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = fsub <4 x float> %A, %B; - ret <4 x float> %tmp3 -} -define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %tmp3 = fsub <2 x double> %A, %B; - ret <2 x double> %tmp3 -} - -define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vadd_f64 -; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fadd <1 x double> %a, %b - ret <1 x double> %1 -} - -define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vmul_f64 -; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fmul <1 x double> %a, %b - ret <1 x double> %1 -} - -define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vdiv_f64 -; CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fdiv <1 x double> %a, %b - ret <1 x double> %1 -} - -define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { -; CHECK-LABEL: test_vmla_f64 -; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fmul <1 x double> %b, %c - %2 = fadd <1 x double> %1, %a - ret <1 x double> %2 -} - -define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { -; CHECK-LABEL: test_vmls_f64 -; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fmul <1 x double> %b, %c - %2 = fsub <1 x double> %a, %1 - ret <1 x double> %2 -} - -define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { -; CHECK-LABEL: test_vfms_f64 -; CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fsub <1 x double> <double -0.000000e+00>, %b - %2 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %1, <1 x double> %c, <1 x double> %a) - ret <1 x double> %2 -} - -define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { -; CHECK-LABEL: test_vfma_f64 -; CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vsub_f64 -; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fsub <1 x double> %a, %b - ret <1 x double> %1 -} - -define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vabd_f64 -; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vmax_f64 -; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vmin_f64 -; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vmaxnm_f64 -; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -define <1 x double>
@test_vminnm_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vminnm_f64 -; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -define <1 x double> @test_vabs_f64(<1 x double> %a) { -; CHECK-LABEL: test_vabs_f64 -; CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.fabs.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vneg_f64(<1 x double> %a) { -; CHECK-LABEL: test_vneg_f64 -; CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}} - %1 = fsub <1 x double> <double -0.000000e+00>, %a - ret <1 x double> %1 -} - -declare <1 x double> @llvm.fabs.v1f64(<1 x double>) -declare <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>) - -define <1 x i8> @test_add_v1i8(<1 x i8> %a, <1 x i8> %b) { -;CHECK-LABEL: test_add_v1i8: -;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %c = add <1 x i8> %a, %b - ret <1 x i8> %c -} - -define <1 x i16> @test_add_v1i16(<1 x i16> %a, <1 x i16> %b) { -;CHECK-LABEL: test_add_v1i16: -;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %c = add <1 x i16> %a, %b - ret <1 x i16> %c -} - -define <1 x i32> @test_add_v1i32(<1 x i32> %a, <1 x i32> %b) { -;CHECK-LABEL: test_add_v1i32: -;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %c = add <1 x i32> %a, %b - ret <1 x i32> %c -} - -define <1 x i8> @test_sub_v1i8(<1 x i8> %a, <1 x i8> %b) { -;CHECK-LABEL: test_sub_v1i8: -;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %c = sub <1 x i8> %a, %b - ret <1 x i8> %c -} - -define <1 x i16> @test_sub_v1i16(<1 x i16> %a, <1 x i16> %b) { -;CHECK-LABEL: test_sub_v1i16: -;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %c = sub <1 x i16> %a, %b - ret <1 x i16> %c -} - -define <1 x i32> @test_sub_v1i32(<1 x i32> %a, <1 x i32> %b) { -;CHECK-LABEL: test_sub_v1i32: -;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %c = sub <1 x i32> %a, %b - ret <1 x i32> %c -} diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll index 25819b379322..61099d48fdd2 100644 --- a/test/CodeGen/AArch64/neon-bitcast.ll +++ b/test/CodeGen/AArch64/neon-bitcast.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s ; From <8 x i8> diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll index 228a6bfdf5dd..6497856c7d36 100--- a/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: and8xi8: diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll deleted file mode 100644 index 3182b700d8f7..000000000000
--- a/test/CodeGen/AArch64/neon-bsl.ll +++ /dev/null @@ -1,237 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has no equivalent vbsl intrinsic, always using the and/or IR. The final -; two tests are duplicated by ARM64's vselect.ll test. - -declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>) - -declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) - -declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - -declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) - -declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) - -declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) - -declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) - -declare <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double>, <1 x double>, <1 x double>) - -declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) - -declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) - -declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) - -define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { -; CHECK-LABEL: test_vbsl_s8: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { -; CHECK-LABEL: test_vbsl_s16: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) - %0 = bitcast <4 x i16> %vbsl3.i to <8 x i8> - ret <8 x i8> %0 -} - -define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -; CHECK-LABEL: test_vbsl_s32: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) - ret <2 x i32> %vbsl3.i -} - -define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { -; CHECK-LABEL: test_vbsl_s64: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) - ret <1 x i64> %vbsl3.i -} - -define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { -; CHECK-LABEL: test_vbsl_u8: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) - ret <8 x i8> %vbsl.i -} - -define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { -; CHECK-LABEL: test_vbsl_u16: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) - ret <4 x i16> %vbsl3.i -} - -define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { -; CHECK-LABEL: test_vbsl_u32: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) - ret <2 x i32> %vbsl3.i -} - -define <1 x i64> @test_vbsl_u64(<1 x i64> 
%v1, <1 x i64> %v2, <1 x i64> %v3) { -; CHECK-LABEL: test_vbsl_u64: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) - ret <1 x i64> %vbsl3.i -} - -define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) { -; CHECK-LABEL: test_vbsl_f32: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) - ret <2 x float> %vbsl3.i -} - -define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) { -; CHECK-LABEL: test_vbsl_f64: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl.i = bitcast <1 x i64> %v1 to <1 x double> - %vbsl3.i = tail call <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double> %vbsl.i, <1 x double> %v2, <1 x double> %v3) - ret <1 x double> %vbsl3.i -} - -define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { -; CHECK-LABEL: test_vbsl_p8: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) - ret <8 x i8> %vbsl.i -} - -define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { -; CHECK-LABEL: test_vbsl_p16: -; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -entry: - %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) - ret <4 x i16> %vbsl3.i -} - -define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: test_vbslq_s8: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) - ret <16 x i8> %vbsl.i -} - -define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { -; CHECK-LABEL: test_vbslq_s16: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) - ret <8 x i16> %vbsl3.i -} - -define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; CHECK-LABEL: test_vbslq_s32: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) - ret <4 x i32> %vbsl3.i -} - -define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { -; CHECK-LABEL: test_vbslq_s64: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) - ret <2 x i64> %vbsl3.i -} - -define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: test_vbslq_u8: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) - ret <16 x i8> %vbsl.i -} - -define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { -; CHECK-LABEL: test_vbslq_u16: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) - ret <8 x i16> %vbsl3.i -} - -define <4 x i32> @test_vbslq_u32(<4 x 
i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { -; CHECK-LABEL: test_vbslq_u32: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) - ret <4 x i32> %vbsl3.i -} - -define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { -; CHECK-LABEL: test_vbslq_u64: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) - ret <2 x i64> %vbsl3.i -} - -define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) { -; CHECK-LABEL: test_vbslq_f32: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl.i = bitcast <4 x i32> %v1 to <4 x float> - %vbsl3.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %vbsl.i, <4 x float> %v2, <4 x float> %v3) - ret <4 x float> %vbsl3.i -} - -define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { -; CHECK-LABEL: test_vbslq_p8: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) - ret <16 x i8> %vbsl.i -} - -define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { -; CHECK-LABEL: test_vbslq_p16: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) - ret <8 x i16> %vbsl3.i -} - -define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) { -; CHECK-LABEL: test_vbslq_f64: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %vbsl.i = bitcast <2 x i64> %v1 to <2 x double> - %vbsl3.i = tail call <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double> %vbsl.i, <2 x double> %v2, <2 x double> %v3) - ret <2 x double> %vbsl3.i -} - -define <2 x double> @test_bsl_v2f64(<2 x i1> %v1, <2 x double> %v2, <2 x double> %v3) { -; CHECK-LABEL: test_bsl_v2f64: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - %1 = select <2 x i1> %v1, <2 x double> %v2, <2 x double> %v3 - ret <2 x double> %1 -} - -define <4 x float> @test_bsl_v4f32(<4 x i1> %v1, <4 x float> %v2, <4 x float> %v3) { -; CHECK-LABEL: test_bsl_v4f32: -; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - %1 = select <4 x i1> %v1, <4 x float> %v2, <4 x float> %v3 - ret <4 x float> %1 -} diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll index e029cfcf3394..6d89dfbacf41 100644 --- a/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) { ; CHECK-LABEL: cmeq8xi8: diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll deleted file mode 100644 index 096018ab886a..000000000000 --- a/test/CodeGen/AArch64/neon-copy.ll +++ /dev/null @@ -1,1402 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; arm64 has copied equivalent test due to intrinsics. 
- -define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) { -;CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}} - %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15 - ret <16 x i8> %tmp3 -} - -define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) { -;CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}} - %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6 - ret <8 x i16> %tmp3 -} - -define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}} - %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2 - ret <4 x i32> %tmp3 -} - -define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}} - %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1 - ret <2 x i64> %tmp3 -} - -define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) { -;CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}} - %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5 - ret <8 x i8> %tmp3 -} - -define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) { -;CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}} - %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3 - ret <4 x i16> %tmp3 -} - -define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}} - %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 - ret <2 x i32> %tmp3 -} - -define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) { -;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2] - %tmp3 = extractelement <16 x i8> %tmp1, i32 2 - %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 - ret <16 x i8> %tmp4 -} - -define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) { -;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2] - %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 - ret <8 x i16> %tmp4 -} - -define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] - %tmp3 = extractelement <4 x i32> %tmp1, i32 2 - %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 - ret <4 x i32> %tmp4 -} - -define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <2 x i64> %tmp1, i32 0 - %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 - ret <2 x i64> %tmp4 -} - -define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] - %tmp3 = extractelement <4 x float> %tmp1, i32 2 - %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 - ret <4 x float> %tmp4 -} - -define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <2 x double> %tmp1, i32 0 - %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 - ret <2 x double> %tmp4 -} - -define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) { -;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2] - %tmp3 = extractelement <8 x i8> %tmp1, i32 2 - %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 - ret <16 x i8> %tmp4 -} - -define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) { -;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2] - %tmp3 = extractelement <4 x i16> %tmp1, i32 2 - %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 - ret <8 x i16> %tmp4 -} - -define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1] - %tmp3 = extractelement <2 x i32> %tmp1, i32 1 - %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 - ret <4 x i32> %tmp4 -} - -define <2 x i64> @ins1d2(<1 x i64> %tmp1, 
<2 x i64> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <1 x i64> %tmp1, i32 0 - %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 - ret <2 x i64> %tmp4 -} - -define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1] - %tmp3 = extractelement <2 x float> %tmp1, i32 1 - %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 - ret <4 x float> %tmp4 -} - -define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <1 x double> %tmp1, i32 0 - %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 - ret <2 x double> %tmp4 -} - -define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) { -;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2] - %tmp3 = extractelement <16 x i8> %tmp1, i32 2 - %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7 - ret <8 x i8> %tmp4 -} - -define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) { -;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2] - %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 - ret <4 x i16> %tmp4 -} - -define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] - %tmp3 = extractelement <4 x i32> %tmp1, i32 2 - %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 - ret <2 x i32> %tmp4 -} - -define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <2 x i64> %tmp1, i32 0 - %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 - ret <1 x i64> %tmp4 -} - -define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2] - %tmp3 = extractelement <4 x float> %tmp1, i32 2 - %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 - ret <2 x float> %tmp4 -} - -define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <2 x double> %tmp1, i32 0 - %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 - ret <1 x double> %tmp4 -} - -define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) { -;CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2] - %tmp3 = extractelement <8 x i8> %tmp1, i32 2 - %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4 - ret <8 x i8> %tmp4 -} - -define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) { -;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2] - %tmp3 = extractelement <4 x i16> %tmp1, i32 2 - %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 - ret <4 x i16> %tmp4 -} - -define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] - %tmp3 = extractelement <2 x i32> %tmp1, i32 0 - %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 - ret <2 x i32> %tmp4 -} - -define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] - %tmp3 = extractelement <1 x i64> %tmp1, i32 0 - %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 - ret <1 x i64> %tmp4 -} - -define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) { -;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] - %tmp3 = extractelement <2 x float> %tmp1, i32 0 - %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 - ret <2 x float> %tmp4 -} - -define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) { -;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0] - 
%tmp3 = extractelement <1 x double> %tmp1, i32 0 - %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 - ret <1 x double> %tmp4 -} - -define i32 @umovw16b(<16 x i8> %tmp1) { -;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8] - %tmp3 = extractelement <16 x i8> %tmp1, i32 8 - %tmp4 = zext i8 %tmp3 to i32 - ret i32 %tmp4 -} - -define i32 @umovw8h(<8 x i16> %tmp1) { -;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2] - %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = zext i16 %tmp3 to i32 - ret i32 %tmp4 -} - -define i32 @umovw4s(<4 x i32> %tmp1) { -;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[2] - %tmp3 = extractelement <4 x i32> %tmp1, i32 2 - ret i32 %tmp3 -} - -define i64 @umovx2d(<2 x i64> %tmp1) { -;CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[0] - %tmp3 = extractelement <2 x i64> %tmp1, i32 0 - ret i64 %tmp3 -} - -define i32 @umovw8b(<8 x i8> %tmp1) { -;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7] - %tmp3 = extractelement <8 x i8> %tmp1, i32 7 - %tmp4 = zext i8 %tmp3 to i32 - ret i32 %tmp4 -} - -define i32 @umovw4h(<4 x i16> %tmp1) { -;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2] - %tmp3 = extractelement <4 x i16> %tmp1, i32 2 - %tmp4 = zext i16 %tmp3 to i32 - ret i32 %tmp4 -} - -define i32 @umovw2s(<2 x i32> %tmp1) { -;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1] - %tmp3 = extractelement <2 x i32> %tmp1, i32 1 - ret i32 %tmp3 -} - -define i64 @umovx1d(<1 x i64> %tmp1) { -;CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - %tmp3 = extractelement <1 x i64> %tmp1, i32 0 - ret i64 %tmp3 -} - -define i32 @smovw16b(<16 x i8> %tmp1) { -;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8] - %tmp3 = extractelement <16 x i8> %tmp1, i32 8 - %tmp4 = sext i8 %tmp3 to i32 - %tmp5 = add i32 5, %tmp4 - ret i32 %tmp5 -} - -define i32 @smovw8h(<8 x i16> %tmp1) { -;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2] - %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = sext i16 %tmp3 to i32 - %tmp5 = add i32 5, %tmp4 - ret i32 %tmp5 -} - -define i32 @smovx16b(<16 x i8> %tmp1) { -;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8] - %tmp3 = extractelement <16 x i8> %tmp1, i32 8 - %tmp4 = sext i8 %tmp3 to i32 - ret i32 %tmp4 -} - -define i32 @smovx8h(<8 x i16> %tmp1) { -;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] - %tmp3 = extractelement <8 x i16> %tmp1, i32 2 - %tmp4 = sext i16 %tmp3 to i32 - ret i32 %tmp4 -} - -define i64 @smovx4s(<4 x i32> %tmp1) { -;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2] - %tmp3 = extractelement <4 x i32> %tmp1, i32 2 - %tmp4 = sext i32 %tmp3 to i64 - ret i64 %tmp4 -} - -define i32 @smovw8b(<8 x i8> %tmp1) { -;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4] - %tmp3 = extractelement <8 x i8> %tmp1, i32 4 - %tmp4 = sext i8 %tmp3 to i32 - %tmp5 = add i32 5, %tmp4 - ret i32 %tmp5 -} - -define i32 @smovw4h(<4 x i16> %tmp1) { -;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2] - %tmp3 = extractelement <4 x i16> %tmp1, i32 2 - %tmp4 = sext i16 %tmp3 to i32 - %tmp5 = add i32 5, %tmp4 - ret i32 %tmp5 -} - -define i32 @smovx8b(<8 x i8> %tmp1) { -;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[6] - %tmp3 = extractelement <8 x i8> %tmp1, i32 6 - %tmp4 = sext i8 %tmp3 to i32 - ret i32 %tmp4 -} - -define i32 @smovx4h(<4 x i16> %tmp1) { -;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2] - %tmp3 = extractelement <4 x i16> %tmp1, i32 2 - %tmp4 = sext i16 %tmp3 to i32 - ret i32 %tmp4 -} - -define i64 @smovx2s(<2 x i32> %tmp1) { -;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1] - %tmp3 = extractelement <2 x i32> %tmp1, i32 1 - %tmp4 = sext i32 %tmp3 to i64 - ret i64 %tmp4 -} - -define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) { -;CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3] 
- %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> - ret <8 x i8> %vset_lane -} - -define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) { -;CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6] - %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> - ret <16 x i8> %vset_lane -} - -define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) { -;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0] - %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> - ret <8 x i8> %vset_lane -} - -define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) { -;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15] - %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> - ret <16 x i8> %vset_lane -} - -define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} - %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0 - %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1 - %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2 - %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3 - %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4 - %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5 - %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6 - %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7 - ret <8 x i8> %vecinit7.i -} - -define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}} - %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0 - %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1 - %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2 - %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3 - ret <4 x i16> %vecinit3.i -} - -define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}} - %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0 - %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1 - ret <2 x i32> %vecinit1.i -} - -define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 { -;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0 - ret <1 x i64> %vecinit.i -} - -define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}} - %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0 - %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1 - %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2 - %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3 - %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4 - %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5 - %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6 - %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7 - %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8 - %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9 - %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10 - %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11 - %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12 - %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13 - %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14 - %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15 - ret <16 x i8> %vecinit15.i -} - -define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} - %vecinit.i = insertelement <8 x 
i16> undef, i16 %v1, i32 0 - %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1 - %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2 - %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3 - %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4 - %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5 - %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6 - %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7 - ret <8 x i16> %vecinit7.i -} - -define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} - %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0 - %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1 - %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2 - %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3 - ret <4 x i32> %vecinit3.i -} - -define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 { -;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} - %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0 - %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1 - ret <2 x i64> %vecinit1.i -} - -define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] - %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle -} - -define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2] - %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle -} - -define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] - %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle -} - -define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 { -;CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] - %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> - ret <16 x i8> %shuffle -} - -define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 { -;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2] - %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> - ret <8 x i16> %shuffle -} - -define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 { -;CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] - %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> - ret <4 x i32> %shuffle -} - -define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 { -;CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] - %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer - ret <2 x i64> %shuffle -} - -define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] - %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle -} - -define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2] - %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle -} - -define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] - %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle -} - -define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] - %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> - ret <16 x i8> %shuffle -} - -define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 { -;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2] - %shuffle = shufflevector <8 x i16> %v1, <8 
x i16> undef, <8 x i32> - ret <8 x i16> %shuffle -} - -define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] - %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> - ret <4 x i32> %shuffle -} - -define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 { -;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] - %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer - ret <2 x i64> %shuffle -} - -define i64 @test_bitcastv8i8toi64(<8 x i8> %in) { -; CHECK-LABEL: test_bitcastv8i8toi64: - %res = bitcast <8 x i8> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - ret i64 %res -} - -define i64 @test_bitcastv4i16toi64(<4 x i16> %in) { -; CHECK-LABEL: test_bitcastv4i16toi64: - %res = bitcast <4 x i16> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - ret i64 %res -} - -define i64 @test_bitcastv2i32toi64(<2 x i32> %in) { -; CHECK-LABEL: test_bitcastv2i32toi64: - %res = bitcast <2 x i32> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - ret i64 %res -} - -define i64 @test_bitcastv2f32toi64(<2 x float> %in) { -; CHECK-LABEL: test_bitcastv2f32toi64: - %res = bitcast <2 x float> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - ret i64 %res -} - -define i64 @test_bitcastv1i64toi64(<1 x i64> %in) { -; CHECK-LABEL: test_bitcastv1i64toi64: - %res = bitcast <1 x i64> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - ret i64 %res -} - -define i64 @test_bitcastv1f64toi64(<1 x double> %in) { -; CHECK-LABEL: test_bitcastv1f64toi64: - %res = bitcast <1 x double> %in to i64 -; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} - ret i64 %res -} - -define <8 x i8> @test_bitcasti64tov8i8(i64 %in) { -; CHECK-LABEL: test_bitcasti64tov8i8: - %res = bitcast i64 %in to <8 x i8> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - ret <8 x i8> %res -} - -define <4 x i16> @test_bitcasti64tov4i16(i64 %in) { -; CHECK-LABEL: test_bitcasti64tov4i16: - %res = bitcast i64 %in to <4 x i16> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - ret <4 x i16> %res -} - -define <2 x i32> @test_bitcasti64tov2i32(i64 %in) { -; CHECK-LABEL: test_bitcasti64tov2i32: - %res = bitcast i64 %in to <2 x i32> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - ret <2 x i32> %res -} - -define <2 x float> @test_bitcasti64tov2f32(i64 %in) { -; CHECK-LABEL: test_bitcasti64tov2f32: - %res = bitcast i64 %in to <2 x float> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - ret <2 x float> %res -} - -define <1 x i64> @test_bitcasti64tov1i64(i64 %in) { -; CHECK-LABEL: test_bitcasti64tov1i64: - %res = bitcast i64 %in to <1 x i64> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - ret <1 x i64> %res -} - -define <1 x double> @test_bitcasti64tov1f64(i64 %in) { -; CHECK-LABEL: test_bitcasti64tov1f64: - %res = bitcast i64 %in to <1 x double> -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} - ret <1 x double> %res -} - -define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 { -; CHECK-LABEL: test_bitcastv8i8tov1f64: -; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} - %sub.i = sub <8 x i8> zeroinitializer, %a - %1 = bitcast <8 x i8> %sub.i to <1 x double> - %vcvt.i = fptosi <1 x double> %1 to <1 x i64> - ret <1 x i64> %vcvt.i -} - -define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 { -; CHECK-LABEL: test_bitcastv4i16tov1f64: -; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h -; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} - %sub.i = sub <4 x i16> zeroinitializer, %a - %1 = bitcast <4 x i16> %sub.i to <1 x double> - %vcvt.i = fptosi <1 x double> %1 to <1 x i64> - ret <1 x i64> %vcvt.i -} - 
-define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 { -; CHECK-LABEL: test_bitcastv2i32tov1f64: -; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} - %sub.i = sub <2 x i32> zeroinitializer, %a - %1 = bitcast <2 x i32> %sub.i to <1 x double> - %vcvt.i = fptosi <1 x double> %1 to <1 x i64> - ret <1 x i64> %vcvt.i -} - -define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 { -; CHECK-LABEL: test_bitcastv1i64tov1f64: -; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} - %sub.i = sub <1 x i64> zeroinitializer, %a - %1 = bitcast <1 x i64> %sub.i to <1 x double> - %vcvt.i = fptosi <1 x double> %1 to <1 x i64> - ret <1 x i64> %vcvt.i -} - -define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 { -; CHECK-LABEL: test_bitcastv2f32tov1f64: -; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}} - %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a - %1 = bitcast <2 x float> %sub.i to <1 x double> - %vcvt.i = fptosi <1 x double> %1 to <1 x i64> - ret <1 x i64> %vcvt.i -} - -define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 { -; CHECK-LABEL: test_bitcastv1f64tov8i8: -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %vcvt.i = sitofp <1 x i64> %a to <1 x double> - %1 = bitcast <1 x double> %vcvt.i to <8 x i8> - %sub.i = sub <8 x i8> zeroinitializer, %1 - ret <8 x i8> %sub.i -} - -define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 { -; CHECK-LABEL: test_bitcastv1f64tov4i16: -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %vcvt.i = sitofp <1 x i64> %a to <1 x double> - %1 = bitcast <1 x double> %vcvt.i to <4 x i16> - %sub.i = sub <4 x i16> zeroinitializer, %1 - ret <4 x i16> %sub.i -} - -define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 { -; CHECK-LABEL: test_bitcastv1f64tov2i32: -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %vcvt.i = sitofp <1 x i64> %a to <1 x double> - %1 = bitcast <1 x double> %vcvt.i to <2 x i32> - %sub.i = sub <2 x i32> zeroinitializer, %1 - ret <2 x i32> %sub.i -} - -define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 { -; CHECK-LABEL: test_bitcastv1f64tov1i64: -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}} - %vcvt.i = sitofp <1 x i64> %a to <1 x double> - %1 = bitcast <1 x double> %vcvt.i to <1 x i64> - %sub.i = sub <1 x i64> zeroinitializer, %1 - ret <1 x i64> %sub.i -} - -define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 { -; CHECK-LABEL: test_bitcastv1f64tov2f32: -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %vcvt.i = sitofp <1 x i64> %a to <1 x double> - %1 = bitcast <1 x double> %vcvt.i to <2 x float> - %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %1 - ret <2 x float> %sub.i -} - -; Test insert element into an undef vector -define <8 x i8> @scalar_to_vector.v8i8(i8 %a) { -; CHECK-LABEL: scalar_to_vector.v8i8: -; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}} - %b = insertelement <8 x i8> undef, i8 %a, i32 0 - ret <8 x i8> %b -} - -define <16 x i8> @scalar_to_vector.v16i8(i8 %a) { -; CHECK-LABEL: scalar_to_vector.v16i8: -; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}} - %b = insertelement <16 x i8> undef, i8 %a, i32 0 - ret <16 x i8> %b -} - -define <4 x i16> @scalar_to_vector.v4i16(i16 %a) { -; CHECK-LABEL: scalar_to_vector.v4i16: -; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}} - %b = insertelement <4 x i16> undef, i16 %a, i32 0 - ret
<4 x i16> %b -} - -define <8 x i16> @scalar_to_vector.v8i16(i16 %a) { -; CHECK-LABEL: scalar_to_vector.v8i16: -; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}} - %b = insertelement <8 x i16> undef, i16 %a, i32 0 - ret <8 x i16> %b -} - -define <2 x i32> @scalar_to_vector.v2i32(i32 %a) { -; CHECK-LABEL: scalar_to_vector.v2i32: -; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}} - %b = insertelement <2 x i32> undef, i32 %a, i32 0 - ret <2 x i32> %b -} - -define <4 x i32> @scalar_to_vector.v4i32(i32 %a) { -; CHECK-LABEL: scalar_to_vector.v4i32: -; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}} - %b = insertelement <4 x i32> undef, i32 %a, i32 0 - ret <4 x i32> %b -} - -define <2 x i64> @scalar_to_vector.v2i64(i64 %a) { -; CHECK-LABEL: scalar_to_vector.v2i64: -; CHECK: ins {{v[0-9]+}}.d[0], {{x[0-9]+}} - %b = insertelement <2 x i64> undef, i64 %a, i32 0 - ret <2 x i64> %b -} - -define <8 x i8> @testDUP.v1i8(<1 x i8> %a) { -; CHECK-LABEL: testDUP.v1i8: -; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} - %b = extractelement <1 x i8> %a, i32 0 - %c = insertelement <8 x i8> undef, i8 %b, i32 0 - %d = insertelement <8 x i8> %c, i8 %b, i32 1 - %e = insertelement <8 x i8> %d, i8 %b, i32 2 - %f = insertelement <8 x i8> %e, i8 %b, i32 3 - %g = insertelement <8 x i8> %f, i8 %b, i32 4 - %h = insertelement <8 x i8> %g, i8 %b, i32 5 - %i = insertelement <8 x i8> %h, i8 %b, i32 6 - %j = insertelement <8 x i8> %i, i8 %b, i32 7 - ret <8 x i8> %j -} - -define <8 x i16> @testDUP.v1i16(<1 x i16> %a) { -; CHECK-LABEL: testDUP.v1i16: -; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} - %b = extractelement <1 x i16> %a, i32 0 - %c = insertelement <8 x i16> undef, i16 %b, i32 0 - %d = insertelement <8 x i16> %c, i16 %b, i32 1 - %e = insertelement <8 x i16> %d, i16 %b, i32 2 - %f = insertelement <8 x i16> %e, i16 %b, i32 3 - %g = insertelement <8 x i16> %f, i16 %b, i32 4 - %h = insertelement <8 x i16> %g, i16 %b, i32 5 - %i = insertelement <8 x i16> %h, i16 %b, i32 6 - %j = insertelement <8 x i16> %i, i16 %b, i32 7 - ret <8 x i16> %j -} - -define <4 x i32> @testDUP.v1i32(<1 x i32> %a) { -; CHECK-LABEL: testDUP.v1i32: -; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} - %b = extractelement <1 x i32> %a, i32 0 - %c = insertelement <4 x i32> undef, i32 %b, i32 0 - %d = insertelement <4 x i32> %c, i32 %b, i32 1 - %e = insertelement <4 x i32> %d, i32 %b, i32 2 - %f = insertelement <4 x i32> %e, i32 %b, i32 3 - ret <4 x i32> %f -} - -define <8 x i8> @getl(<16 x i8> %x) #0 { -; CHECK-LABEL: getl: -; CHECK: ret - %vecext = extractelement <16 x i8> %x, i32 0 - %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 - %vecext1 = extractelement <16 x i8> %x, i32 1 - %vecinit2 = insertelement <8 x i8> %vecinit, i8 %vecext1, i32 1 - %vecext3 = extractelement <16 x i8> %x, i32 2 - %vecinit4 = insertelement <8 x i8> %vecinit2, i8 %vecext3, i32 2 - %vecext5 = extractelement <16 x i8> %x, i32 3 - %vecinit6 = insertelement <8 x i8> %vecinit4, i8 %vecext5, i32 3 - %vecext7 = extractelement <16 x i8> %x, i32 4 - %vecinit8 = insertelement <8 x i8> %vecinit6, i8 %vecext7, i32 4 - %vecext9 = extractelement <16 x i8> %x, i32 5 - %vecinit10 = insertelement <8 x i8> %vecinit8, i8 %vecext9, i32 5 - %vecext11 = extractelement <16 x i8> %x, i32 6 - %vecinit12 = insertelement <8 x i8> %vecinit10, i8 %vecext11, i32 6 - %vecext13 = extractelement <16 x i8> %x, i32 7 - %vecinit14 = insertelement <8 x i8> %vecinit12, i8 %vecext13, i32 7 - ret <8 x i8> %vecinit14 -} - -define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) { -; CHECK-LABEL: test_dup_v2i32_v4i16: -; CHECK: dup v0.4h, v0.h[2] 
-entry: - %x = extractelement <2 x i32> %a, i32 1 - %vget_lane = trunc i32 %x to i16 - %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 - %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 - %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 - %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 - ret <4 x i16> %vecinit3.i -} - -define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) { -; CHECK-LABEL: test_dup_v4i32_v8i16: -; CHECK: dup v0.8h, v0.h[6] -entry: - %x = extractelement <4 x i32> %a, i32 3 - %vget_lane = trunc i32 %x to i16 - %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 - %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 - %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2 - %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3 - %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4 - %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5 - %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6 - %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7 - ret <8 x i16> %vecinit7.i -} - -define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) { -; CHECK-LABEL: test_dup_v1i64_v4i16: -; CHECK: dup v0.4h, v0.h[0] -entry: - %x = extractelement <1 x i64> %a, i32 0 - %vget_lane = trunc i64 %x to i16 - %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 - %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 - %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 - %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 - ret <4 x i16> %vecinit3.i -} - -define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) { -; CHECK-LABEL: test_dup_v1i64_v2i32: -; CHECK: dup v0.2s, v0.s[0] -entry: - %x = extractelement <1 x i64> %a, i32 0 - %vget_lane = trunc i64 %x to i32 - %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0 - %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1 - ret <2 x i32> %vecinit1.i -} - -define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) { -; CHECK-LABEL: test_dup_v2i64_v8i16: -; CHECK: dup v0.8h, v0.h[4] -entry: - %x = extractelement <2 x i64> %a, i32 1 - %vget_lane = trunc i64 %x to i16 - %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 - %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 - %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2 - %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3 - %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4 - %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5 - %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6 - %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7 - ret <8 x i16> %vecinit7.i -} - -define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) { -; CHECK-LABEL: test_dup_v2i64_v4i32: -; CHECK: dup v0.4s, v0.s[2] -entry: - %x = extractelement <2 x i64> %a, i32 1 - %vget_lane = trunc i64 %x to i32 - %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0 - %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1 - %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2 - %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3 - ret <4 x i32> %vecinit3.i -} - -define <4 x i16> 
@test_dup_v4i32_v4i16(<4 x i32> %a) { -; CHECK-LABEL: test_dup_v4i32_v4i16: -; CHECK: dup v0.4h, v0.h[2] -entry: - %x = extractelement <4 x i32> %a, i32 1 - %vget_lane = trunc i32 %x to i16 - %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 - %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 - %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 - %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 - ret <4 x i16> %vecinit3.i -} - -define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { -; CHECK-LABEL: test_dup_v2i64_v4i16: -; CHECK: dup v0.4h, v0.h[0] -entry: - %x = extractelement <2 x i64> %a, i32 0 - %vget_lane = trunc i64 %x to i16 - %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 - %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 - %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 - %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 - ret <4 x i16> %vecinit3.i -} - -define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { -; CHECK-LABEL: test_dup_v2i64_v2i32: -; CHECK: dup v0.2s, v0.s[0] -entry: - %x = extractelement <2 x i64> %a, i32 0 - %vget_lane = trunc i64 %x to i32 - %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0 - %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1 - ret <2 x i32> %vecinit1.i -} - - -define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_scalar_to_vector_f32_to_v2f32: -; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s -; CHECK-NEXT: ret -entry: - %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) - %1 = insertelement <1 x float> undef, float %0, i32 0 - %2 = extractelement <1 x float> %1, i32 0 - %vecinit1.i = insertelement <2 x float> undef, float %2, i32 0 - ret <2 x float> %vecinit1.i -} - -define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) { -; CHECK-LABEL: test_scalar_to_vector_f32_to_v4f32: -; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s -; CHECK-NEXT: ret -entry: - %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) - %1 = insertelement <1 x float> undef, float %0, i32 0 - %2 = extractelement <1 x float> %1, i32 0 - %vecinit1.i = insertelement <4 x float> undef, float %2, i32 0 - ret <4 x float> %vecinit1.i -} - -declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>) - -define <2 x i32> @test_concat_undef_v1i32(<1 x i32> %a) { -; CHECK-LABEL: test_concat_undef_v1i32: -; CHECK: ins v{{[0-9]+}}.s[1], v{{[0-9]+}}.s[0] -entry: - %0 = extractelement <1 x i32> %a, i32 0 - %vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1 - ret <2 x i32> %vecinit1.i -} - -declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) #4 - -define <2 x i32> @test_concat_v1i32_undef(<1 x i32> %a) { -; CHECK-LABEL: test_concat_v1i32_undef: -; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: ret -entry: - %b = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a) - %0 = extractelement <1 x i32> %b, i32 0 - %vecinit.i432 = insertelement <2 x i32> undef, i32 %0, i32 0 - ret <2 x i32> %vecinit.i432 -} - -define <2 x i32> @test_concat_same_v1i32_v1i32(<1 x i32> %a) { -; CHECK-LABEL: test_concat_same_v1i32_v1i32: -; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] -entry: - %0 = extractelement <1 x i32> %a, i32 0 - %vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0 - %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %0, i32 1 - ret <2 x i32> %vecinit1.i -} - -define <2 x i32> 
@test_concat_diff_v1i32_v1i32(<1 x i32> %a, <1 x i32> %b) { -; CHECK-LABEL: test_concat_diff_v1i32_v1i32: -; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: ins v0.s[1], v1.s[0] -entry: - %c = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a) - %d = extractelement <1 x i32> %c, i32 0 - %e = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %b) - %f = extractelement <1 x i32> %e, i32 0 - %h = shufflevector <1 x i32> %c, <1 x i32> %e, <2 x i32> - ret <2 x i32> %h -} - -define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { -; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> - ret <16 x i8> %vecinit30 -} - -define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { -; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <8 x i8> %x, i32 0 - %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 - %vecext1 = extractelement <8 x i8> %x, i32 1 - %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1 - %vecext3 = extractelement <8 x i8> %x, i32 2 - %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2 - %vecext5 = extractelement <8 x i8> %x, i32 3 - %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3 - %vecext7 = extractelement <8 x i8> %x, i32 4 - %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4 - %vecext9 = extractelement <8 x i8> %x, i32 5 - %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5 - %vecext11 = extractelement <8 x i8> %x, i32 6 - %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6 - %vecext13 = extractelement <8 x i8> %x, i32 7 - %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7 - %vecinit30 = shufflevector <16 x i8> %vecinit14, <16 x i8> %y, <16 x i32> - ret <16 x i8> %vecinit30 -} - -define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { -; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <16 x i8> %x, i32 0 - %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 - %vecext1 = extractelement <16 x i8> %x, i32 1 - %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1 - %vecext3 = extractelement <16 x i8> %x, i32 2 - %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2 - %vecext5 = extractelement <16 x i8> %x, i32 3 - %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3 - %vecext7 = extractelement <16 x i8> %x, i32 4 - %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4 - %vecext9 = extractelement <16 x i8> %x, i32 5 - %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5 - %vecext11 = extractelement <16 x i8> %x, i32 6 - %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6 - %vecext13 = extractelement <16 x i8> %x, i32 7 - %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7 - %vecext15 = extractelement <8 x i8> %y, i32 0 - %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8 - %vecext17 = extractelement <8 x i8> %y, i32 1 - %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9 - %vecext19 = extractelement <8 x i8> %y, i32 2 - %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10 - %vecext21 = extractelement <8 x i8> %y, i32 
3 - %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11 - %vecext23 = extractelement <8 x i8> %y, i32 4 - %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12 - %vecext25 = extractelement <8 x i8> %y, i32 5 - %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13 - %vecext27 = extractelement <8 x i8> %y, i32 6 - %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14 - %vecext29 = extractelement <8 x i8> %y, i32 7 - %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15 - ret <16 x i8> %vecinit30 -} - -define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { -; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <8 x i8> %x, i32 0 - %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 - %vecext1 = extractelement <8 x i8> %x, i32 1 - %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1 - %vecext3 = extractelement <8 x i8> %x, i32 2 - %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2 - %vecext5 = extractelement <8 x i8> %x, i32 3 - %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3 - %vecext7 = extractelement <8 x i8> %x, i32 4 - %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4 - %vecext9 = extractelement <8 x i8> %x, i32 5 - %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5 - %vecext11 = extractelement <8 x i8> %x, i32 6 - %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6 - %vecext13 = extractelement <8 x i8> %x, i32 7 - %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7 - %vecext15 = extractelement <8 x i8> %y, i32 0 - %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8 - %vecext17 = extractelement <8 x i8> %y, i32 1 - %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9 - %vecext19 = extractelement <8 x i8> %y, i32 2 - %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10 - %vecext21 = extractelement <8 x i8> %y, i32 3 - %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11 - %vecext23 = extractelement <8 x i8> %y, i32 4 - %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12 - %vecext25 = extractelement <8 x i8> %y, i32 5 - %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13 - %vecext27 = extractelement <8 x i8> %y, i32 6 - %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14 - %vecext29 = extractelement <8 x i8> %y, i32 7 - %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15 - ret <16 x i8> %vecinit30 -} - -define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { -; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> - ret <8 x i16> %vecinit14 -} - -define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { -; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <4 x i16> %x, i32 0 - %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 - %vecext1 = extractelement <4 x i16> %x, i32 1 - %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1 - %vecext3 = extractelement <4 x i16> %x, i32 2 - %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2 - %vecext5 = extractelement <4 x i16> %x, i32 3 - %vecinit6 = 
insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3 - %vecinit14 = shufflevector <8 x i16> %vecinit6, <8 x i16> %y, <8 x i32> - ret <8 x i16> %vecinit14 -} - -define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { -; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <8 x i16> %x, i32 0 - %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 - %vecext1 = extractelement <8 x i16> %x, i32 1 - %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1 - %vecext3 = extractelement <8 x i16> %x, i32 2 - %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2 - %vecext5 = extractelement <8 x i16> %x, i32 3 - %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3 - %vecext7 = extractelement <4 x i16> %y, i32 0 - %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4 - %vecext9 = extractelement <4 x i16> %y, i32 1 - %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5 - %vecext11 = extractelement <4 x i16> %y, i32 2 - %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6 - %vecext13 = extractelement <4 x i16> %y, i32 3 - %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7 - ret <8 x i16> %vecinit14 -} - -define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { -; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <4 x i16> %x, i32 0 - %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 - %vecext1 = extractelement <4 x i16> %x, i32 1 - %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1 - %vecext3 = extractelement <4 x i16> %x, i32 2 - %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2 - %vecext5 = extractelement <4 x i16> %x, i32 3 - %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3 - %vecext7 = extractelement <4 x i16> %y, i32 0 - %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4 - %vecext9 = extractelement <4 x i16> %y, i32 1 - %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5 - %vecext11 = extractelement <4 x i16> %y, i32 2 - %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6 - %vecext13 = extractelement <4 x i16> %y, i32 3 - %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7 - ret <8 x i16> %vecinit14 -} - -define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { -; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> - ret <4 x i32> %vecinit6 -} - -define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { -; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <2 x i32> %x, i32 0 - %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 - %vecext1 = extractelement <2 x i32> %x, i32 1 - %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 - %vecinit6 = shufflevector <4 x i32> %vecinit2, <4 x i32> %y, <4 x i32> - ret <4 x i32> %vecinit6 -} - -define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { -; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <4 x i32> %x, i32 0 - %vecinit = insertelement <4 x i32> undef, i32 
%vecext, i32 0 - %vecext1 = extractelement <4 x i32> %x, i32 1 - %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 - %vecext3 = extractelement <2 x i32> %y, i32 0 - %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 - %vecext5 = extractelement <2 x i32> %y, i32 1 - %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3 - ret <4 x i32> %vecinit6 -} - -define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 { -; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <2 x i32> %x, i32 0 - %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 - %vecext1 = extractelement <2 x i32> %x, i32 1 - %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 - %vecext3 = extractelement <2 x i32> %y, i32 0 - %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 - %vecext5 = extractelement <2 x i32> %y, i32 1 - %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3 - ret <4 x i32> %vecinit6 -} - -define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 { -; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> - ret <2 x i64> %vecinit2 -} - -define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 { -; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <1 x i64> %x, i32 0 - %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 - %vecinit2 = shufflevector <2 x i64> %vecinit, <2 x i64> %y, <2 x i32> - ret <2 x i64> %vecinit2 -} - -define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { -; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <2 x i64> %x, i32 0 - %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 - %vecext1 = extractelement <1 x i64> %y, i32 0 - %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1 - ret <2 x i64> %vecinit2 -} - -define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 { -; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -entry: - %vecext = extractelement <1 x i64> %x, i32 0 - %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 - %vecext1 = extractelement <1 x i64> %y, i32 0 - %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1 - ret <2 x i64> %vecinit2 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>) - -; This case tests the copy of two FPR8 registers, which is implemented by fmov -; of two FPR32 registers. 
-define <1 x i8> @test_copy_FPR8_FPR8(<1 x i8> %a, <1 x i8> %b) { -; CHECK-LABEL: test_copy_FPR8_FPR8: -; CHECK: usqadd b1, b0 -; CHECK-NEXT: fmov s0, s1 -entry: - %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %b, <1 x i8> %a) - ret <1 x i8> %vsqadd2.i -} - -declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>) - -define <1 x i16> @test_copy_FPR16_FPR16(<1 x i16> %a, <1 x i16> %b) { -; CHECK-LABEL: test_copy_FPR16_FPR16: -; CHECK: usqadd h1, h0 -; CHECK-NEXT: fmov s0, s1 -entry: - %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %b, <1 x i16> %a) - ret <1 x i16> %vsqadd2.i -} - -define <4 x i16> @concat_vector_v4i16_const() { -; CHECK-LABEL: concat_vector_v4i16_const: -; CHECK: dup {{v[0-9]+}}.4h, wzr - %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer - ret <4 x i16> %r -} - -define <4 x i16> @concat_vector_v4i16_const_one() { -; CHECK-LABEL: concat_vector_v4i16_const_one: -; CHECK: movz {{w[0-9]+}}, #1 -; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}} - %r = shufflevector <1 x i16> , <1 x i16> undef, <4 x i32> zeroinitializer - ret <4 x i16> %r -} - -define <4 x i32> @concat_vector_v4i32_const() { -; CHECK-LABEL: concat_vector_v4i32_const: -; CHECK: dup {{v[0-9]+}}.4s, wzr - %r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer - ret <4 x i32> %r -} - -define <8 x i8> @concat_vector_v8i8_const() { -; CHECK-LABEL: concat_vector_v8i8_const: -; CHECK: dup {{v[0-9]+}}.8b, wzr - %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer - ret <8 x i8> %r -} - -define <8 x i16> @concat_vector_v8i16_const() { -; CHECK-LABEL: concat_vector_v8i16_const: -; CHECK: dup {{v[0-9]+}}.8h, wzr - %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %r -} - -define <8 x i16> @concat_vector_v8i16_const_one() { -; CHECK-LABEL: concat_vector_v8i16_const_one: -; CHECK: movz {{w[0-9]+}}, #1 -; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} - %r = shufflevector <1 x i16> , <1 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %r -} - -define <16 x i8> @concat_vector_v16i8_const() { -; CHECK-LABEL: concat_vector_v16i8_const: -; CHECK: dup {{v[0-9]+}}.16b, wzr - %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %r -} - -define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) { -; CHECK-LABEL: concat_vector_v4i16: -; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] - %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer - ret <4 x i16> %r -} - -define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) { -; CHECK-LABEL: concat_vector_v4i32: -; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] - %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer - ret <4 x i32> %r -} - -define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) { -; CHECK-LABEL: concat_vector_v8i8: -; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[0] - %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer - ret <8 x i8> %r -} - -define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) { -; CHECK-LABEL: concat_vector_v8i16: -; CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] - %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %r -} - -define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) { -; CHECK-LABEL: concat_vector_v16i8: -; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[0] - %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> 
zeroinitializer - ret <16 x i8> %r -} diff --git a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll deleted file mode 100644 index 1256b2b65049..000000000000 --- a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has a separate copy due to intrinsics - -define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) { -; CHECK-LABEL: copyTuple.QPair: -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}] -entry: - %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> , <4 x i32> , i32 0, i32 4) - %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0 - %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> , i32 1, i32 4) - %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0 - ret <4 x i32> %vld1.fca.0.extract -} - -define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) { -; CHECK-LABEL: copyTuple.QTriple: -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}] -entry: - %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> , <4 x i32> %c, <4 x i32> %c, i32 0, i32 4) - %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0 - %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> , <4 x i32> %c, i32 1, i32 4) - %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0 - ret <4 x i32> %vld1.fca.0.extract -} - -define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) { -; CHECK-LABEL: copyTuple.QQuad: -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}] -entry: - %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> , <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4) - %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0 - %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> , <4 x i32> %c, <4 x i32> %c, i32 1, i32 4) - %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0 - ret <4 x i32> %vld1.fca.0.extract -} - -declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll deleted file mode 100644 index 5f1491eb1e90..000000000000 --- a/test/CodeGen/AArch64/neon-crypto.ll +++ /dev/null @@ -1,145 +0,0 @@ -; RUN: llc < %s 
-verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s -; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s -; arm64 has a separate test for this, covering the same features (crypto.ll). N.b. NO-CRYPTO will need porting. - -declare <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32>, <4 x i32>, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32>, <4 x i32>, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha256h(<4 x i32>, <4 x i32>, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32>, <4 x i32>, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha1m(<4 x i32>, i32, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha1p(<4 x i32>, i32, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha1c(<4 x i32>, i32, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32>, <4 x i32>) #1 - -declare <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32>, <4 x i32>) #1 - -declare i32 @llvm.arm.neon.sha1h(i32) #1 - -declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) #1 - -declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) #1 - -declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) #1 - -declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) #1 - -define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) { -; CHECK: test_vaeseq_u8: -; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese -entry: - %aese.i = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key) - ret <16 x i8> %aese.i -} - -define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) { -; CHECK: test_vaesdq_u8: -; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key) - ret <16 x i8> %aesd.i -} - -define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) { -; CHECK: test_vaesmcq_u8: -; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %data) - ret <16 x i8> %aesmc.i -} - -define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) { -; CHECK: test_vaesimcq_u8: -; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -entry: - %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %data) - ret <16 x i8> %aesimc.i -} - -define i32 @test_vsha1h_u32(i32 %hash_e) { -; CHECK: test_vsha1h_u32: -; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}} -entry: - %sha1h1.i = tail call i32 @llvm.arm.neon.sha1h(i32 %hash_e) - ret i32 %sha1h1.i -} - -define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) { -; CHECK: test_vsha1su1q_u32: -; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32> %tw0_3, <4 x i32> %w12_15) - ret <4 x i32> %sha1su12.i -} - -define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) { -; CHECK: test_vsha256su0q_u32: -; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) - ret <4 x i32> %sha256su02.i -} - -define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { -; CHECK: test_vsha1cq_u32: -; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s -entry: - %sha1c1.i = tail call <4 x i32> @llvm.arm.neon.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) - ret <4 x i32> %sha1c1.i -} - -define <4 x i32> @test_vsha1pq_u32(<4 x 
i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { -; CHECK: test_vsha1pq_u32: -; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s -entry: - %sha1p1.i = tail call <4 x i32> @llvm.arm.neon.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) - ret <4 x i32> %sha1p1.i -} - -define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { -; CHECK: test_vsha1mq_u32: -; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s -entry: - %sha1m1.i = tail call <4 x i32> @llvm.arm.neon.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) - ret <4 x i32> %sha1m1.i -} - -define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) { -; CHECK: test_vsha1su0q_u32: -; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) - ret <4 x i32> %sha1su03.i -} - -define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { -; CHECK: test_vsha256hq_u32: -; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s -entry: - %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) - ret <4 x i32> %sha256h3.i -} - -define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { -; CHECK: test_vsha256h2q_u32: -; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s -entry: - %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) - ret <4 x i32> %sha256h23.i -} - -define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { -; CHECK: test_vsha256su1q_u32: -; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) - ret <4 x i32> %sha256su13.i -} - diff --git a/test/CodeGen/AArch64/neon-diagnostics.ll b/test/CodeGen/AArch64/neon-diagnostics.ll index 470bff771e3d..099b6856cec0 100644 --- a/test/CodeGen/AArch64/neon-diagnostics.ll +++ b/test/CodeGen/AArch64/neon-diagnostics.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK: test_vfma_lane_f32: diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll index f16b0365c8ed..f270b54abb46 100644 --- a/test/CodeGen/AArch64/neon-extract.ll +++ b/test/CodeGen/AArch64/neon-extract.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_vext_s8: diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll deleted file mode 100644 index bf43e51cc297..000000000000 --- a/test/CodeGen/AArch64/neon-facge-facgt.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 has duplicates for this functionality in vcmp.ll. 
- -declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>) -declare <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float>, <4 x float>) -declare <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double>, <2 x double>) - -define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: facge_from_intr_v2i32: - %val = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %A, <2 x float> %B) -; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - ret <2 x i32> %val -} -define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: facge_from_intr_v4i32: - %val = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %A, <4 x float> %B) -; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - ret <4 x i32> %val -} - -define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: facge_from_intr_v2i64: - %val = call <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double> %A, <2 x double> %B) -; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - ret <2 x i64> %val -} - -declare <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float>, <2 x float>) -declare <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float>, <4 x float>) -declare <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double>, <2 x double>) - -define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: facgt_from_intr_v2i32: - %val = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %A, <2 x float> %B) -; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - ret <2 x i32> %val -} -define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: facgt_from_intr_v4i32: - %val = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %A, <4 x float> %B) -; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - ret <4 x i32> %val -} - -define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. 
-; CHECK: facgt_from_intr_v2i64: - %val = call <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double> %A, <2 x double> %B) -; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - ret <2 x i64> %val -} - diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll index 9b1657c36f2a..af70302ca939 100644 --- a/test/CodeGen/AArch64/neon-fma.ll +++ b/test/CodeGen/AArch64/neon-fma.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { ;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s diff --git a/test/CodeGen/AArch64/neon-fpround_f128.ll b/test/CodeGen/AArch64/neon-fpround_f128.ll index f6c0d06872db..a93f3f2723c3 100644 --- a/test/CodeGen/AArch64/neon-fpround_f128.ll +++ b/test/CodeGen/AArch64/neon-fpround_f128.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s define <1 x double> @test_fpround_v1f128(<1 x fp128>* %a) { ; CHECK-LABEL: test_fpround_v1f128: diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll deleted file mode 100644 index 199258d60ecb..000000000000 --- a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has a duplicate for all these tests in vsqrt.ll - -; Set of tests for when the intrinsic is used. - -declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: frsqrts v0.2s, v0.2s, v1.2s - %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs) - ret <2 x float> %val -} - -define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: frsqrts v0.4s, v0.4s, v1.4s - %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs) - ret <4 x float> %val -} - -define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: frsqrts v0.2d, v0.2d, v1.2d - %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs) - ret <2 x double> %val -} - -declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. 
-; CHECK: frecps v0.2s, v0.2s, v1.2s - %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs) - ret <2 x float> %val -} - -define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: frecps v0.4s, v0.4s, v1.4s - %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs) - ret <4 x float> %val -} - -define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: frecps v0.2d, v0.2d, v1.2d - %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs) - ret <2 x double> %val -} - diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll deleted file mode 100644 index 4d9ffe5dbd7b..000000000000 --- a/test/CodeGen/AArch64/neon-halving-add-sub.ll +++ /dev/null @@ -1,208 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 duplicates these in vhadd.ll and vhsub.ll - -declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uhadd_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uhadd v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_shadd_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: shadd v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uhadd_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uhadd v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_shadd_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: shadd v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uhadd_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uhadd v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_shadd_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: shadd v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uhadd_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uhadd v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_shadd_v8i16: - %tmp1 = call <8 x i16> 
@llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: shadd v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uhadd_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uhadd v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_shadd_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: shadd v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uhadd_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uhadd v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_shadd_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: shadd v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - - -declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uhsub_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uhsub v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_shsub_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: shsub v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uhsub_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uhsub v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_shsub_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: shsub v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uhsub_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uhsub v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_shsub_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: shsub v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uhsub_v8i16: - %tmp1 = call <8 x i16> 
@llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uhsub v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_shsub_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: shsub v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uhsub_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uhsub v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_shsub_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: shsub v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uhsub_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uhsub v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_shsub_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: shsub v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - diff --git a/test/CodeGen/AArch64/neon-idiv.ll b/test/CodeGen/AArch64/neon-idiv.ll index 9c9758a81f8b..de402c4780be 100644 --- a/test/CodeGen/AArch64/neon-idiv.ll +++ b/test/CodeGen/AArch64/neon-idiv.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu < %s -mattr=+neon | FileCheck %s define <4 x i32> @test1(<4 x i32> %a) { %rem = srem <4 x i32> %a, diff --git a/test/CodeGen/AArch64/neon-load-store-v1i32.ll b/test/CodeGen/AArch64/neon-load-store-v1i32.ll deleted file mode 100644 index 12361ba008db..000000000000 --- a/test/CodeGen/AArch64/neon-load-store-v1i32.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 does not use these pseudo-vectors, and they're not blessed by the PCS. Skipping. 
- -; Test load/store of v1i8, v1i16, v1i32 types can be selected correctly -define void @load.store.v1i8(<1 x i8>* %ptr, <1 x i8>* %ptr2) { -; CHECK-LABEL: load.store.v1i8: -; CHECK: ldr b{{[0-9]+}}, [x{{[0-9]+|sp}}] -; CHECK: str b{{[0-9]+}}, [x{{[0-9]+|sp}}] - %a = load <1 x i8>* %ptr - store <1 x i8> %a, <1 x i8>* %ptr2 - ret void -} - -define void @load.store.v1i16(<1 x i16>* %ptr, <1 x i16>* %ptr2) { -; CHECK-LABEL: load.store.v1i16: -; CHECK: ldr h{{[0-9]+}}, [x{{[0-9]+|sp}}] -; CHECK: str h{{[0-9]+}}, [x{{[0-9]+|sp}}] - %a = load <1 x i16>* %ptr - store <1 x i16> %a, <1 x i16>* %ptr2 - ret void -} - -define void @load.store.v1i32(<1 x i32>* %ptr, <1 x i32>* %ptr2) { -; CHECK-LABEL: load.store.v1i32: -; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+|sp}}] -; CHECK: str s{{[0-9]+}}, [x{{[0-9]+|sp}}] - %a = load <1 x i32>* %ptr - store <1 x i32> %a, <1 x i32>* %ptr2 - ret void -} diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll deleted file mode 100644 index 8642f09c4e28..000000000000 --- a/test/CodeGen/AArch64/neon-max-min-pairwise.ll +++ /dev/null @@ -1,347 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; These duplicate arm64 tests in vmax.ll - -declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: test_smaxp_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: smaxp v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { - %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: umaxp v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_smaxp_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: smaxp v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_umaxp_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: umaxp v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_smaxp_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: smaxp v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_umaxp_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: umaxp v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - - -declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_smaxp_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: smaxp v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 
-} - -define <8 x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_umaxp_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: umaxp v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - - -declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_smaxp_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: smaxp v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_umaxp_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: umaxp v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_smaxp_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: smaxp v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_umaxp_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: umaxp v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. 
-; CHECK: test_sminp_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sminp v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { - %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uminp v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sminp_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sminp v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uminp_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uminp v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sminp_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sminp v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uminp_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uminp v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - - -declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sminp_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sminp v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uminp_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uminp v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - - -declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sminp_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sminp v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uminp_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uminp v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sminp_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sminp v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uminp_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uminp v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - 
-declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fmaxp_v2f32: - %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fmaxp v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fmaxp_v4f32: - %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fmaxp v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fmaxp_v2f64: - %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fmaxp v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fminp_v2f32: - %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fminp v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fminp_v4f32: - %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fminp v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fminp_v2f64: - %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fminp v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fmaxnmp_v2f32: - %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fmaxnmp_v4f32: - %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fmaxnmp_v2f64: - %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fminnmp_v2f32: - %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fminnmp v0.2s, 
v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fminnmp_v4f32: - %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fminnmp v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fminnmp_v2f64: - %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fminnmp v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -define i32 @test_vminv_s32(<2 x i32> %a) { -; CHECK-LABEL: test_vminv_s32 -; CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %1 = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i32> %1, i32 0 - ret i32 %2 -} - -define i32 @test_vminv_u32(<2 x i32> %a) { -; CHECK-LABEL: test_vminv_u32 -; CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %1 = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i32> %1, i32 0 - ret i32 %2 -} - -define i32 @test_vmaxv_s32(<2 x i32> %a) { -; CHECK-LABEL: test_vmaxv_s32 -; CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %1 = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i32> %1, i32 0 - ret i32 %2 -} - -define i32 @test_vmaxv_u32(<2 x i32> %a) { -; CHECK-LABEL: test_vmaxv_u32 -; CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %1 = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i32> %1, i32 0 - ret i32 %2 -} - -declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32>) -declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32>) -declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32>) -declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32>) diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll deleted file mode 100644 index f9a50f4e5d72..000000000000 --- a/test/CodeGen/AArch64/neon-max-min.ll +++ /dev/null @@ -1,311 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; These duplicate tests in arm64's vmax.ll - -declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. 
-; CHECK: test_smax_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: smax v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { - %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: umax v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_smax_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: smax v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_umax_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: umax v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_smax_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: smax v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_umax_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: umax v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - - -declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_smax_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: smax v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_umax_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: umax v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - - -declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_smax_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: smax v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_umax_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: umax v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_smax_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: smax v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_umax_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: umax v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x 
i8>) -declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; Using registers other than v0, v1 are possible, but would be odd. -; CHECK: test_smin_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: smin v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { - %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: umin v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_smin_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: smin v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_umin_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: umin v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_smin_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: smin v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_umin_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: umin v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - - -declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_smin_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: smin v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_umin_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: umin v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - - -declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_smin_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: smin v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_umin_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: umin v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_smin_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: smin v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_umin_v4i32: - 
%tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: umin v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fmax_v2f32: - %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fmax v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fmax_v4f32: - %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fmax v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fmax_v2f64: - %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fmax v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fmin_v2f32: - %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fmin v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fmin_v4f32: - %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fmin v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fmin_v2f64: - %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fmin v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - - -declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fmaxnm_v2f32: - %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fmaxnm v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fmaxnm_v4f32: - %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fmaxnm v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fmaxnm_v2f64: - %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fmaxnm v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} - -declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; CHECK: test_fminnm_v2f32: - %val = call <2 x 
float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) -; CHECK: fminnm v0.2s, v0.2s, v1.2s - ret <2 x float> %val -} - -define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; CHECK: test_fminnm_v4f32: - %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) -; CHECK: fminnm v0.4s, v0.4s, v1.4s - ret <4 x float> %val -} - -define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; CHECK: test_fminnm_v2f64: - %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) -; CHECK: fminnm v0.2d, v0.2d, v1.2d - ret <2 x double> %val -} diff --git a/test/CodeGen/AArch64/neon-misc-scalar.ll b/test/CodeGen/AArch64/neon-misc-scalar.ll deleted file mode 100644 index 3472c5f07bc7..000000000000 --- a/test/CodeGen/AArch64/neon-misc-scalar.ll +++ /dev/null @@ -1,61 +0,0 @@ -;RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 already has copies of these tests (scattered). - -declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>) - -declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>) - -declare <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64>) - -declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) - -declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) { -entry: - ; CHECK: test_vuqadd_s64 - %vuqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b) - ; CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}} - ret <1 x i64> %vuqadd2.i -} - -define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) { -entry: - ; CHECK: test_vsqadd_u64 - %vsqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b) - ; CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}} - ret <1 x i64> %vsqadd2.i -} - -define <1 x i64> @test_vabs_s64(<1 x i64> %a) { - ; CHECK: test_vabs_s64 -entry: - %vabs1.i = tail call <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64> %a) - ; CHECK: abs d{{[0-9]+}}, d{{[0-9]+}} - ret <1 x i64> %vabs1.i -} - -define <1 x i64> @test_vqabs_s64(<1 x i64> %a) { - ; CHECK: test_vqabs_s64 -entry: - %vqabs1.i = tail call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %a) - ; CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}} - ret <1 x i64> %vqabs1.i -} - -define <1 x i64> @test_vqneg_s64(<1 x i64> %a) { - ; CHECK: test_vqneg_s64 -entry: - %vqneg1.i = tail call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %a) - ; CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}} - ret <1 x i64> %vqneg1.i -} - -define <1 x i64> @test_vneg_s64(<1 x i64> %a) { - ; CHECK: test_vneg_s64 -entry: - %sub.i = sub <1 x i64> zeroinitializer, %a - ; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} - ret <1 x i64> %sub.i -} - diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll deleted file mode 100644 index 5682f103e93c..000000000000 --- a/test/CodeGen/AArch64/neon-misc.ll +++ /dev/null @@ -1,2014 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; arm64 has a separate copy of these in aarch64-neon-misc.ll due to different intrinsics. 
- -define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 { -; CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 { -; CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> - ret <16 x i8> %shuffle.i -} - -define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 { -; CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 { -; CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 { -; CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> - ret <16 x i8> %shuffle.i -} - -define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 { -; CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle.i -} - -define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> - ret <2 x float> %shuffle.i -} - -define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> - ret <16 x i8> %shuffle.i -} - -define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> - ret <4 x i32> %shuffle.i -} - -define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 { -; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> - ret <4 x float> %shuffle.i -} - -define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { -; CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b - %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4 - ret <4 x i16> %vpaddl.i -} - -define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { -; CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h - %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4 - ret <2 x i32> %vpaddl1.i -} - -define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { -; CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s - %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4 - ret <1 x i64> %vpaddl1.i -} - -define <4 x i16> @test_vpaddl_u8(<8 x 
i8> %a) #0 { -; CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b - %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4 - ret <4 x i16> %vpaddl.i -} - -define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { -; CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h - %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4 - ret <2 x i32> %vpaddl1.i -} - -define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { -; CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s - %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4 - ret <1 x i64> %vpaddl1.i -} - -define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { -; CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b - %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4 - ret <8 x i16> %vpaddl.i -} - -define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 { -; CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h - %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4 - ret <4 x i32> %vpaddl1.i -} - -define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 { -; CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s - %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4 - ret <2 x i64> %vpaddl1.i -} - -define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 { -; CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b - %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4 - ret <8 x i16> %vpaddl.i -} - -define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 { -; CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h - %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4 - ret <4 x i32> %vpaddl1.i -} - -define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 { -; CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s - %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4 - ret <2 x i64> %vpaddl1.i -} - -define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 { -; CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b - %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4 - ret <4 x i16> %vpadal1.i -} - -define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 { -; CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h - %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4 - ret <2 x i32> %vpadal2.i -} - -define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 { -; CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s - %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4 - ret <1 x i64> %vpadal2.i -} - -define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 { -; CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b - %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4 - ret <4 x i16> %vpadal1.i -} - -define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 { -; CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h - %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4 - ret <2 x i32> %vpadal2.i -} - -define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 { -; CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s - %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4 - ret <1 x i64> %vpadal2.i -} - -define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 { -; CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b - 
%vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4 - ret <8 x i16> %vpadal1.i -} - -define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 { -; CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h - %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4 - ret <4 x i32> %vpadal2.i -} - -define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 { -; CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s - %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4 - ret <2 x i64> %vpadal2.i -} - -define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 { -; CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b - %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4 - ret <8 x i16> %vpadal1.i -} - -define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 { -; CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h - %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4 - ret <4 x i32> %vpadal2.i -} - -define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 { -; CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s - %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4 - ret <2 x i64> %vpadal2.i -} - -define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vqabs.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4 - ret <8 x i8> %vqabs.i -} - -define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vqabs.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4 - ret <16 x i8> %vqabs.i -} - -define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %vqabs1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4 - ret <4 x i16> %vqabs1.i -} - -define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %vqabs1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4 - ret <8 x i16> %vqabs1.i -} - -define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vqabs1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4 - ret <2 x i32> %vqabs1.i -} - -define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vqabs1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4 - ret <4 x i32> %vqabs1.i -} - -define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 { -; CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vqabs1.i = tail call <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64> %a) #4 - ret <2 x i64> %vqabs1.i -} - -define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vqneg.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4 - ret <8 x i8> %vqneg.i -} - -define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vqneg.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4 - ret <16 x i8> %vqneg.i -} - -define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %vqneg1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4 - ret <4 x i16> %vqneg1.i -} - -define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.8h, 
v{{[0-9]+}}.8h - %vqneg1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4 - ret <8 x i16> %vqneg1.i -} - -define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vqneg1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4 - ret <2 x i32> %vqneg1.i -} - -define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vqneg1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4 - ret <4 x i32> %vqneg1.i -} - -define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 { -; CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vqneg1.i = tail call <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64> %a) #4 - ret <2 x i64> %vqneg1.i -} - -define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 { -; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %sub.i = sub <8 x i8> zeroinitializer, %a - ret <8 x i8> %sub.i -} - -define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 { -; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %sub.i = sub <16 x i8> zeroinitializer, %a - ret <16 x i8> %sub.i -} - -define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 { -; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %sub.i = sub <4 x i16> zeroinitializer, %a - ret <4 x i16> %sub.i -} - -define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 { -; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %sub.i = sub <8 x i16> zeroinitializer, %a - ret <8 x i16> %sub.i -} - -define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 { -; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %sub.i = sub <2 x i32> zeroinitializer, %a - ret <2 x i32> %sub.i -} - -define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 { -; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %sub.i = sub <4 x i32> zeroinitializer, %a - ret <4 x i32> %sub.i -} - -define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 { -; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %sub.i = sub <2 x i64> zeroinitializer, %a - ret <2 x i64> %sub.i -} - -define <2 x float> @test_vneg_f32(<2 x float> %a) #0 { -; CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %sub.i = fsub <2 x float> , %a - ret <2 x float> %sub.i -} - -define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 { -; CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %sub.i = fsub <4 x float> , %a - ret <4 x float> %sub.i -} - -define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 { -; CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %sub.i = fsub <2 x double> , %a - ret <2 x double> %sub.i -} - -define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 { -; CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4 - ret <8 x i8> %vabs.i -} - -define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 { -; CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4 - ret <16 x i8> %vabs.i -} - -define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 { -; CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4 - ret <4 x i16> %vabs1.i -} - -define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 { -; CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4 - ret <8 x i16> %vabs1.i -} - -define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 { -; CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4 - ret <2 x i32> %vabs1.i -} - -define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 { -; CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - 
%vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4 - ret <4 x i32> %vabs1.i -} - -define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 { -; CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vabs1.i = tail call <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64> %a) #4 - ret <2 x i64> %vabs1.i -} - -define <2 x float> @test_vabs_f32(<2 x float> %a) #1 { -; CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4 - ret <2 x float> %vabs1.i -} - -define <4 x float> @test_vabsq_f32(<4 x float> %a) #1 { -; CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4 - ret <4 x float> %vabs1.i -} - -define <2 x double> @test_vabsq_f64(<2 x double> %a) #1 { -; CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vabs1.i = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #4 - ret <2 x double> %vabs1.i -} - -define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vuqadd.i = tail call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 - ret <8 x i8> %vuqadd.i -} - -define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vuqadd.i = tail call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 - ret <16 x i8> %vuqadd.i -} - -define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %vuqadd2.i = tail call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4 - ret <4 x i16> %vuqadd2.i -} - -define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %vuqadd2.i = tail call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4 - ret <8 x i16> %vuqadd2.i -} - -define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vuqadd2.i = tail call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4 - ret <2 x i32> %vuqadd2.i -} - -define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vuqadd2.i = tail call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4 - ret <4 x i32> %vuqadd2.i -} - -define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { -; CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vuqadd2.i = tail call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4 - ret <2 x i64> %vuqadd2.i -} - -define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 { -; CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vcls.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4 - ret <8 x i8> %vcls.i -} - -define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 { -; CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vcls.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4 - ret <16 x i8> %vcls.i -} - -define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 { -; CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %vcls1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4 - ret <4 x i16> %vcls1.i -} - -define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 { -; CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %vcls1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4 - ret <8 x i16> %vcls1.i -} - -define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 { -; CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - 
%vcls1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4 - ret <2 x i32> %vcls1.i -} - -define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 { -; CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcls1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4 - ret <4 x i32> %vcls1.i -} - -define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 { -; CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4 - ret <8 x i8> %vclz.i -} - -define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 { -; CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4 - ret <16 x i8> %vclz.i -} - -define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 { -; CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4 - ret <4 x i16> %vclz1.i -} - -define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 { -; CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4 - ret <8 x i16> %vclz1.i -} - -define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 { -; CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4 - ret <2 x i32> %vclz1.i -} - -define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 { -; CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4 - ret <4 x i32> %vclz1.i -} - -define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 { -; CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vctpop.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 - ret <8 x i8> %vctpop.i -} - -define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 { -; CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vctpop.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 - ret <16 x i8> %vctpop.i -} - -define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 { -; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %neg.i = xor <8 x i8> %a, - ret <8 x i8> %neg.i -} - -define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 { -; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %neg.i = xor <16 x i8> %a, - ret <16 x i8> %neg.i -} - -define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 { -; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %neg.i = xor <4 x i16> %a, - ret <4 x i16> %neg.i -} - -define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 { -; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %neg.i = xor <8 x i16> %a, - ret <8 x i16> %neg.i -} - -define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 { -; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %neg.i = xor <2 x i32> %a, - ret <2 x i32> %neg.i -} - -define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 { -; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %neg.i = xor <4 x i32> %a, - ret <4 x i32> %neg.i -} - -define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 { -; CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %vrbit.i = tail call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #4 - ret <8 x i8> %vrbit.i -} - -define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 { -; CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %vrbit.i = tail call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #4 - ret <16 x i8> %vrbit.i -} - -define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 { -; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h - %vmovn.i = trunc <8 x i16> %a to <8 x i8> - ret <8 x i8> %vmovn.i -} - -define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 { -; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vmovn.i = trunc <4 x i32> %a 
to <4 x i16> - ret <4 x i16> %vmovn.i -} - -define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 { -; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vmovn.i = trunc <2 x i64> %a to <2 x i32> - ret <2 x i32> %vmovn.i -} - -define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { -; CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h - %vmovn.i.i = trunc <8 x i16> %b to <8 x i8> - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vmovn.i.i, <16 x i32> - ret <16 x i8> %shuffle.i -} - -define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { -; CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s - %vmovn.i.i = trunc <4 x i32> %b to <4 x i16> - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vmovn.i.i, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { -; CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d - %vmovn.i.i = trunc <2 x i64> %b to <2 x i32> - %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vmovn.i.i, <4 x i32> - ret <4 x i32> %shuffle.i -} - -define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 { -; CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h - %vqdmull1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4 - ret <8 x i8> %vqdmull1.i -} - -define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 { -; CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vqdmull1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4 - ret <4 x i16> %vqdmull1.i -} - -define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 { -; CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vqdmull1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4 - ret <2 x i32> %vqdmull1.i -} - -define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { -; CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h - %vqdmull1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %b) #4 - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqdmull1.i.i, <16 x i32> - ret <16 x i8> %shuffle.i -} - -define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { -; CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s - %vqdmull1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %b) #4 - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqdmull1.i.i, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { -; CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d - %vqdmull1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %b) #4 - %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqdmull1.i.i, <4 x i32> - ret <4 x i32> %shuffle.i -} - -define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 { -; CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h - %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4 - ret <8 x i8> %vqmovn1.i -} - -define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 { -; CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4 - ret <4 x i16> %vqmovn1.i -} - -define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 { -; CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4 - ret <2 x i32> %vqmovn1.i -} - -define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { -; CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h - %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %b) #4 - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> - 
ret <16 x i8> %shuffle.i -} - -define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { -; CHECK: test_vqmovn_high_s32 - %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %b) #4 - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { -; CHECK: test_vqmovn_high_s64 - %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %b) #4 - %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> - ret <4 x i32> %shuffle.i -} - -define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 { -; CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h - %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4 - ret <8 x i8> %vqmovn1.i -} - -define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 { -; CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4 - ret <4 x i16> %vqmovn1.i -} - -define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 { -; CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4 - ret <2 x i32> %vqmovn1.i -} - -define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 { -; CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h - %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %b) #4 - %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> - ret <16 x i8> %shuffle.i -} - -define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 { -; CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s - %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %b) #4 - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 { -; CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d - %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %b) #4 - %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> - ret <4 x i32> %shuffle.i -} - -define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { -; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 - %1 = sext <8 x i8> %a to <8 x i16> - %vshll_n = shl <8 x i16> %1, - ret <8 x i16> %vshll_n -} - -define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { -; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 - %1 = sext <4 x i16> %a to <4 x i32> - %vshll_n = shl <4 x i32> %1, - ret <4 x i32> %vshll_n -} - -define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { -; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 - %1 = sext <2 x i32> %a to <2 x i64> - %vshll_n = shl <2 x i64> %1, - ret <2 x i64> %vshll_n -} - -define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { -; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 - %1 = zext <8 x i8> %a to <8 x i16> - %vshll_n = shl <8 x i16> %1, - ret <8 x i16> %vshll_n -} - -define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { -; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 - %1 = zext <4 x i16> %a to <4 x i32> - %vshll_n = shl <4 x i32> %1, - ret <4 x i32> %vshll_n -} - -define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { -; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 - %1 = zext <2 x i32> %a to <2 x i64> - %vshll_n = shl <2 x i64> %1, - ret <2 x i64> %vshll_n -} - -define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { -; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 - %shuffle.i = 
shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %1 = sext <8 x i8> %shuffle.i to <8 x i16> - %vshll_n = shl <8 x i16> %1, - ret <8 x i16> %vshll_n -} - -define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { -; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %1 = sext <4 x i16> %shuffle.i to <4 x i32> - %vshll_n = shl <4 x i32> %1, - ret <4 x i32> %vshll_n -} - -define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { -; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %1 = sext <2 x i32> %shuffle.i to <2 x i64> - %vshll_n = shl <2 x i64> %1, - ret <2 x i64> %vshll_n -} - -define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { -; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %1 = zext <8 x i8> %shuffle.i to <8 x i16> - %vshll_n = shl <8 x i16> %1, - ret <8 x i16> %vshll_n -} - -define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { -; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %1 = zext <4 x i16> %shuffle.i to <4 x i32> - %vshll_n = shl <4 x i32> %1, - ret <4 x i32> %vshll_n -} - -define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 { -; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %1 = zext <2 x i32> %shuffle.i to <2 x i64> - %vshll_n = shl <2 x i64> %1, - ret <2 x i64> %vshll_n -} - -define <4 x i16> @test_vcvt_f16_f32(<4 x float> %a) #0 { -; CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vcvt1.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4 - ret <4 x i16> %vcvt1.i -} - -define <8 x i16> @test_vcvt_high_f16_f32(<4 x i16> %a, <4 x float> %b) #0 { -; CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s - %vcvt1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %b) #4 - %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vcvt1.i.i, <8 x i32> - ret <8 x i16> %shuffle.i -} - -define <4 x float> @test_vcvt_f32_f16(<4 x i16> %a) #0 { -; CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h - %vcvt1.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %a) #4 - ret <4 x float> %vcvt1.i -} - -define <4 x float> @test_vcvt_high_f32_f16(<8 x i16> %a) #0 { -; CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h - %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %vcvt1.i.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %shuffle.i.i) #4 - ret <4 x float> %vcvt1.i.i -} - -define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 { -; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvt.i = fptrunc <2 x double> %a to <2 x float> - ret <2 x float> %vcvt.i -} - -define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { -; CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d - %vcvt.i.i = fptrunc <2 x double> %b to <2 x float> - %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvt.i.i, <4 x i32> - ret <4 x float> %shuffle.i -} - -define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 { -; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvtx_f32_f641.i = call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %a) #4 - ret <2 x float> %vcvtx_f32_f641.i -} - -define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { -; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d - %vcvtx_f32_f641.i.i = tail call <2 x 
float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %b) #4 - %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> - ret <4 x float> %shuffle.i -} - -define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 { -; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s - %vcvt.i = fpext <2 x float> %a to <2 x double> - ret <2 x double> %vcvt.i -} - -define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 { -; CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s - %shuffle.i.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> - %vcvt.i.i = fpext <2 x float> %shuffle.i.i to <2 x double> - ret <2 x double> %vcvt.i.i -} - -define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { -; CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrndn1.i = tail call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrndn1.i -} - -define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { -; CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrndn1.i = tail call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrndn1.i -} - -define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 { -; CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrndn1.i = tail call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrndn1.i -} - -define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { -; CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrnda1.i = tail call <2 x float> @llvm.round.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrnda1.i -} - -define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { -; CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrnda1.i = tail call <4 x float> @llvm.round.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrnda1.i -} - -define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 { -; CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrnda1.i = tail call <2 x double> @llvm.round.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrnda1.i -} - -define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { -; CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrndp1.i = tail call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrndp1.i -} - -define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { -; CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrndp1.i = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrndp1.i -} - -define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 { -; CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrndp1.i = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrndp1.i -} - -define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { -; CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrndm1.i = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrndm1.i -} - -define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { -; CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrndm1.i = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrndm1.i -} - -define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 { -; CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrndm1.i = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrndm1.i -} - -define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { -; CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrndx1.i = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrndx1.i -} - -define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 { -; CHECK: frintx 
v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrndx1.i = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrndx1.i -} - -define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 { -; CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrndx1.i = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrndx1.i -} - -define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { -; CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrnd1.i = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrnd1.i -} - -define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { -; CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrnd1.i = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrnd1.i -} - -define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 { -; CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrnd1.i = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrnd1.i -} - -define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 { -; CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrndi1.i = tail call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrndi1.i -} - -define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 { -; CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrndi1.i = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrndi1.i -} - -define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 { -; CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrndi1.i = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrndi1.i -} - -define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 { -; CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvt.i = fptosi <2 x float> %a to <2 x i32> - ret <2 x i32> %vcvt.i -} - -define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 { -; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvt.i = fptosi <4 x float> %a to <4 x i32> - ret <4 x i32> %vcvt.i -} - -define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 { -; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = fptosi <2 x double> %a to <2 x i64> - ret <2 x i64> %vcvt.i -} - -define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 { -; CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvt.i = fptoui <2 x float> %a to <2 x i32> - ret <2 x i32> %vcvt.i -} - -define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 { -; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvt.i = fptoui <4 x float> %a to <4 x i32> - ret <4 x i32> %vcvt.i -} - -define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 { -; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = fptoui <2 x double> %a to <2 x i64> - ret <2 x i64> %vcvt.i -} - -define <2 x i64> @test_vcvt_s64_f32(<2 x float> %a) #0 { -; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s -; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = fptosi <2 x float> %a to <2 x i64> - ret <2 x i64> %vcvt.i -} - -define <2 x i64> @test_vcvt_u64_f32(<2 x float> %a) #0 { -; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s -; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = fptoui <2 x float> %a to <2 x i64> - ret <2 x i64> %vcvt.i -} - -define <4 x i16> @test_vcvt_s16_f32(<4 x float> %a) #0 { -; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s -; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vcvt.i = fptosi <4 x float> %a to <4 x i16> - ret <4 x i16> %vcvt.i -} - -define <4 x i16> @test_vcvt_u16_f32(<4 x float> %a) #0 { -; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s -; CHECK: xtn 
v{{[0-9]+}}.4h, v{{[0-9]+}}.4s - %vcvt.i = fptoui <4 x float> %a to <4 x i16> - ret <4 x i16> %vcvt.i -} - -define <2 x i32> @test_vcvt_s32_f64(<2 x double> %a) #0 { -; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d -; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvt.i = fptosi <2 x double> %a to <2 x i32> - ret <2 x i32> %vcvt.i -} - -define <2 x i32> @test_vcvt_u32_f64(<2 x double> %a) #0 { -; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d -; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvt.i = fptoui <2 x double> %a to <2 x i32> - ret <2 x i32> %vcvt.i -} - -define <1 x i8> @test_vcvt_s8_f64(<1 x double> %a) #0 { -; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} -; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}} - %vcvt.i = fptosi <1 x double> %a to <1 x i8> - ret <1 x i8> %vcvt.i -} - -define <1 x i8> @test_vcvt_u8_f64(<1 x double> %a) #0 { -; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} -; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}} - %vcvt.i = fptoui <1 x double> %a to <1 x i8> - ret <1 x i8> %vcvt.i -} - -define <1 x i16> @test_vcvt_s16_f64(<1 x double> %a) #0 { -; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} -; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}} - %vcvt.i = fptosi <1 x double> %a to <1 x i16> - ret <1 x i16> %vcvt.i -} - -define <1 x i16> @test_vcvt_u16_f64(<1 x double> %a) #0 { -; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} -; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}} - %vcvt.i = fptoui <1 x double> %a to <1 x i16> - ret <1 x i16> %vcvt.i -} - -define <1 x i32> @test_vcvt_s32_f64_v1(<1 x double> %a) #0 { -; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}} -; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = fptosi <1 x double> %a to <1 x i32> - ret <1 x i32> %vcvt.i -} - -define <1 x i32> @test_vcvt_u32_f64_v1(<1 x double> %a) #0 { -; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}} -; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = fptoui <1 x double> %a to <1 x i32> - ret <1 x i32> %vcvt.i -} - -define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvtn_s32_f32 -; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtns_f321.i = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtns_f321.i -} - -define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtnq_s32_f32 -; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtns_f321.i = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtns_f321.i -} - -define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtnq_s64_f64 -; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtns_f641.i = call <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtns_f641.i -} - -define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvtn_u32_f32 -; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtnu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtnu_f321.i -} - -define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtnq_u32_f32 -; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtnu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtnu_f321.i -} - -define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtnq_u64_f64 -; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtnu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtnu_f641.i -} - -define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvtp_s32_f32 -; CHECK: 
fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtps_f321.i = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtps_f321.i -} - -define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtpq_s32_f32 -; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtps_f321.i = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtps_f321.i -} - -define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtpq_s64_f64 -; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtps_f641.i = call <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtps_f641.i -} - -define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvtp_u32_f32 -; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtpu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtpu_f321.i -} - -define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtpq_u32_f32 -; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtpu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtpu_f321.i -} - -define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtpq_u64_f64 -; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtpu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtpu_f641.i -} - -define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvtm_s32_f32 -; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtms_f321.i = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtms_f321.i -} - -define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtmq_s32_f32 -; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtms_f321.i = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtms_f321.i -} - -define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtmq_s64_f64 -; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtms_f641.i = call <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtms_f641.i -} - -define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvtm_u32_f32 -; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtmu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtmu_f321.i -} - -define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtmq_u32_f32 -; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtmu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtmu_f321.i -} - -define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtmq_u64_f64 -; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtmu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtmu_f641.i -} - -define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvta_s32_f32 -; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtas_f321.i = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtas_f321.i -} - -define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtaq_s32_f32 -; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtas_f321.i = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtas_f321.i -} - 
-define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtaq_s64_f64 -; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtas_f641.i = call <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtas_f641.i -} - -define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) { -; CHECK-LABEL: test_vcvta_u32_f32 -; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvtau_f321.i = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) - ret <2 x i32> %vcvtau_f321.i -} - -define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) { -; CHECK-LABEL: test_vcvtaq_u32_f32 -; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvtau_f321.i = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) - ret <4 x i32> %vcvtau_f321.i -} - -define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) { -; CHECK-LABEL: test_vcvtaq_u64_f64 -; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvtau_f641.i = call <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double> %a) - ret <2 x i64> %vcvtau_f641.i -} - -define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { -; CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrsqrte1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrsqrte1.i -} - -define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { -; CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrsqrte1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrsqrte1.i -} - -define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 { -; CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrsqrte1.i = tail call <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrsqrte1.i -} - -define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 { -; CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrecpe1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4 - ret <2 x float> %vrecpe1.i -} - -define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 { -; CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrecpe1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4 - ret <4 x float> %vrecpe1.i -} - -define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 { -; CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vrecpe1.i = tail call <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double> %a) #4 - ret <2 x double> %vrecpe1.i -} - -define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 { -; CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vrecpe1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4 - ret <2 x i32> %vrecpe1.i -} - -define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 { -; CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vrecpe1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4 - ret <4 x i32> %vrecpe1.i -} - -define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 { -; CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vsqrt1.i = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #4 - ret <2 x float> %vsqrt1.i -} - -define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 { -; CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vsqrt1.i = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #4 - ret <4 x float> %vsqrt1.i -} - -define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 { -; CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vsqrt1.i = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #4 - ret <2 x double> %vsqrt1.i -} - -define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 { -; CHECK: 
scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvt.i = sitofp <2 x i32> %a to <2 x float> - ret <2 x float> %vcvt.i -} - -define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 { -; CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %vcvt.i = uitofp <2 x i32> %a to <2 x float> - ret <2 x float> %vcvt.i -} - -define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 { -; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvt.i = sitofp <4 x i32> %a to <4 x float> - ret <4 x float> %vcvt.i -} - -define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 { -; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvt.i = uitofp <4 x i32> %a to <4 x float> - ret <4 x float> %vcvt.i -} - -define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 { -; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = sitofp <2 x i64> %a to <2 x double> - ret <2 x double> %vcvt.i -} - -define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 { -; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = uitofp <2 x i64> %a to <2 x double> - ret <2 x double> %vcvt.i -} - -define <2 x float> @test_vcvt_f32_s64(<2 x i64> %a) #0 { -; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d -; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvt.i = sitofp <2 x i64> %a to <2 x float> - ret <2 x float> %vcvt.i -} - -define <2 x float> @test_vcvt_f32_u64(<2 x i64> %a) #0 { -; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d -; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d - %vcvt.i = uitofp <2 x i64> %a to <2 x float> - ret <2 x float> %vcvt.i -} - -define <4 x float> @test_vcvt_f32_s16(<4 x i16> %a) #0 { -; CHECK: sshll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0 -; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvt.i = sitofp <4 x i16> %a to <4 x float> - ret <4 x float> %vcvt.i -} - -define <4 x float> @test_vcvt_f32_u16(<4 x i16> %a) #0 { -; CHECK: ushll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0 -; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %vcvt.i = uitofp <4 x i16> %a to <4 x float> - ret <4 x float> %vcvt.i -} - -define <2 x double> @test_vcvt_f64_s32(<2 x i32> %a) #0 { -; CHECK: sshll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0 -; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = sitofp <2 x i32> %a to <2 x double> - ret <2 x double> %vcvt.i -} - -define <2 x double> @test_vcvt_f64_u32(<2 x i32> %a) #0 { -; CHECK: ushll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0 -; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %vcvt.i = uitofp <2 x i32> %a to <2 x double> - ret <2 x double> %vcvt.i -} - -define <1 x double> @test_vcvt_f64_s8(<1 x i8> %a) #0 { -; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0] -; CHECK: sxtb w{{[0-9]+}}, w{{[0-9]+}} -; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = sitofp <1 x i8> %a to <1 x double> - ret <1 x double> %vcvt.i -} - -define <1 x double> @test_vcvt_f64_u8(<1 x i8> %a) #0 { -; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0] -; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xff -; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = uitofp <1 x i8> %a to <1 x double> - ret <1 x double> %vcvt.i -} - -define <1 x double> @test_vcvt_f64_s16(<1 x i16> %a) #0 { -; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0] -; CHECK: sxth w{{[0-9]+}}, w{{[0-9]+}} -; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = sitofp <1 x i16> %a to <1 x double> - ret <1 x double> %vcvt.i -} - -define <1 x double> @test_vcvt_f64_u16(<1 x i16> %a) #0 { -; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0] -; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xffff -; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = uitofp <1 x i16> %a to <1 x double> - ret <1 x double> %vcvt.i -} - -define <1 x double> 
@test_vcvt_f64_s32_v1(<1 x i32> %a) #0 { -; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}} -; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = sitofp <1 x i32> %a to <1 x double> - ret <1 x double> %vcvt.i -} - -define <1 x double> @test_vcvt_f64_u32_v1(<1 x i32> %a) #0 { -; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}} -; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}} - %vcvt.i = uitofp <1 x i32> %a to <1 x double> - ret <1 x double> %vcvt.i -} - -declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2 - -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2 - -declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #2 - -declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) #2 - -declare <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double>) #2 - -declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) #2 - -declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) #2 - -declare <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double>) #2 - -declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2 - -declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2 - -declare <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float>) - -declare <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double>) - -declare <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float>) - -declare <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float>) - -declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3 - -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #3 - -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #3 - -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #3 - -declare <2 x double> @llvm.rint.v2f64(<2 x double>) #3 - -declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.rint.v2f32(<2 x float>) #3 - -declare <2 x double> @llvm.floor.v2f64(<2 x double>) #3 - -declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.floor.v2f32(<2 x float>) #3 - -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #3 - -declare <4 x float> 
@llvm.ceil.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.ceil.v2f32(<2 x float>) #3 - -declare <2 x double> @llvm.round.v2f64(<2 x double>) #3 - -declare <4 x float> @llvm.round.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.round.v2f32(<2 x float>) #3 - -declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) #2 - -declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2 - -declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2 - -declare <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double>) #2 - -declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2 - -declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) #2 - -declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) #2 - -declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) #2 - -declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) #2 - -declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) #2 - -declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) #2 - -declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) #2 - -declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) #2 - -declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) #2 - -declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #2 - -declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) #2 - -declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #2 - -declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) #2 - -declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #2 - -declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) #2 - -declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) #2 - -declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) #2 - -declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) #2 - -declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) #2 - -declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) #2 - -declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) #2 - -declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) #2 - -declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) #2 - -declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) #2 - -declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) #2 - -declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) #2 - -declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) #2 - -declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) #2 - -declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) #2 - -declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) #2 - -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #3 - -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #3 - -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #3 - -declare <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64>) #2 - -declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) #2 - -declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) #2 - -declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) #2 - -declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) #2 - -declare <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64>) #2 - -declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) #2 - -declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) #2 - -declare <4 x i16> 
@llvm.arm.neon.vqneg.v4i16(<4 x i16>) #2 - -declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) #2 - -declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) #2 - -declare <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64>) #2 - -declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) #2 - -declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) #2 - -declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) #2 - -declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) #2 - -declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) #2 - -declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) #2 - -declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) #2 - -declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) #2 - -declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) #2 - -declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) #2 - -declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) #2 - -declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) #2 - -declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) #2 - -declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) #2 - -declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) #2 - -declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) #2 - -declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) #2 - -declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) #2 - -declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) #2 - -declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) #2 - -declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) #2 - -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) #2 - -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) #2 - - -define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvt_s64_f64 -; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}} - %1 = fptosi <1 x double> %a to <1 x i64> - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvt_u64_f64 -; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}} - %1 = fptoui <1 x double> %a to <1 x i64> - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvtn_s64_f64 -; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvtn_u64_f64 -; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvtp_s64_f64 -; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> 
@llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvtp_u64_f64 -; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvtm_s64_f64 -; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvtm_u64_f64 -; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvta_s64_f64 -; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvta_u64_f64 -; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}} - %1 = call <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double> %a) - ret <1 x i64> %1 -} - -define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) { -; CHECK-LABEL: test_vcvt_f64_s64 -; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}} - %1 = sitofp <1 x i64> %a to <1 x double> - ret <1 x double> %1 -} - -define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) { -; CHECK-LABEL: test_vcvt_f64_u64 -; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}} - %1 = uitofp <1 x i64> %a to <1 x double> - ret <1 x double> %1 -} - -declare <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double>) -declare <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double>) - -define <1 x double> @test_vrndn_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrndn_f64 -; CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrnda_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrnda_f64 -; CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.round.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrndp_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrndp_f64 -; CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.ceil.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrndm_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrndm_f64 -; CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.floor.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrndx_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrndx_f64 -; CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.rint.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrnd_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrnd_f64 -; CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.trunc.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrndi_f64(<1 x 
double> %a) { -; CHECK-LABEL: test_vrndi_f64 -; CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>) -declare <1 x double> @llvm.trunc.v1f64(<1 x double>) -declare <1 x double> @llvm.rint.v1f64(<1 x double>) -declare <1 x double> @llvm.floor.v1f64(<1 x double>) -declare <1 x double> @llvm.ceil.v1f64(<1 x double>) -declare <1 x double> @llvm.round.v1f64(<1 x double>) -declare <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double>) - -define <1 x double> @test_vrsqrte_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrsqrte_f64 -; CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrecpe_f64(<1 x double> %a) { -; CHECK-LABEL: test_vrecpe_f64 -; CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vsqrt_f64(<1 x double> %a) { -; CHECK-LABEL: test_vsqrt_f64 -; CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a) - ret <1 x double> %1 -} - -define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vrecps_f64 -; CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) { -; CHECK-LABEL: test_vrsqrts_f64 -; CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = tail call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %a, <1 x double> %b) - ret <1 x double> %1 -} - -declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>) -declare <1 x double> @llvm.sqrt.v1f64(<1 x double>) -declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>) -declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>) - -define i64 @test_vaddlv_s32(<2 x i32> %a) { -; CHECK-LABEL: test_vaddlv_s32 -; CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s - %1 = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i64> %1, i32 0 - ret i64 %2 -} - -define i64 @test_vaddlv_u32(<2 x i32> %a) { -; CHECK-LABEL: test_vaddlv_u32 -; CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s - %1 = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32> %a) - %2 = extractelement <1 x i64> %1, i32 0 - ret i64 %2 -} - -declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32>) -declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32>) diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll index 37daadef0b28..71bb0e70abfa 100644 --- a/test/CodeGen/AArch64/neon-mla-mls.ll +++ b/test/CodeGen/AArch64/neon-mla-mls.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll index 7eadde481613..40649aeb1b8e 100644 --- a/test/CodeGen/AArch64/neon-mov.ll +++ b/test/CodeGen/AArch64/neon-mov.ll @@ -1,5 +1,4 @@ -; RUN: 
llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK define <8 x i8> @movi8b() { ; CHECK-LABEL: movi8b: @@ -15,87 +14,75 @@ define <16 x i8> @movi16b() { define <2 x i32> @movi2s_lsl0() { ; CHECK-LABEL: movi2s_lsl0: -; CHECK-AARCH64: movi {{v[0-9]+}}.2s, #0xff -; CHECK-ARM64: movi {{d[0-9]+}}, #0x0000ff000000ff +; CHECK: movi {{d[0-9]+}}, #0x0000ff000000ff ret <2 x i32> < i32 255, i32 255 > } define <2 x i32> @movi2s_lsl8() { ; CHECK-LABEL: movi2s_lsl8: -; CHECK-AARCH64: movi {{v[0-9]+}}.2s, #0xff, lsl #8 -; CHECK-ARM64: movi {{d[0-9]+}}, #0x00ff000000ff00 +; CHECK: movi {{d[0-9]+}}, #0x00ff000000ff00 ret <2 x i32> < i32 65280, i32 65280 > } define <2 x i32> @movi2s_lsl16() { ; CHECK-LABEL: movi2s_lsl16: -; CHECK-AARCH64: movi {{v[0-9]+}}.2s, #0xff, lsl #16 -; CHECK-ARM64: movi {{d[0-9]+}}, #0xff000000ff0000 +; CHECK: movi {{d[0-9]+}}, #0xff000000ff0000 ret <2 x i32> < i32 16711680, i32 16711680 > } define <2 x i32> @movi2s_lsl24() { ; CHECK-LABEL: movi2s_lsl24: -; CHECK-AARCH64: movi {{v[0-9]+}}.2s, #0xff, lsl #24 -; CHECK-ARM64: movi {{d[0-9]+}}, #0xff000000ff000000 +; CHECK: movi {{d[0-9]+}}, #0xff000000ff000000 ret <2 x i32> < i32 4278190080, i32 4278190080 > } define <4 x i32> @movi4s_lsl0() { ; CHECK-LABEL: movi4s_lsl0: -; CHECK-AARCH64: movi {{v[0-9]+}}.4s, #0xff -; CHECK-ARM64: movi {{v[0-9]+}}.2d, #0x0000ff000000ff +; CHECK: movi {{v[0-9]+}}.2d, #0x0000ff000000ff ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 > } define <4 x i32> @movi4s_lsl8() { ; CHECK-LABEL: movi4s_lsl8: -; CHECK-AARCH64: movi {{v[0-9]+}}.4s, #0xff, lsl #8 -; CHECK-ARM64: movi {{v[0-9]+}}.2d, #0x00ff000000ff00 +; CHECK: movi {{v[0-9]+}}.2d, #0x00ff000000ff00 ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 > } define <4 x i32> @movi4s_lsl16() { ; CHECK-LABEL: movi4s_lsl16: -; CHECK-AARCH64: movi {{v[0-9]+}}.4s, #0xff, lsl #16 -; CHECK-ARM64: movi {{v[0-9]+}}.2d, #0xff000000ff0000 +; CHECK: movi {{v[0-9]+}}.2d, #0xff000000ff0000 ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 > } define <4 x i32> @movi4s_lsl24() { ; CHECK-LABEL: movi4s_lsl24: -; CHECK-AARCH64: movi {{v[0-9]+}}.4s, #0xff, lsl #24 -; CHECK-ARM64: movi {{v[0-9]+}}.2d, #0xff000000ff000000 +; CHECK: movi {{v[0-9]+}}.2d, #0xff000000ff000000 ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 > } define <4 x i16> @movi4h_lsl0() { ; CHECK-LABEL: movi4h_lsl0: -; CHECK-AARCH64: movi {{v[0-9]+}}.4h, #0xff -; CHECK-ARM64: movi {{d[0-9]+}}, #0xff00ff00ff00ff +; CHECK: movi {{d[0-9]+}}, #0xff00ff00ff00ff ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 > } define <4 x i16> @movi4h_lsl8() { ; CHECK-LABEL: movi4h_lsl8: -; CHECK-AARCH64: movi {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8 -; CHECK-ARM64: movi d0, #0xff00ff00ff00ff00 +; CHECK: movi d0, #0xff00ff00ff00ff00 ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 > } define <8 x i16> @movi8h_lsl0() { ; CHECK-LABEL: movi8h_lsl0: -; CHECK-AARCH64: movi {{v[0-9]+}}.8h, #{{0xff|255}} -; CHECK-ARM64: movi v0.2d, #0xff00ff00ff00ff +; CHECK: movi v0.2d, #0xff00ff00ff00ff ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > } define <8 x i16> @movi8h_lsl8() { ; 
CHECK-LABEL: movi8h_lsl8: -; CHECK-AARCH64: movi {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8 -; CHECK-ARM64: movi v0.2d, #0xff00ff00ff00ff00 +; CHECK: movi v0.2d, #0xff00ff00ff00ff00 ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > } @@ -177,30 +164,26 @@ define <8 x i16> @mvni8h_lsl8() { define <2 x i32> @movi2s_msl8(<2 x i32> %a) { ; CHECK-LABEL: movi2s_msl8: -; CHECK-AARCH64: movi {{v[0-9]+}}.2s, #0xff, msl #8 -; CHECK-ARM64: movi {{d[0-9]+}}, #0x00ffff0000ffff +; CHECK: movi {{d[0-9]+}}, #0x00ffff0000ffff ret <2 x i32> < i32 65535, i32 65535 > } define <2 x i32> @movi2s_msl16() { ; CHECK-LABEL: movi2s_msl16: -; CHECK-AARCH64: movi {{v[0-9]+}}.2s, #0xff, msl #16 -; CHECK-ARM64: movi d0, #0xffffff00ffffff +; CHECK: movi d0, #0xffffff00ffffff ret <2 x i32> < i32 16777215, i32 16777215 > } define <4 x i32> @movi4s_msl8() { ; CHECK-LABEL: movi4s_msl8: -; CHECK-AARCH64: movi {{v[0-9]+}}.4s, #0xff, msl #8 -; CHECK-ARM64: movi v0.2d, #0x00ffff0000ffff +; CHECK: movi v0.2d, #0x00ffff0000ffff ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 > } define <4 x i32> @movi4s_msl16() { ; CHECK-LABEL: movi4s_msl16: -; CHECK-AARCH64: movi {{v[0-9]+}}.4s, #0xff, msl #16 -; CHECK-ARM64: movi v0.2d, #0xffffff00ffffff +; CHECK: movi v0.2d, #0xffffff00ffffff ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 > } diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll deleted file mode 100644 index 869bd445c718..000000000000 --- a/test/CodeGen/AArch64/neon-mul-div.ll +++ /dev/null @@ -1,754 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has its own copy of this because of the intrinsics - -define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %tmp3 = mul <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b - %tmp3 = mul <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %tmp3 = mul <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h - %tmp3 = mul <8 x i16> %A, %B; - ret <8 x i16> %tmp3 -} - -define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = mul <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = mul <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) { -;CHECK-LABEL: mul1xi64: -;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} - %tmp3 = mul <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) { -;CHECK-LABEL: mul2xi64: -;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} -;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} - %tmp3 = mul <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - - define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = fmul <2 x float> %A, %B; - ret <2 x float> %tmp3 -} - -define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fmul 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = fmul <4 x float> %A, %B; - ret <4 x float> %tmp3 -} -define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %tmp3 = fmul <2 x double> %A, %B; - ret <2 x double> %tmp3 -} - - - define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) { -;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %tmp3 = fdiv <2 x float> %A, %B; - ret <2 x float> %tmp3 -} - -define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) { -;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s - %tmp3 = fdiv <4 x float> %A, %B; - ret <4 x float> %tmp3 -} -define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) { -;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %tmp3 = fdiv <2 x double> %A, %B; - ret <2 x double> %tmp3 -} - -define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <1 x i8> %A, %B; - ret <1 x i8> %tmp3 -} - -define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <1 x i16> %A, %B; - ret <1 x i16> %tmp3 -} - -define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <8 x i16> %A, %B; - ret <8 x i16> %tmp3 
-} - -define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <1 x i32> %A, %B; - ret <1 x i32> %tmp3 -} - -define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = sdiv <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = sdiv <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = sdiv <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - -define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <1 x i8> %A, %B; - ret <1 x i8> %tmp3 -} - -define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <1 x i16> %A, %B; - ret <1 x i16> %tmp3 -} - -define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <8 x i16> %A, %B; - ret <8 x i16> %tmp3 -} - -define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <1 x i32> %A, %B; - ret <1 x i32> %tmp3 -} - -define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = udiv <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = udiv <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = udiv <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - -define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <1 x i8> %A, %B; - ret <1 x i8> %tmp3 -} - -define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, 
{{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <1 x i16> %A, %B; - ret <1 x i16> %tmp3 -} - -define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <8 x i16> %A, %B; - ret <8 x i16> %tmp3 -} - -define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <1 x i32> %A, %B; - ret <1 x i32> %tmp3 -} - -define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> 
@srem4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = srem <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = srem <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = srem <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - -define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <1 x i8> %A, %B; - ret <1 x i8> %tmp3 -} - -define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <8 x i8> %A, %B; - ret <8 x i8> %tmp3 -} - -define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: 
udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <16 x i8> %A, %B; - ret <16 x i8> %tmp3 -} - -define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <1 x i16> %A, %B; - ret <1 x i16> %tmp3 -} - -define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <4 x i16> %A, %B; - ret <4 x i16> %tmp3 -} - -define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <8 x i16> %A, %B; - ret <8 x i16> %tmp3 -} - -define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <1 x i32> %A, %B; - ret <1 x i32> %tmp3 -} - -define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <2 x i32> %A, %B; - ret <2 x i32> %tmp3 -} - -define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) { -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, 
{{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} -;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} - %tmp3 = urem <4 x i32> %A, %B; - ret <4 x i32> %tmp3 -} - -define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = urem <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) { -;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} -;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} - %tmp3 = urem <2 x i64> %A, %B; - ret <2 x i64> %tmp3 -} - -define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) { -; CHECK: bl fmodf -; CHECK: bl fmodf - %tmp3 = frem <2 x float> %A, %B; - ret <2 x float> %tmp3 -} - -define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) { -; CHECK: bl fmodf -; CHECK: bl fmodf -; CHECK: bl fmodf -; CHECK: bl fmodf - %tmp3 = frem <4 x float> %A, %B; - ret <4 x float> %tmp3 -} - -define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) { -; CHECK: bl fmod - %tmp3 = frem <1 x double> %A, %B; - ret <1 x double> %tmp3 -} - -define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) { -; CHECK: bl fmod -; CHECK: bl fmod - %tmp3 = frem <2 x double> %A, %B; - ret <2 x double> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) -declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) - -define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: poly_mulv8i8: - %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: pmul v0.8b, v0.8b, v1.8b - ret <8 x i8> %prod -} - -define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: poly_mulv16i8: - %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: pmul v0.16b, v0.16b, v1.16b - ret <16 x i8> %prod -} - -declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sqdmulh_v4i16: - %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sqdmulh v0.4h, v0.4h, v1.4h - ret <4 x i16> %prod -} - -define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sqdmulh_v8i16: - %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sqdmulh v0.8h, v0.8h, v1.8h - ret <8 x i16> %prod -} - -define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sqdmulh_v2i32: - %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sqdmulh v0.2s, v0.2s, v1.2s - ret <2 x i32> %prod -} - -define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sqdmulh_v4i32: - %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sqdmulh v0.4s, v0.4s, v1.4s - ret <4 x i32> %prod -} - -declare 
<4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sqrdmulh_v4i16: - %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h - ret <4 x i16> %prod -} - -define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sqrdmulh_v8i16: - %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h - ret <8 x i16> %prod -} - -define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sqrdmulh_v2i32: - %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s - ret <2 x i32> %prod -} - -define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sqrdmulh_v4i32: - %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s - ret <4 x i32> %prod -} - -declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) -declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) - -define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: fmulx v0.2s, v0.2s, v1.2s - %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) - ret <2 x float> %val -} - -define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. -; CHECK: fmulx v0.4s, v0.4s, v1.4s - %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) - ret <4 x float> %val -} - -define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { -; Using registers other than v0, v1 and v2 are possible, but would be odd. 
-; CHECK: fmulx v0.2d, v0.2d, v1.2d - %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) - ret <2 x double> %val -} - -define <1 x i8> @test_mul_v1i8(<1 x i8> %a, <1 x i8> %b) { -;CHECK-LABEL: test_mul_v1i8: -;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b - %c = mul <1 x i8> %a, %b - ret <1 x i8> %c -} - -define <1 x i16> @test_mul_v1i16(<1 x i16> %a, <1 x i16> %b) { -;CHECK-LABEL: test_mul_v1i16: -;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h - %c = mul <1 x i16> %a, %b - ret <1 x i16> %c -} - -define <1 x i32> @test_mul_v1i32(<1 x i32> %a, <1 x i32> %b) { -;CHECK-LABEL: test_mul_v1i32: -;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s - %c = mul <1 x i32> %a, %b - ret <1 x i32> %c -} diff --git a/test/CodeGen/AArch64/neon-or-combine.ll b/test/CodeGen/AArch64/neon-or-combine.ll index e8da72f42cd5..260f6935ddef 100644 --- a/test/CodeGen/AArch64/neon-or-combine.ll +++ b/test/CodeGen/AArch64/neon-or-combine.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s ; Check that the DAGCombiner does not crash with an assertion failure ; when performing a target specific combine to simplify a 'or' dag node diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll index 99507cecf1c8..4f8571db7480 100644 --- a/test/CodeGen/AArch64/neon-perm.ll +++ b/test/CodeGen/AArch64/neon-perm.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK %struct.int8x8x2_t = type { [2 x <8 x i8>] } %struct.int16x4x2_t = type { [2 x <4 x i16>] } @@ -54,8 +53,7 @@ entry: define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vuzp1_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -71,8 +69,7 @@ entry: define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vuzp1q_s64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -112,8 +109,7 @@ entry: define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vuzp1_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -129,8 +125,7 @@ entry: define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vuzp1q_u64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, 
{{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -138,8 +133,7 @@ entry: define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vuzp1_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> ret <2 x float> %shuffle.i @@ -155,8 +149,7 @@ entry: define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: test_vuzp1q_f64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle.i @@ -228,8 +221,7 @@ entry: define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vuzp2_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -245,9 +237,7 @@ entry: define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vuzp2q_s64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-AARCH64-NEXT: mov {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -287,8 +277,7 @@ entry: define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vuzp2_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -304,9 +293,7 @@ entry: define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vuzp2q_u64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-AARCH64-NEXT: mov {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -314,8 +301,7 @@ entry: define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vuzp2_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> ret <2 x float> %shuffle.i @@ -331,9 +317,7 @@ entry: define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: test_vuzp2q_f64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-AARCH64-NEXT: mov {{v[0-9]+}}.16b, {{v[0-9]+}}.16b -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x double> %a, <2 x 
double> %b, <2 x i32> ret <2 x double> %shuffle.i @@ -405,8 +389,7 @@ entry: define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vzip1_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -422,8 +405,7 @@ entry: define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vzip1q_s64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -463,8 +445,7 @@ entry: define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vzip1_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -480,8 +461,7 @@ entry: define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vzip1q_u64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -489,8 +469,7 @@ entry: define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vzip1_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> ret <2 x float> %shuffle.i @@ -506,8 +485,7 @@ entry: define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: test_vzip1q_f64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle.i @@ -579,8 +557,7 @@ entry: define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vzip2_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -596,8 +573,7 @@ entry: define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vzip2q_s64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -637,8 +613,7 @@ entry: define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vzip2_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -654,8 +629,7 @@ entry: define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vzip2q_u64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -663,8 +637,7 @@ entry: define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vzip2_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> ret <2 x float> %shuffle.i @@ -680,8 +653,7 @@ entry: define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: test_vzip2q_f64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle.i @@ -753,8 +725,7 @@ entry: define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vtrn1_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -770,8 +741,7 @@ entry: define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vtrn1q_s64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -811,8 +781,7 @@ entry: define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vtrn1_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -828,8 +797,7 @@ entry: define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vtrn1q_u64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -837,8 +805,7 @@ entry: define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vtrn1_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> ret <2 x float> %shuffle.i @@ -854,8 +821,7 @@ entry: define <2 x double> @test_vtrn1q_f64(<2 x 
double> %a, <2 x double> %b) { ; CHECK-LABEL: test_vtrn1q_f64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle.i @@ -927,8 +893,7 @@ entry: define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vtrn2_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -944,8 +909,7 @@ entry: define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vtrn2q_s64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -985,8 +949,7 @@ entry: define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vtrn2_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> ret <2 x i32> %shuffle.i @@ -1002,8 +965,7 @@ entry: define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vtrn2q_u64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle.i @@ -1011,8 +973,7 @@ entry: define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vtrn2_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> ret <2 x float> %shuffle.i @@ -1028,8 +989,7 @@ entry: define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: test_vtrn2q_f64: -; CHECK-AARCH64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] -; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle.i @@ -2534,10 +2494,8 @@ entry: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vuzp_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -2572,10 +2530,8 @@ entry: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x 
i32> %b) { ; CHECK-LABEL: test_vuzp_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -2586,10 +2542,8 @@ entry: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vuzp_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> %vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> @@ -2756,10 +2710,8 @@ entry: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vzip_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -2794,10 +2746,8 @@ entry: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vzip_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -2808,10 +2758,8 @@ entry: define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vzip_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> %vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> @@ -2978,10 +2926,8 @@ entry: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vtrn_s32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -3016,10 +2962,8 @@ entry: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vtrn_u32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> @@ -3030,10 +2974,8 @@ entry: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: test_vtrn_f32: -; CHECK-AARCH64: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] -; CHECK-AARCH64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] -; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s entry: %vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> %vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> @@ -3183,7 +3125,4 @@ define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) { %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 ret %struct.uint8x8x2_t %.fca.0.1.insert -; CHECK-AARCH64: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-AARCH64-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b -; CHECK-AARCH64-NEXT: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b } diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll deleted file mode 100644 index 5c99ba1e4d4f..000000000000 --- a/test/CodeGen/AArch64/neon-rounding-halving-add.ll +++ /dev/null @@ -1,106 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Just intrinsic calls: arm64 has similar in vhadd.ll - -declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_urhadd_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: urhadd v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_srhadd_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: srhadd v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_urhadd_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: urhadd v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_srhadd_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: srhadd v0.16b, v0.16b, 
v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_urhadd_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: urhadd v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_srhadd_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: srhadd v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_urhadd_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: urhadd v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_srhadd_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: srhadd v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_urhadd_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: urhadd v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_srhadd_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: srhadd v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_urhadd_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: urhadd v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_srhadd_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: srhadd v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - - diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll deleted file mode 100644 index 692df988cfbb..000000000000 --- a/test/CodeGen/AArch64/neon-rounding-shift.ll +++ /dev/null @@ -1,122 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Just intrinsic calls: arm64 has similar in vshift.ll - -declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_urshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: urshl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_srshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: srshl v0.8b, 
v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_urshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: urshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_srshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: srshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_urshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: urshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_srshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: srshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_urshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: urshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_srshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: srshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_urshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: urshl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_srshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: srshl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_urshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: urshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_srshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: srshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_urshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> 
%lhs, <2 x i64> %rhs) -; CHECK: urshl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - -define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_srshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: srshl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll deleted file mode 100644 index 996835bfc5ac..000000000000 --- a/test/CodeGen/AArch64/neon-saturating-add-sub.ll +++ /dev/null @@ -1,241 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Just intrinsic calls: arm64 has similar in vqadd.ll -declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uqadd_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uqadd v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_sqadd_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sqadd v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uqadd_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uqadd v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sqadd_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sqadd v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uqadd_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uqadd v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sqadd_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sqadd v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uqadd_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uqadd v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sqadd_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sqadd v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uqadd_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; 
CHECK: uqadd v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sqadd_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sqadd v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uqadd_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uqadd v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sqadd_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sqadd v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - - - -declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_uqadd_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: uqadd v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - -define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_sqadd_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: sqadd v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - -declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uqsub_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uqsub v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_sqsub_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sqsub v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uqsub_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uqsub v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sqsub_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sqsub v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uqsub_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uqsub v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sqsub_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sqsub v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> 
@llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uqsub_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uqsub v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sqsub_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sqsub v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uqsub_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uqsub v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sqsub_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sqsub v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uqsub_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uqsub v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sqsub_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sqsub v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_uqsub_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: uqsub v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - -define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_sqsub_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: sqsub v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll deleted file mode 100644 index a59eebd55d38..000000000000 --- a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll +++ /dev/null @@ -1,122 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Just intrinsic calls: arm64 has similar in vshift.ll - -declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uqrshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uqrshl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_sqrshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sqrshl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) -declare 
<16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uqrshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uqrshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sqrshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sqrshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uqrshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uqrshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sqrshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sqrshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uqrshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uqrshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sqrshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sqrshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uqrshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uqrshl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sqrshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sqrshl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uqrshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uqrshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sqrshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sqrshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_uqrshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: uqrshl v0.2d, v0.2d, v1.2d - ret <2 
x i64> %tmp1 -} - -define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_sqrshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: sqrshl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll deleted file mode 100644 index 035740cba5d3..000000000000 --- a/test/CodeGen/AArch64/neon-saturating-shift.ll +++ /dev/null @@ -1,122 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Just intrinsic calls: arm64 has similar in vshift.ll - -declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uqshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: uqshl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_sqshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sqshl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_uqshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: uqshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sqshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sqshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_uqshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: uqshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sqshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sqshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_uqshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: uqshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sqshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sqshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) -declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_uqshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: uqshl v0.2s, v0.2s, 
v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sqshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sqshl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_uqshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: uqshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sqshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sqshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_uqshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: uqshl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - -define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_sqshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: sqshl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - diff --git a/test/CodeGen/AArch64/neon-scalar-abs.ll b/test/CodeGen/AArch64/neon-scalar-abs.ll deleted file mode 100644 index bb351ab86fcd..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-abs.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has tests for i64 versions, uses different approach for others. 
- -define i64 @test_vabsd_s64(i64 %a) { -; CHECK: test_vabsd_s64 -; CHECK: abs {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vabs.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vabs1.i = tail call <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64> %vabs.i) - %0 = extractelement <1 x i64> %vabs1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64>) - -define i8 @test_vqabsb_s8(i8 %a) { -; CHECK: test_vqabsb_s8 -; CHECK: sqabs {{b[0-9]+}}, {{b[0-9]+}} -entry: - %vqabs.i = insertelement <1 x i8> undef, i8 %a, i32 0 - %vqabs1.i = call <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8> %vqabs.i) - %0 = extractelement <1 x i8> %vqabs1.i, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8>) - -define i16 @test_vqabsh_s16(i16 %a) { -; CHECK: test_vqabsh_s16 -; CHECK: sqabs {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vqabs.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vqabs1.i = call <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16> %vqabs.i) - %0 = extractelement <1 x i16> %vqabs1.i, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16>) - -define i32 @test_vqabss_s32(i32 %a) { -; CHECK: test_vqabss_s32 -; CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vqabs.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqabs1.i = call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %vqabs.i) - %0 = extractelement <1 x i32> %vqabs1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) - -define i64 @test_vqabsd_s64(i64 %a) { -; CHECK: test_vqabsd_s64 -; CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vqabs.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqabs1.i = call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %vqabs.i) - %0 = extractelement <1 x i64> %vqabs1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll deleted file mode 100644 index 7e262cb8bdb6..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-add-sub.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has a copy of the key parts in AdvSIMD-Scalar.ll - -define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %tmp3 = add <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) { -;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %tmp3 = sub <1 x i64> %A, %B; - ret <1 x i64> %tmp3 -} - -declare <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_add_v1i64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uadd_v1i64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -declare <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sub_v1i64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs) -; 
CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_usub_v1i64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - - - diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll index f5636db5e142..32f59626b381 100644 --- a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll +++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s declare float @llvm.fma.f32(float, float, float) declare double @llvm.fma.f64(double, double, double) diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll deleted file mode 100644 index ff2941325208..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll +++ /dev/null @@ -1,124 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; arm64 has separate copy due to intrinsics (aarch64-neon-scalar-by-elem-mul.ll) -define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) { - ; CHECK: test_fmul_lane_ss2S - ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] - %tmp1 = extractelement <2 x float> %v, i32 1 - %tmp2 = fmul float %a, %tmp1; - ret float %tmp2; -} - -define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) { - ; CHECK: test_fmul_lane_ss2S_swap - ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] - %tmp1 = extractelement <2 x float> %v, i32 1 - %tmp2 = fmul float %tmp1, %a; - ret float %tmp2; -} - - -define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) { - ; CHECK: test_fmul_lane_ss4S - ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] - %tmp1 = extractelement <4 x float> %v, i32 3 - %tmp2 = fmul float %a, %tmp1; - ret float %tmp2; -} - -define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) { - ; CHECK: test_fmul_lane_ss4S_swap - ; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] - %tmp1 = extractelement <4 x float> %v, i32 3 - %tmp2 = fmul float %tmp1, %a; - ret float %tmp2; -} - - -define double @test_fmul_lane_ddD(double %a, <1 x double> %v) { - ; CHECK: test_fmul_lane_ddD - ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] - %tmp1 = extractelement <1 x double> %v, i32 0 - %tmp2 = fmul double %a, %tmp1; - ret double %tmp2; -} - - - -define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) { - ; CHECK: test_fmul_lane_dd2D - ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] - %tmp1 = extractelement <2 x double> %v, i32 1 - %tmp2 = fmul double %a, %tmp1; - ret double %tmp2; -} - - -define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) { - ; CHECK: test_fmul_lane_dd2D_swap - ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] - %tmp1 = extractelement <2 x double> %v, i32 1 - %tmp2 = fmul double %tmp1, %a; - ret double %tmp2; -} - -declare float @llvm.aarch64.neon.vmulx.f32(float, float) - -define float @test_fmulx_lane_f32(float %a, <2 x float> %v) { - ; CHECK: test_fmulx_lane_f32 - ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] - %tmp1 = extractelement <2 x float> %v, i32 1 - %tmp2 = call float 
@llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1) - ret float %tmp2; -} - -define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) { - ; CHECK: test_fmulx_laneq_f32 - ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] - %tmp1 = extractelement <4 x float> %v, i32 3 - %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1) - ret float %tmp2; -} - -define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) { - ; CHECK: test_fmulx_laneq_f32_swap - ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] - %tmp1 = extractelement <4 x float> %v, i32 3 - %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a) - ret float %tmp2; -} - -declare double @llvm.aarch64.neon.vmulx.f64(double, double) - -define double @test_fmulx_lane_f64(double %a, <1 x double> %v) { - ; CHECK: test_fmulx_lane_f64 - ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] - %tmp1 = extractelement <1 x double> %v, i32 0 - %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) - ret double %tmp2; -} - -define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) { - ; CHECK: test_fmulx_laneq_f64_0 - ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] - %tmp1 = extractelement <2 x double> %v, i32 0 - %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) - ret double %tmp2; -} - - -define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) { - ; CHECK: test_fmulx_laneq_f64_1 - ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] - %tmp1 = extractelement <2 x double> %v, i32 1 - %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) - ret double %tmp2; -} - -define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) { - ; CHECK: test_fmulx_laneq_f64_1_swap - ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] - %tmp1 = extractelement <2 x double> %v, i32 1 - %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a) - ret double %tmp2; -} - diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll deleted file mode 100644 index 2ecde91d7e1b..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-compare.ll +++ /dev/null @@ -1,344 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 has (the non-trivial parts of) this test covered by vcmp.ll - -;; Scalar Integer Compare - -define i64 @test_vceqd(i64 %a, i64 %b) { -; CHECK: test_vceqd -; CHECK: cmeq {{d[0-9]+}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %vceq.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vceq1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceq.i, <1 x i64> %vceq1.i) - %0 = extractelement <1 x i64> %vceq2.i, i32 0 - ret i64 %0 -} - -define i64 @test_vceqzd(i64 %a) { -; CHECK: test_vceqzd -; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 -entry: - %vceqz.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vceqz1.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceqz.i, <1 x i64> zeroinitializer) - %0 = extractelement <1 x i64> %vceqz1.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcged(i64 %a, i64 %b) { -; CHECK: test_vcged -; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %vcge.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcge1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i) - %0 = extractelement 
<1 x i64> %vcge2.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcgezd(i64 %a) { -; CHECK: test_vcgezd -; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0 -entry: - %vcgez.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcgez1.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcgez.i, <1 x i64> zeroinitializer) - %0 = extractelement <1 x i64> %vcgez1.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcgtd(i64 %a, i64 %b) { -; CHECK: test_vcgtd -; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %vcgt.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcgt1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i) - %0 = extractelement <1 x i64> %vcgt2.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcgtzd(i64 %a) { -; CHECK: test_vcgtzd -; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0 -entry: - %vcgtz.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcgtz1.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgtz.i, <1 x i64> zeroinitializer) - %0 = extractelement <1 x i64> %vcgtz1.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcled(i64 %a, i64 %b) { -; CHECK: test_vcled -; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %vcgt.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vcgt1.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i) - %0 = extractelement <1 x i64> %vcgt2.i, i32 0 - ret i64 %0 -} - -define i64 @test_vclezd(i64 %a) { -; CHECK: test_vclezd -; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0 -entry: - %vclez.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vclez1.i = call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64> %vclez.i, <1 x i64> zeroinitializer) - %0 = extractelement <1 x i64> %vclez1.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcltd(i64 %a, i64 %b) { -; CHECK: test_vcltd -; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %vcge.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vcge1.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i) - %0 = extractelement <1 x i64> %vcge2.i, i32 0 - ret i64 %0 -} - -define i64 @test_vcltzd(i64 %a) { -; CHECK: test_vcltzd -; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0x0 -entry: - %vcltz.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vcltz1.i = call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64> %vcltz.i, <1 x i64> zeroinitializer) - %0 = extractelement <1 x i64> %vcltz1.i, i32 0 - ret i64 %0 -} - -define i64 @test_vtstd(i64 %a, i64 %b) { -; CHECK: test_vtstd -; CHECK: cmtst {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %vtst.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vtst1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vtst2.i = call <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64> %vtst.i, <1 x i64> %vtst1.i) - %0 = extractelement <1 x i64> %vtst2.i, i32 0 - ret i64 %0 -} - - -define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vcage_f64 -; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2 - ret <1 x i64> %vcage2.i -} - -define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vcagt_f64 -; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcagt2.i = tail 
call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2 - ret <1 x i64> %vcagt2.i -} - -define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vcale_f64 -; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2 - ret <1 x i64> %vcage2.i -} - -define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vcalt_f64 -; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2 - ret <1 x i64> %vcagt2.i -} - -define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vceq_s64 -; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp eq <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vceq_u64 -; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp eq <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vceq_f64 -; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = fcmp oeq <1 x double> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vcge_s64 -; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp sge <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vcge_u64 -; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp uge <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vcge_f64 -; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = fcmp oge <1 x double> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vcle_s64 -; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp sle <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vcle_u64 -; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp ule <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vcle_f64 -; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = fcmp ole <1 x double> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vcgt_s64 -; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp sgt <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vcgt_u64 -; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp ugt <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 
{ -; CHECK: test_vcgt_f64 -; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = fcmp ogt <1 x double> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vclt_s64 -; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp slt <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 { -; CHECK: test_vclt_u64 -; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = icmp ult <1 x i64> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 { -; CHECK: test_vclt_f64 -; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} - %cmp.i = fcmp olt <1 x double> %a, %b - %sext.i = sext <1 x i1> %cmp.i to <1 x i64> - ret <1 x i64> %sext.i -} - -define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 { -; CHECK: test_vceqz_s64 -; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 - %1 = icmp eq <1 x i64> %a, zeroinitializer - %vceqz.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vceqz.i -} - -define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 { -; CHECK: test_vceqz_u64 -; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 - %1 = icmp eq <1 x i64> %a, zeroinitializer - %vceqz.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vceqz.i -} - -define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 { -; CHECK: test_vceqz_p64 -; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 - %1 = icmp eq <1 x i64> %a, zeroinitializer - %vceqz.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vceqz.i -} - -define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 { -; CHECK: test_vceqzq_p64 -; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0 - %1 = icmp eq <2 x i64> %a, zeroinitializer - %vceqz.i = sext <2 x i1> %1 to <2 x i64> - ret <2 x i64> %vceqz.i -} - -define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 { -; CHECK: test_vcgez_s64 -; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0 - %1 = icmp sge <1 x i64> %a, zeroinitializer - %vcgez.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vcgez.i -} - -define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 { -; CHECK: test_vclez_s64 -; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0 - %1 = icmp sle <1 x i64> %a, zeroinitializer - %vclez.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vclez.i -} - -define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 { -; CHECK: test_vcgtz_s64 -; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0 - %1 = icmp sgt <1 x i64> %a, zeroinitializer - %vcgtz.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vcgtz.i -} - -define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 { -; CHECK: test_vcltz_s64 -; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0 - %1 = icmp slt <1 x i64> %a, zeroinitializer - %vcltz.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vcltz.i -} - -declare <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double>, <1 x double>) -declare <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double>, <1 x double>) -declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vchi.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) 
-declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index a505dafa3e78..a01df3275a99 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -1,11 +1,9 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK
 
 define float @test_dup_sv2S(<2 x float> %v) {
  ; CHECK-LABEL: test_dup_sv2S
- ; CHECK-AARCH64: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
- ; CHECK-ARM64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ ; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
  %tmp1 = extractelement <2 x float> %v, i32 1
  ret float %tmp1
 }
@@ -39,16 +37,14 @@ define double @test_dup_dvD(<1 x double> %v) {
 
 define double @test_dup_dv2D(<2 x double> %v) {
  ; CHECK-LABEL: test_dup_dv2D
- ; CHECK-AARCH64: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
- ; CHECK-ARM64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
  %tmp1 = extractelement <2 x double> %v, i32 1
  ret double %tmp1
 }
 
 define double @test_dup_dv2D_0(<2 x double> %v) {
  ; CHECK-LABEL: test_dup_dv2D_0
- ; CHECK-AARCH64: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
- ; CHECK-ARM64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
  ; CHECK: ret
  %tmp1 = extractelement <2 x double> %v, i32 1
  ret double %tmp1
@@ -56,50 +52,43 @@ define double @test_dup_dv2D_0(<2 x double> %v) {
 
 define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
  ; CHECK-LABEL: test_vector_dup_bv16B
- ; CHECK-AARCH64: dup {{b[0-9]+}}, {{v[0-9]+}}.b[14]
  %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14>
  ret <1 x i8> %shuffle.i
 }
 
 define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
  ; CHECK-LABEL: test_vector_dup_bv8B
- ; CHECK-AARCH64: dup {{b[0-9]+}}, {{v[0-9]+}}.b[7]
  %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7>
  ret <1 x i8> %shuffle.i
 }
 
 define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
  ; CHECK-LABEL: test_vector_dup_hv8H
- ; CHECK-AARCH64: dup {{h[0-9]+}}, {{v[0-9]+}}.h[7]
  %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7>
  ret <1 x i16> %shuffle.i
 }
 
 define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
  ; CHECK-LABEL: test_vector_dup_hv4H
- ; CHECK-AARCH64: dup {{h[0-9]+}}, {{v[0-9]+}}.h[3]
  %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3>
  ret <1 x i16> %shuffle.i
 }
 
 define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
  ; CHECK-LABEL: test_vector_dup_sv4S
- ; CHECK-AARCH64: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3>
  ret <1 x i32> %shuffle
 }
 
 define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
  ; CHECK-LABEL: test_vector_dup_sv2S
- ; CHECK-AARCH64: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1>
  ret <1 x i32> %shuffle
 }
 
 define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
  ; CHECK-LABEL: test_vector_dup_dv2D
- ; CHECK-AARCH64: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
- ; CHECK-ARM64: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
+ ; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
  %shuffle.i =
shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> ret <1 x i64> %shuffle.i } diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll deleted file mode 100644 index c19b0a765c60..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-cvt.ll +++ /dev/null @@ -1,134 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 has a different approach to scalars. Discarding. - -define float @test_vcvts_f32_s32(i32 %a) { -; CHECK: test_vcvts_f32_s32 -; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32> %vcvtf.i) - ret float %0 -} - -declare float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32>) - -define double @test_vcvtd_f64_s64(i64 %a) { -; CHECK: test_vcvtd_f64_s64 -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64> %vcvtf.i) - ret double %0 -} - -declare double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64>) - -define float @test_vcvts_f32_u32(i32 %a) { -; CHECK: test_vcvts_f32_u32 -; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32> %vcvtf.i) - ret float %0 -} - -declare float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32>) - -define double @test_vcvtd_f64_u64(i64 %a) { -; CHECK: test_vcvtd_f64_u64 -; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64> %vcvtf.i) - ret double %0 -} - -declare double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64>) - -define float @test_vcvts_n_f32_s32(i32 %a) { -; CHECK: test_vcvts_n_f32_s32 -; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 -entry: - %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1) - ret float %0 -} - -declare float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32>, i32) - -define double @test_vcvtd_n_f64_s64(i64 %a) { -; CHECK: test_vcvtd_n_f64_s64 -; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 -entry: - %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1) - ret double %0 -} - -declare double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64>, i32) - -define float @test_vcvts_n_f32_u32(i32 %a) { -; CHECK: test_vcvts_n_f32_u32 -; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 -entry: - %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0 - %0 = call float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1) - ret float %0 -} - -declare float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32>, i32) - -define double @test_vcvtd_n_f64_u64(i64 %a) { -; CHECK: test_vcvtd_n_f64_u64 -; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 -entry: - %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0 - %0 = call double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1) - ret double %0 -} - -declare double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64>, i32) - -define i32 @test_vcvts_n_s32_f32(float %a) { -; CHECK: test_vcvts_n_s32_f32 -; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1 -entry: - %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float %a, i32 1) - %0 = 
extractelement <1 x i32> %fcvtzs1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float, i32) - -define i64 @test_vcvtd_n_s64_f64(double %a) { -; CHECK: test_vcvtd_n_s64_f64 -; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1 -entry: - %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double %a, i32 1) - %0 = extractelement <1 x i64> %fcvtzs1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double, i32) - -define i32 @test_vcvts_n_u32_f32(float %a) { -; CHECK: test_vcvts_n_u32_f32 -; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32 -entry: - %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float %a, i32 32) - %0 = extractelement <1 x i32> %fcvtzu1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float, i32) - -define i64 @test_vcvtd_n_u64_f64(double %a) { -; CHECK: test_vcvtd_n_u64_f64 -; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64 -entry: - %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double %a, i32 64) - %0 = extractelement <1 x i64> %fcvtzu1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double, i32) diff --git a/test/CodeGen/AArch64/neon-scalar-ext.ll b/test/CodeGen/AArch64/neon-scalar-ext.ll deleted file mode 100644 index 502fcdacfc10..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-ext.ll +++ /dev/null @@ -1,114 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 doesn't use <1 x iN> types, for N < 64. - -define <1 x i64> @test_zext_v1i32_v1i64(<2 x i32> %v) nounwind readnone { -; CHECK-LABEL: test_zext_v1i32_v1i64: -; CHECK: ushll v0.2d, v0.2s, #0 - %1 = extractelement <2 x i32> %v, i32 0 - %2 = insertelement <1 x i32> undef, i32 %1, i32 0 - %3 = zext <1 x i32> %2 to <1 x i64> - ret <1 x i64> %3 -} - -define <1 x i32> @test_zext_v1i16_v1i32(<4 x i16> %v) nounwind readnone { -; CHECK-LABEL: test_zext_v1i16_v1i32: -; CHECK: ushll v0.4s, v0.4h, #0 - %1 = extractelement <4 x i16> %v, i32 0 - %2 = insertelement <1 x i16> undef, i16 %1, i32 0 - %3 = zext <1 x i16> %2 to <1 x i32> - ret <1 x i32> %3 -} - -define <1 x i16> @test_zext_v1i8_v1i16(<8 x i8> %v) nounwind readnone { -; CHECK-LABEL: test_zext_v1i8_v1i16: -; CHECK: ushll v0.8h, v0.8b, #0 - %1 = extractelement <8 x i8> %v, i32 0 - %2 = insertelement <1 x i8> undef, i8 %1, i32 0 - %3 = zext <1 x i8> %2 to <1 x i16> - ret <1 x i16> %3 -} - -define <1 x i32> @test_zext_v1i8_v1i32(<8 x i8> %v) nounwind readnone { -; CHECK-LABEL: test_zext_v1i8_v1i32: -; CHECK: dup b0, v0.b[0] - %1 = extractelement <8 x i8> %v, i32 0 - %2 = insertelement <1 x i8> undef, i8 %1, i32 0 - %3 = zext <1 x i8> %2 to <1 x i32> - ret <1 x i32> %3 -} - -define <1 x i64> @test_zext_v1i16_v1i64(<4 x i16> %v) nounwind readnone { -; CHECK-LABEL: test_zext_v1i16_v1i64: -; CHECK: dup h0, v0.h[0] - %1 = extractelement <4 x i16> %v, i32 0 - %2 = insertelement <1 x i16> undef, i16 %1, i32 0 - %3 = zext <1 x i16> %2 to <1 x i64> - ret <1 x i64> %3 -} - -define <1 x i64> @test_zext_v1i8_v1i64(<8 x i8> %v) nounwind readnone { -; CHECK-LABEL: test_zext_v1i8_v1i64: -; CHECK: dup b0, v0.b[0] - %1 = extractelement <8 x i8> %v, i32 0 - %2 = insertelement <1 x i8> undef, i8 %1, i32 0 - %3 = zext <1 x i8> %2 to <1 x i64> - ret <1 x i64> %3 -} - -define <1 x i64> @test_sext_v1i32_v1i64(<2 x i32> %v) nounwind readnone { -; CHECK-LABEL: test_sext_v1i32_v1i64: -; CHECK: sshll v0.2d, v0.2s, #0 - %1 = extractelement <2 x i32> %v, i32 0 - %2 = 
insertelement <1 x i32> undef, i32 %1, i32 0 - %3 = sext <1 x i32> %2 to <1 x i64> - ret <1 x i64> %3 -} - -define <1 x i32> @test_sext_v1i16_v1i32(<4 x i16> %v) nounwind readnone { -; CHECK-LABEL: test_sext_v1i16_v1i32: -; CHECK: sshll v0.4s, v0.4h, #0 - %1 = extractelement <4 x i16> %v, i32 0 - %2 = insertelement <1 x i16> undef, i16 %1, i32 0 - %3 = sext <1 x i16> %2 to <1 x i32> - ret <1 x i32> %3 -} - -define <1 x i16> @test_sext_v1i8_v1i16(<8 x i8> %v) nounwind readnone { -; CHECK-LABEL: test_sext_v1i8_v1i16: -; CHECK: sshll v0.8h, v0.8b, #0 - %1 = extractelement <8 x i8> %v, i32 0 - %2 = insertelement <1 x i8> undef, i8 %1, i32 0 - %3 = sext <1 x i8> %2 to <1 x i16> - ret <1 x i16> %3 -} - -define <1 x i32> @test_sext_v1i8_v1i32(<8 x i8> %v) nounwind readnone { -; CHECK-LABEL: test_sext_v1i8_v1i32: -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK: sshll v0.4s, v0.4h, #0 - %1 = extractelement <8 x i8> %v, i32 0 - %2 = insertelement <1 x i8> undef, i8 %1, i32 0 - %3 = sext <1 x i8> %2 to <1 x i32> - ret <1 x i32> %3 -} - -define <1 x i64> @test_sext_v1i16_v1i64(<4 x i16> %v) nounwind readnone { -; CHECK-LABEL: test_sext_v1i16_v1i64: -; CHECK: sshll v0.4s, v0.4h, #0 -; CHECK: sshll v0.2d, v0.2s, #0 - %1 = extractelement <4 x i16> %v, i32 0 - %2 = insertelement <1 x i16> undef, i16 %1, i32 0 - %3 = sext <1 x i16> %2 to <1 x i64> - ret <1 x i64> %3 -} - -define <1 x i64> @test_sext_v1i8_v1i64(<8 x i8> %v) nounwind readnone { -; CHECK-LABEL: test_sext_v1i8_v1i64: -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK: sshll v0.4s, v0.4h, #0 -; CHECK: sshll v0.2d, v0.2s, #0 - %1 = extractelement <8 x i8> %v, i32 0 - %2 = insertelement <1 x i8> undef, i8 %1, i32 0 - %3 = sext <1 x i8> %2 to <1 x i64> - ret <1 x i64> %3 -} diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll deleted file mode 100644 index 2004226bd135..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; intrinsic wrangling that arm64 does differently. 
- -define i8 @test_vqmovunh_s16(i16 %a) { -; CHECK: test_vqmovunh_s16 -; CHECK: sqxtun {{b[0-9]+}}, {{h[0-9]+}} -entry: - %vqmovun.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vqmovun1.i = call <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16> %vqmovun.i) - %0 = extractelement <1 x i8> %vqmovun1.i, i32 0 - ret i8 %0 -} - -define i16 @test_vqmovuns_s32(i32 %a) { -; CHECK: test_vqmovuns_s32 -; CHECK: sqxtun {{h[0-9]+}}, {{s[0-9]+}} -entry: - %vqmovun.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqmovun1.i = call <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32> %vqmovun.i) - %0 = extractelement <1 x i16> %vqmovun1.i, i32 0 - ret i16 %0 -} - -define i32 @test_vqmovund_s64(i64 %a) { -; CHECK: test_vqmovund_s64 -; CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}} -entry: - %vqmovun.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqmovun1.i = call <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64> %vqmovun.i) - %0 = extractelement <1 x i32> %vqmovun1.i, i32 0 - ret i32 %0 -} - -declare <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16>) -declare <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32>) -declare <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64>) - -define i8 @test_vqmovnh_s16(i16 %a) { -; CHECK: test_vqmovnh_s16 -; CHECK: sqxtn {{b[0-9]+}}, {{h[0-9]+}} -entry: - %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16> %vqmovn.i) - %0 = extractelement <1 x i8> %vqmovn1.i, i32 0 - ret i8 %0 -} - -define i16 @test_vqmovns_s32(i32 %a) { -; CHECK: test_vqmovns_s32 -; CHECK: sqxtn {{h[0-9]+}}, {{s[0-9]+}} -entry: - %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32> %vqmovn.i) - %0 = extractelement <1 x i16> %vqmovn1.i, i32 0 - ret i16 %0 -} - -define i32 @test_vqmovnd_s64(i64 %a) { -; CHECK: test_vqmovnd_s64 -; CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}} -entry: - %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64> %vqmovn.i) - %0 = extractelement <1 x i32> %vqmovn1.i, i32 0 - ret i32 %0 -} - -declare <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16>) -declare <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32>) -declare <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64>) - -define i8 @test_vqmovnh_u16(i16 %a) { -; CHECK: test_vqmovnh_u16 -; CHECK: uqxtn {{b[0-9]+}}, {{h[0-9]+}} -entry: - %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16> %vqmovn.i) - %0 = extractelement <1 x i8> %vqmovn1.i, i32 0 - ret i8 %0 -} - - -define i16 @test_vqmovns_u32(i32 %a) { -; CHECK: test_vqmovns_u32 -; CHECK: uqxtn {{h[0-9]+}}, {{s[0-9]+}} -entry: - %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32> %vqmovn.i) - %0 = extractelement <1 x i16> %vqmovn1.i, i32 0 - ret i16 %0 -} - -define i32 @test_vqmovnd_u64(i64 %a) { -; CHECK: test_vqmovnd_u64 -; CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}} -entry: - %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64> %vqmovn.i) - %0 = extractelement <1 x i32> %vqmovn1.i, i32 0 - ret i32 %0 -} - -declare <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16>) -declare <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32>) -declare <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll 
deleted file mode 100644 index 9b2ae2bbc0ab..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-fabd.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has these two tests in vabs.ll - -define float @test_vabds_f32(float %a, float %b) { -; CHECK-LABEL: test_vabds_f32 -; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -entry: - %0 = call float @llvm.aarch64.neon.vabd.f32(float %a, float %a) - ret float %0 -} - -define double @test_vabdd_f64(double %a, double %b) { -; CHECK-LABEL: test_vabdd_f64 -; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -entry: - %0 = call double @llvm.aarch64.neon.vabd.f64(double %a, double %b) - ret double %0 -} - -declare double @llvm.aarch64.neon.vabd.f64(double, double) -declare float @llvm.aarch64.neon.vabd.f32(float, float) diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll deleted file mode 100644 index 341ed69b4822..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-fcvt.ll +++ /dev/null @@ -1,234 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 duplicates these tests in cvt.ll - -;; Scalar Floating-point Convert - -define float @test_vcvtxn(double %a) { -; CHECK: test_vcvtxn -; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}} -entry: - %vcvtf = call float @llvm.aarch64.neon.fcvtxn(double %a) - ret float %vcvtf -} - -declare float @llvm.aarch64.neon.fcvtxn(double) - -define i32 @test_vcvtass(float %a) { -; CHECK: test_vcvtass -; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtas1.i = call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtas1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float) - -define i64 @test_test_vcvtasd(double %a) { -; CHECK: test_test_vcvtasd -; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtas1.i = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtas1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double) - -define i32 @test_vcvtaus(float %a) { -; CHECK: test_vcvtaus -; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtau1.i = call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtau1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float) - -define i64 @test_vcvtaud(double %a) { -; CHECK: test_vcvtaud -; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtau1.i = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtau1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double) - -define i32 @test_vcvtmss(float %a) { -; CHECK: test_vcvtmss -; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtms1.i = call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtms1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float) - -define i64 @test_vcvtmd_s64_f64(double %a) { -; CHECK: test_vcvtmd_s64_f64 -; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtms1.i = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtms1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double) - -define i32 @test_vcvtmus(float %a) { -; CHECK: test_vcvtmus -; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtmu1.i = 
call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float) - -define i64 @test_vcvtmud(double %a) { -; CHECK: test_vcvtmud -; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtmu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double) - -define i32 @test_vcvtnss(float %a) { -; CHECK: test_vcvtnss -; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtns1.i = call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtns1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float) - -define i64 @test_vcvtnd_s64_f64(double %a) { -; CHECK: test_vcvtnd_s64_f64 -; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtns1.i = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtns1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double) - -define i32 @test_vcvtnus(float %a) { -; CHECK: test_vcvtnus -; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtnu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtnu1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float) - -define i64 @test_vcvtnud(double %a) { -; CHECK: test_vcvtnud -; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtnu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double) - -define i32 @test_vcvtpss(float %a) { -; CHECK: test_vcvtpss -; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtps1.i = call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtps1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float) - -define i64 @test_vcvtpd_s64_f64(double %a) { -; CHECK: test_vcvtpd_s64_f64 -; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtps1.i = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtps1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double) - -define i32 @test_vcvtpus(float %a) { -; CHECK: test_vcvtpus -; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtpu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float) - -define i64 @test_vcvtpud(double %a) { -; CHECK: test_vcvtpud -; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtpu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double) - -define i32 @test_vcvtss(float %a) { -; CHECK: test_vcvtss -; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtzs1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float) - -define i64 @test_vcvtd_s64_f64(double %a) { -; CHECK: test_vcvtd_s64_f64 -; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvzs1.i = call <1 x i64> 
@llvm.aarch64.neon.fcvtzs.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvzs1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double) - -define i32 @test_vcvtus(float %a) { -; CHECK: test_vcvtus -; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}} -entry: - %vcvtzu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float %a) - %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float) - -define i64 @test_vcvtud(double %a) { -; CHECK: test_vcvtud -; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}} -entry: - %vcvtzu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double %a) - %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double) diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll deleted file mode 100644 index b17d8655c6f9..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll +++ /dev/null @@ -1,283 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; arm64 does not use intrinsics for comparisons. - -;; Scalar Floating-point Compare - -define i32 @test_vceqs_f32(float %a, float %b) { -; CHECK-LABEL: test_vceqs_f32 -; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fceq2.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fceq2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vceqd_f64(double %a, double %b) { -; CHECK-LABEL: test_vceqd_f64 -; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fceq2.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fceq2.i, i32 0 - ret i64 %0 -} - -define <1 x i64> @test_vceqz_f64(<1 x double> %a) { -; CHECK-LABEL: test_vceqz_f64 -; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0 -entry: - %0 = fcmp oeq <1 x double> %a, zeroinitializer - %vceqz.i = sext <1 x i1> %0 to <1 x i64> - ret <1 x i64> %vceqz.i -} - -define i32 @test_vceqzs_f32(float %a) { -; CHECK-LABEL: test_vceqzs_f32 -; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0 -entry: - %fceq1.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float 0.0) - %0 = extractelement <1 x i32> %fceq1.i, i32 0 - ret i32 %0 -} - -define i64 @test_vceqzd_f64(double %a) { -; CHECK-LABEL: test_vceqzd_f64 -; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0 -entry: - %fceq1.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double %a, float 0.0) - %0 = extractelement <1 x i64> %fceq1.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcges_f32(float %a, float %b) { -; CHECK-LABEL: test_vcges_f32 -; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcge2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcged_f64(double %a, double %b) { -; CHECK-LABEL: test_vcged_f64 -; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcge2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcgezs_f32(float %a) { -; CHECK-LABEL: test_vcgezs_f32 -; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0 -entry: - %fcge1.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float 0.0) - %0 = extractelement <1 x i32> %fcge1.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcgezd_f64(double %a) { -; 
CHECK-LABEL: test_vcgezd_f64 -; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0 -entry: - %fcge1.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double %a, float 0.0) - %0 = extractelement <1 x i64> %fcge1.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcgts_f32(float %a, float %b) { -; CHECK-LABEL: test_vcgts_f32 -; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcgt2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcgtd_f64(double %a, double %b) { -; CHECK-LABEL: test_vcgtd_f64 -; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcgt2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcgtzs_f32(float %a) { -; CHECK-LABEL: test_vcgtzs_f32 -; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0 -entry: - %fcgt1.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float 0.0) - %0 = extractelement <1 x i32> %fcgt1.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcgtzd_f64(double %a) { -; CHECK-LABEL: test_vcgtzd_f64 -; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0 -entry: - %fcgt1.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double %a, float 0.0) - %0 = extractelement <1 x i64> %fcgt1.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcles_f32(float %a, float %b) { -; CHECK-LABEL: test_vcles_f32 -; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcge2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcled_f64(double %a, double %b) { -; CHECK-LABEL: test_vcled_f64 -; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcge2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vclezs_f32(float %a) { -; CHECK-LABEL: test_vclezs_f32 -; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0 -entry: - %fcle1.i = call <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float %a, float 0.0) - %0 = extractelement <1 x i32> %fcle1.i, i32 0 - ret i32 %0 -} - -define i64 @test_vclezd_f64(double %a) { -; CHECK-LABEL: test_vclezd_f64 -; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0 -entry: - %fcle1.i = call <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double %a, float 0.0) - %0 = extractelement <1 x i64> %fcle1.i, i32 0 - ret i64 %0 -} - -define i32 @test_vclts_f32(float %a, float %b) { -; CHECK-LABEL: test_vclts_f32 -; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcgt2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcltd_f64(double %a, double %b) { -; CHECK-LABEL: test_vcltd_f64 -; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcgt2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcltzs_f32(float %a) { -; CHECK-LABEL: test_vcltzs_f32 -; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0 -entry: - %fclt1.i = call <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float %a, float 0.0) - %0 = extractelement <1 x i32> %fclt1.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcltzd_f64(double %a) { -; CHECK-LABEL: test_vcltzd_f64 -; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0 -entry: - %fclt1.i = call <1 x 
i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double %a, float 0.0) - %0 = extractelement <1 x i64> %fclt1.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcages_f32(float %a, float %b) { -; CHECK-LABEL: test_vcages_f32 -; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcage2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcaged_f64(double %a, double %b) { -; CHECK-LABEL: test_vcaged_f64 -; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcage2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcagts_f32(float %a, float %b) { -; CHECK-LABEL: test_vcagts_f32 -; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcagt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcagt2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcagtd_f64(double %a, double %b) { -; CHECK-LABEL: test_vcagtd_f64 -; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcagt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcagt2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcales_f32(float %a, float %b) { -; CHECK-LABEL: test_vcales_f32 -; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcage2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcaled_f64(double %a, double %b) { -; CHECK-LABEL: test_vcaled_f64 -; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcage2.i, i32 0 - ret i64 %0 -} - -define i32 @test_vcalts_f32(float %a, float %b) { -; CHECK-LABEL: test_vcalts_f32 -; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} -entry: - %fcalt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b) - %0 = extractelement <1 x i32> %fcalt2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vcaltd_f64(double %a, double %b) { -; CHECK-LABEL: test_vcaltd_f64 -; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} -entry: - %fcalt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b) - %0 = extractelement <1 x i64> %fcalt2.i, i32 0 - ret i64 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float, float) -declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double, float) -declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double, double) -declare <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float, float) -declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double, float) -declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double, double) -declare <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float, float) -declare <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double, float) -declare <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float, float) -declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double, float) -declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double, double) -declare <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float, float) -declare <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double, float) -declare <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float, float) -declare <1 x i64> 
@llvm.aarch64.neon.fcage.v1i64.f64.f64(double, double) -declare <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float, float) -declare <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double, double) diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll deleted file mode 100644 index ac44c090b411..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-mul.ll +++ /dev/null @@ -1,144 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Just intrinsic wrangling, and arm64 does scalar differently anyway. - -define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) { -; CHECK: test_vqdmulhh_s16 -; CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - %1 = insertelement <1 x i16> undef, i16 %a, i32 0 - %2 = insertelement <1 x i16> undef, i16 %b, i32 0 - %3 = call <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16> %1, <1 x i16> %2) - %4 = extractelement <1 x i16> %3, i32 0 - ret i16 %4 -} - -define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) { -; CHECK: test_vqdmulhs_s32 -; CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = insertelement <1 x i32> undef, i32 %a, i32 0 - %2 = insertelement <1 x i32> undef, i32 %b, i32 0 - %3 = call <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32> %1, <1 x i32> %2) - %4 = extractelement <1 x i32> %3, i32 0 - ret i32 %4 -} - -declare <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16>, <1 x i16>) -declare <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32>, <1 x i32>) - -define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) { -; CHECK: test_vqrdmulhh_s16 -; CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - %1 = insertelement <1 x i16> undef, i16 %a, i32 0 - %2 = insertelement <1 x i16> undef, i16 %b, i32 0 - %3 = call <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16> %1, <1 x i16> %2) - %4 = extractelement <1 x i16> %3, i32 0 - ret i16 %4 -} - -define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) { -; CHECK: test_vqrdmulhs_s32 -; CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = insertelement <1 x i32> undef, i32 %a, i32 0 - %2 = insertelement <1 x i32> undef, i32 %b, i32 0 - %3 = call <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32> %1, <1 x i32> %2) - %4 = extractelement <1 x i32> %3, i32 0 - ret i32 %4 -} - -declare <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16>, <1 x i16>) -declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>) - -define float @test_vmulxs_f32(float %a, float %b) { -; CHECK: test_vmulxs_f32 -; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b) - ret float %1 -} - -define double @test_vmulxd_f64(double %a, double %b) { -; CHECK: test_vmulxd_f64 -; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b) - ret double %1 -} - -declare float @llvm.aarch64.neon.vmulx.f32(float, float) -declare double @llvm.aarch64.neon.vmulx.f64(double, double) - -define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) { -; CHECK: test_vqdmlalh_s16 -; CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vqdmlal.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqdmlal1.i = insertelement <1 x i16> undef, i16 %b, i32 0 - %vqdmlal2.i = insertelement <1 x i16> undef, i16 %c, i32 0 - %vqdmlal3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32> %vqdmlal.i, <1 x i16> %vqdmlal1.i, <1 x i16> %vqdmlal2.i) - %0 = extractelement <1 x i32> %vqdmlal3.i, i32 0 - ret i32 %0 -} - -define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) { -; CHECK: 
test_vqdmlals_s32 -; CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vqdmlal.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqdmlal1.i = insertelement <1 x i32> undef, i32 %b, i32 0 - %vqdmlal2.i = insertelement <1 x i32> undef, i32 %c, i32 0 - %vqdmlal3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64> %vqdmlal.i, <1 x i32> %vqdmlal1.i, <1 x i32> %vqdmlal2.i) - %0 = extractelement <1 x i64> %vqdmlal3.i, i32 0 - ret i64 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32>, <1 x i16>, <1 x i16>) -declare <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64>, <1 x i32>, <1 x i32>) - -define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) { -; CHECK: test_vqdmlslh_s16 -; CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vqdmlsl.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqdmlsl1.i = insertelement <1 x i16> undef, i16 %b, i32 0 - %vqdmlsl2.i = insertelement <1 x i16> undef, i16 %c, i32 0 - %vqdmlsl3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32> %vqdmlsl.i, <1 x i16> %vqdmlsl1.i, <1 x i16> %vqdmlsl2.i) - %0 = extractelement <1 x i32> %vqdmlsl3.i, i32 0 - ret i32 %0 -} - -define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) { -; CHECK: test_vqdmlsls_s32 -; CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vqdmlsl.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqdmlsl1.i = insertelement <1 x i32> undef, i32 %b, i32 0 - %vqdmlsl2.i = insertelement <1 x i32> undef, i32 %c, i32 0 - %vqdmlsl3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64> %vqdmlsl.i, <1 x i32> %vqdmlsl1.i, <1 x i32> %vqdmlsl2.i) - %0 = extractelement <1 x i64> %vqdmlsl3.i, i32 0 - ret i64 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32>, <1 x i16>, <1 x i16>) -declare <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64>, <1 x i32>, <1 x i32>) - -define i32 @test_vqdmullh_s16(i16 %a, i16 %b) { -; CHECK: test_vqdmullh_s16 -; CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vqdmull.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vqdmull1.i = insertelement <1 x i16> undef, i16 %b, i32 0 - %vqdmull2.i = call <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16> %vqdmull.i, <1 x i16> %vqdmull1.i) - %0 = extractelement <1 x i32> %vqdmull2.i, i32 0 - ret i32 %0 -} - -define i64 @test_vqdmulls_s32(i32 %a, i32 %b) { -; CHECK: test_vqdmulls_s32 -; CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vqdmull.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqdmull1.i = insertelement <1 x i32> undef, i32 %b, i32 0 - %vqdmull2.i = call <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32> %vqdmull.i, <1 x i32> %vqdmull1.i) - %0 = extractelement <1 x i64> %vqdmull2.i, i32 0 - ret i64 %0 -} - -declare <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16>, <1 x i16>) -declare <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32>, <1 x i32>) diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll deleted file mode 100644 index 6eb0a1a152b1..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-neg.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; Intrinsic wrangling. arm64 does it differently. 
- -define i64 @test_vnegd_s64(i64 %a) { -; CHECK: test_vnegd_s64 -; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vneg.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vneg1.i = tail call <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64> %vneg.i) - %0 = extractelement <1 x i64> %vneg1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64>) - -define i8 @test_vqnegb_s8(i8 %a) { -; CHECK: test_vqnegb_s8 -; CHECK: sqneg {{b[0-9]+}}, {{b[0-9]+}} -entry: - %vqneg.i = insertelement <1 x i8> undef, i8 %a, i32 0 - %vqneg1.i = call <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8> %vqneg.i) - %0 = extractelement <1 x i8> %vqneg1.i, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8>) - -define i16 @test_vqnegh_s16(i16 %a) { -; CHECK: test_vqnegh_s16 -; CHECK: sqneg {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vqneg.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vqneg1.i = call <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16> %vqneg.i) - %0 = extractelement <1 x i16> %vqneg1.i, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16>) - -define i32 @test_vqnegs_s32(i32 %a) { -; CHECK: test_vqnegs_s32 -; CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vqneg.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vqneg1.i = call <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32> %vqneg.i) - %0 = extractelement <1 x i32> %vqneg1.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32>) - -define i64 @test_vqnegd_s64(i64 %a) { -; CHECK: test_vqnegd_s64 -; CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vqneg.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vqneg1.i = call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %vqneg.i) - %0 = extractelement <1 x i64> %vqneg1.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll deleted file mode 100644 index 4b1ca6e91c8d..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-recip.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; duplicates arm64 tests in vsqrt.ll - -define float @test_vrecpss_f32(float %a, float %b) { -; CHECK: test_vrecpss_f32 -; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = call float @llvm.aarch64.neon.vrecps.f32(float %a, float %b) - ret float %1 -} - -define double @test_vrecpsd_f64(double %a, double %b) { -; CHECK: test_vrecpsd_f64 -; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %1 = call double @llvm.aarch64.neon.vrecps.f64(double %a, double %b) - ret double %1 -} - -declare float @llvm.aarch64.neon.vrecps.f32(float, float) -declare double @llvm.aarch64.neon.vrecps.f64(double, double) - -define float @test_vrsqrtss_f32(float %a, float %b) { -; CHECK: test_vrsqrtss_f32 -; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - %1 = call float @llvm.aarch64.neon.vrsqrts.f32(float %a, float %b) - ret float %1 -} - -define double @test_vrsqrtsd_f64(double %a, double %b) { -; CHECK: test_vrsqrtsd_f64 -; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - %1 = call double @llvm.aarch64.neon.vrsqrts.f64(double %a, double %b) - ret double %1 -} - -declare float @llvm.aarch64.neon.vrsqrts.f32(float, float) -declare double @llvm.aarch64.neon.vrsqrts.f64(double, double) - -define float @test_vrecpes_f32(float %a) { -; CHECK: test_vrecpes_f32 -; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}} -entry: - %0 = call float @llvm.aarch64.neon.vrecpe.f32(float %a) - ret 
float %0 -} - -define double @test_vrecped_f64(double %a) { -; CHECK: test_vrecped_f64 -; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}} -entry: - %0 = call double @llvm.aarch64.neon.vrecpe.f64(double %a) - ret double %0 -} - -declare float @llvm.aarch64.neon.vrecpe.f32(float) -declare double @llvm.aarch64.neon.vrecpe.f64(double) - -define float @test_vrecpxs_f32(float %a) { -; CHECK: test_vrecpxs_f32 -; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}} -entry: - %0 = call float @llvm.aarch64.neon.vrecpx.f32(float %a) - ret float %0 -} - -define double @test_vrecpxd_f64(double %a) { -; CHECK: test_vrecpxd_f64 -; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}} -entry: - %0 = call double @llvm.aarch64.neon.vrecpx.f64(double %a) - ret double %0 -} - -declare float @llvm.aarch64.neon.vrecpx.f32(float) -declare double @llvm.aarch64.neon.vrecpx.f64(double) - -define float @test_vrsqrtes_f32(float %a) { -; CHECK: test_vrsqrtes_f32 -; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}} -entry: - %0 = call float @llvm.aarch64.neon.vrsqrte.f32(float %a) - ret float %0 -} - -define double @test_vrsqrted_f64(double %a) { -; CHECK: test_vrsqrted_f64 -; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}} -entry: - %0 = call double @llvm.aarch64.neon.vrsqrte.f64(double %a) - ret double %0 -} - -declare float @llvm.aarch64.neon.vrsqrte.f32(float) -declare double @llvm.aarch64.neon.vrsqrte.f64(double) diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll deleted file mode 100644 index 2b94d7524eb9..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll +++ /dev/null @@ -1,216 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Intrinsic wrangling. Duplicates various arm64 tests. - -declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>) - -define <1 x i64> @test_addp_v1i64(<2 x i64> %a) { -; CHECK: test_addp_v1i64: -; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d - %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a) - ret <1 x i64> %val -} - -declare float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float>) - -define float @test_faddp_f32(<2 x float> %a) { -; CHECK: test_faddp_f32: -; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s - %val = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a) - ret float %val -} - -declare double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double>) - -define double @test_faddp_f64(<2 x double> %a) { -; CHECK: test_faddp_f64: -; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d - %val = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a) - ret double %val -} - - -declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>) - -define float @test_fmaxp_f32(<2 x float> %a) { -; CHECK: test_fmaxp_f32: -; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s - %val = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) - ret float %val -} - -declare double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double>) - -define double @test_fmaxp_f64(<2 x double> %a) { -; CHECK: test_fmaxp_f64: -; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d - %val = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a) - ret double %val -} - -declare float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float>) - -define float @test_fminp_f32(<2 x float> %a) { -; CHECK: test_fminp_f32: -; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s - %val = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a) - ret float %val -} - -declare double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double>) - -define double @test_fminp_f64(<2 x double> %a) { -; CHECK: 
test_fminp_f64: -; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d - %val = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a) - ret double %val -} - -declare float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float>) - -define float @test_fmaxnmp_f32(<2 x float> %a) { -; CHECK: test_fmaxnmp_f32: -; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s - %val = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a) - ret float %val -} - -declare double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double>) - -define double @test_fmaxnmp_f64(<2 x double> %a) { -; CHECK: test_fmaxnmp_f64: -; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d - %val = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a) - ret double %val -} - -declare float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float>) - -define float @test_fminnmp_f32(<2 x float> %a) { -; CHECK: test_fminnmp_f32: -; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s - %val = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a) - ret float %val -} - -declare double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double>) - -define double @test_fminnmp_f64(<2 x double> %a) { -; CHECK: test_fminnmp_f64: -; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d - %val = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a) - ret double %val -} - -define float @test_vaddv_f32(<2 x float> %a) { -; CHECK-LABEL: test_vaddv_f32 -; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a) - ret float %1 -} - -define float @test_vaddvq_f32(<4 x float> %a) { -; CHECK-LABEL: test_vaddvq_f32 -; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = call float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float> %a) - ret float %1 -} - -define double @test_vaddvq_f64(<2 x double> %a) { -; CHECK-LABEL: test_vaddvq_f64 -; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a) - ret double %1 -} - -define float @test_vmaxv_f32(<2 x float> %a) { -; CHECK-LABEL: test_vmaxv_f32 -; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a) - ret float %1 -} - -define double @test_vmaxvq_f64(<2 x double> %a) { -; CHECK-LABEL: test_vmaxvq_f64 -; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a) - ret double %1 -} - -define float @test_vminv_f32(<2 x float> %a) { -; CHECK-LABEL: test_vminv_f32 -; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a) - ret float %1 -} - -define double @test_vminvq_f64(<2 x double> %a) { -; CHECK-LABEL: test_vminvq_f64 -; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a) - ret double %1 -} - -define double @test_vmaxnmvq_f64(<2 x double> %a) { -; CHECK-LABEL: test_vmaxnmvq_f64 -; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a) - ret double %1 -} - -define float @test_vmaxnmv_f32(<2 x float> %a) { -; CHECK-LABEL: test_vmaxnmv_f32 -; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a) - ret float %1 -} - -define double @test_vminnmvq_f64(<2 x double> %a) { -; CHECK-LABEL: test_vminnmvq_f64 -; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a) - 
ret double %1 -} - -define float @test_vminnmv_f32(<2 x float> %a) { -; CHECK-LABEL: test_vminnmv_f32 -; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s - %1 = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a) - ret float %1 -} - -define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpaddq_s64 -; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %1 -} - -define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vpaddq_u64 -; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d - %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %1 -} - -define i64 @test_vaddvq_s64(<2 x i64> %a) { -; CHECK-LABEL: test_vaddvq_s64 -; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) - %2 = extractelement <1 x i64> %1, i32 0 - ret i64 %2 -} - -define i64 @test_vaddvq_u64(<2 x i64> %a) { -; CHECK-LABEL: test_vaddvq_u64 -; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d - %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) - %2 = extractelement <1 x i64> %1, i32 0 - ret i64 %2 -} - -declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>) - -declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>) - -declare float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float>) diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll deleted file mode 100644 index ae097afb3a37..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Duplicates arm64'd vshift.ll - -declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_urshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_srshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -declare <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_urshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_srshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - - - diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll deleted file mode 100644 index ea5f8f9286fc..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll +++ /dev/null @@ -1,243 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | 
FileCheck %s -; Intrinsic wrangling and arm64 does it differently. - -declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>) -declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>) - -define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_uqadd_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_sqadd_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -declare <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8>, <1 x i8>) -declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>) - -define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_uqsub_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_sqsub_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -declare <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16>, <1 x i16>) -declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>) - -define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_uqadd_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_sqadd_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -declare <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16>, <1 x i16>) -declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>) - -define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_uqsub_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_sqsub_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -declare <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32>, <1 x i32>) -declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>) - -define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_uqadd_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - -define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_sqadd_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - -declare <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x 
i32>, <1 x i32>) -declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>) - -define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_uqsub_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - - -define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_sqsub_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - -declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uqadd_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sqadd_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uqsub_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sqsub_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define i8 @test_vuqaddb_s8(i8 %a, i8 %b) { -; CHECK: test_vuqaddb_s8 -; CHECK: suqadd {{b[0-9]+}}, {{b[0-9]+}} -entry: - %vuqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0 - %vuqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0 - %vuqadd2.i = call <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8> %vuqadd.i, <1 x i8> %vuqadd1.i) - %0 = extractelement <1 x i8> %vuqadd2.i, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>) - -define i16 @test_vuqaddh_s16(i16 %a, i16 %b) { -; CHECK: test_vuqaddh_s16 -; CHECK: suqadd {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vuqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vuqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0 - %vuqadd2.i = call <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16> %vuqadd.i, <1 x i16> %vuqadd1.i) - %0 = extractelement <1 x i16> %vuqadd2.i, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>) - -define i32 @test_vuqadds_s32(i32 %a, i32 %b) { -; CHECK: test_vuqadds_s32 -; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vuqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vuqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0 - %vuqadd2.i = call <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32> %vuqadd.i, <1 x i32> %vuqadd1.i) - %0 = extractelement <1 x i32> %vuqadd2.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32>, <1 x i32>) - -define i64 @test_vuqaddd_s64(i64 %a, i64 %b) { -; 
CHECK: test_vuqaddd_s64 -; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vuqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vuqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vuqadd2.i = call <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64> %vuqadd.i, <1 x i64> %vuqadd1.i) - %0 = extractelement <1 x i64> %vuqadd2.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64>, <1 x i64>) - -define i8 @test_vsqaddb_u8(i8 %a, i8 %b) { -; CHECK: test_vsqaddb_u8 -; CHECK: usqadd {{b[0-9]+}}, {{b[0-9]+}} -entry: - %vsqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0 - %vsqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0 - %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %vsqadd.i, <1 x i8> %vsqadd1.i) - %0 = extractelement <1 x i8> %vsqadd2.i, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8>, <1 x i8>) - -define i16 @test_vsqaddh_u16(i16 %a, i16 %b) { -; CHECK: test_vsqaddh_u16 -; CHECK: usqadd {{h[0-9]+}}, {{h[0-9]+}} -entry: - %vsqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0 - %vsqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0 - %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %vsqadd.i, <1 x i16> %vsqadd1.i) - %0 = extractelement <1 x i16> %vsqadd2.i, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16>, <1 x i16>) - -define i32 @test_vsqadds_u32(i32 %a, i32 %b) { -; CHECK: test_vsqadds_u32 -; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}} -entry: - %vsqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0 - %vsqadd2.i = call <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32> %vsqadd.i, <1 x i32> %vsqadd1.i) - %0 = extractelement <1 x i32> %vsqadd2.i, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32>, <1 x i32>) - -define i64 @test_vsqaddd_u64(i64 %a, i64 %b) { -; CHECK: test_vsqaddd_u64 -; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}} -entry: - %vsqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0 - %vsqadd2.i = call <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64> %vsqadd.i, <1 x i64> %vsqadd1.i) - %0 = extractelement <1 x i64> %vsqadd2.i, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll deleted file mode 100644 index e78c55bfe166..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll +++ /dev/null @@ -1,95 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Intrinsic wrangling and arm64 does it differently. 
- -declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uqrshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sqrshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -declare <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8>, <1 x i8>) -declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>) - -define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_uqrshl_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - - ret <1 x i8> %tmp1 -} - -define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_sqrshl_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -declare <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16>, <1 x i16>) -declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>) - -define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_uqrshl_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - - ret <1 x i16> %tmp1 -} - -define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_sqrshl_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -declare <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32>, <1 x i32>) -declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>) - -define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_uqrshl_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - - ret <1 x i32> %tmp1 -} - -define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_sqrshl_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - -declare <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uqrshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sqrshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> 
%tmp1 -} - - - diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll deleted file mode 100644 index b7f956cf612a..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll +++ /dev/null @@ -1,89 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; Intrinsic wrangling and arm64 does it differently. - -declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uqshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sqshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -declare <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8>, <1 x i8>) -declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>) - -define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_uqshl_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { -; CHECK: test_sqshl_v1i8_aarch64: - %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) -;CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}} - ret <1 x i8> %tmp1 -} - -declare <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16>, <1 x i16>) -declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>) - -define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_uqshl_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { -; CHECK: test_sqshl_v1i16_aarch64: - %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) -;CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} - ret <1 x i16> %tmp1 -} - -declare <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32>, <1 x i32>) -declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>) - -define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_uqshl_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - -define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { -; CHECK: test_sqshl_v1i32_aarch64: - %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) -;CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} - ret <1 x i32> %tmp1 -} - -declare <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_uqshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> 
%rhs) -;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sqshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - - diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll deleted file mode 100644 index a2bdae5f52ce..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll +++ /dev/null @@ -1,532 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; Intrinsic wrangling & arm64 does it differently. - -define i64 @test_vshrd_n_s64(i64 %a) { -; CHECK: test_vshrd_n_s64 -; CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsshr = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsshr1 = call <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64> %vsshr, i32 63) - %0 = extractelement <1 x i64> %vsshr1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64>, i32) - -define i64 @test_vshrd_n_u64(i64 %a) { -; CHECK: test_vshrd_n_u64 -; CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vushr = insertelement <1 x i64> undef, i64 %a, i32 0 - %vushr1 = call <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64> %vushr, i32 63) - %0 = extractelement <1 x i64> %vushr1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64>, i32) - -define i64 @test_vrshrd_n_s64(i64 %a) { -; CHECK: test_vrshrd_n_s64 -; CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsrshr = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsrshr1 = call <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64> %vsrshr, i32 63) - %0 = extractelement <1 x i64> %vsrshr1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64>, i32) - -define i64 @test_vrshrd_n_u64(i64 %a) { -; CHECK: test_vrshrd_n_u64 -; CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vurshr = insertelement <1 x i64> undef, i64 %a, i32 0 - %vurshr1 = call <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64> %vurshr, i32 63) - %0 = extractelement <1 x i64> %vurshr1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64>, i32) - -define i64 @test_vsrad_n_s64(i64 %a, i64 %b) { -; CHECK: test_vsrad_n_s64 -; CHECK: ssra {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vssra = insertelement <1 x i64> undef, i64 %a, i32 0 - %vssra1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vssra2 = call <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64> %vssra, <1 x i64> %vssra1, i32 63) - %0 = extractelement <1 x i64> %vssra2, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64>, <1 x i64>, i32) - -define i64 @test_vsrad_n_u64(i64 %a, i64 %b) { -; CHECK: test_vsrad_n_u64 -; CHECK: usra {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vusra = insertelement <1 x i64> undef, i64 %a, i32 0 - %vusra1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vusra2 = call <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64> %vusra, <1 x i64> %vusra1, i32 63) - %0 = extractelement <1 x i64> %vusra2, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64>, <1 x i64>, i32) - -define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) { -; CHECK: test_vrsrad_n_s64 -; CHECK: srsra {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsrsra = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsrsra1 = insertelement <1 x i64> undef, i64 %b, 
i32 0 - %vsrsra2 = call <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64> %vsrsra, <1 x i64> %vsrsra1, i32 63) - %0 = extractelement <1 x i64> %vsrsra2, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64>, <1 x i64>, i32) - -define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) { -; CHECK: test_vrsrad_n_u64 -; CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vursra = insertelement <1 x i64> undef, i64 %a, i32 0 - %vursra1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vursra2 = call <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64> %vursra, <1 x i64> %vursra1, i32 63) - %0 = extractelement <1 x i64> %vursra2, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64>, <1 x i64>, i32) - -define i64 @test_vshld_n_s64(i64 %a) { -; CHECK: test_vshld_n_s64 -; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vshl = insertelement <1 x i64> undef, i64 %a, i32 0 - %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63) - %0 = extractelement <1 x i64> %vshl1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64>, i32) - -define i64 @test_vshld_n_u64(i64 %a) { -; CHECK: test_vshld_n_u64 -; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vshl = insertelement <1 x i64> undef, i64 %a, i32 0 - %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63) - %0 = extractelement <1 x i64> %vshl1, i32 0 - ret i64 %0 -} - -define i8 @test_vqshlb_n_s8(i8 %a) { -; CHECK: test_vqshlb_n_s8 -; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, #7 -entry: - %vsqshl = insertelement <1 x i8> undef, i8 %a, i32 0 - %vsqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8> %vsqshl, i32 7) - %0 = extractelement <1 x i8> %vsqshl1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8>, i32) - -define i16 @test_vqshlh_n_s16(i16 %a) { -; CHECK: test_vqshlh_n_s16 -; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, #15 -entry: - %vsqshl = insertelement <1 x i16> undef, i16 %a, i32 0 - %vsqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16> %vsqshl, i32 15) - %0 = extractelement <1 x i16> %vsqshl1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16>, i32) - -define i32 @test_vqshls_n_s32(i32 %a) { -; CHECK: test_vqshls_n_s32 -; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31 -entry: - %vsqshl = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32> %vsqshl, i32 31) - %0 = extractelement <1 x i32> %vsqshl1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32>, i32) - -define i64 @test_vqshld_n_s64(i64 %a) { -; CHECK: test_vqshld_n_s64 -; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsqshl = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64> %vsqshl, i32 63) - %0 = extractelement <1 x i64> %vsqshl1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64>, i32) - -define i8 @test_vqshlb_n_u8(i8 %a) { -; CHECK: test_vqshlb_n_u8 -; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, #7 -entry: - %vuqshl = insertelement <1 x i8> undef, i8 %a, i32 0 - %vuqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8> %vuqshl, i32 7) - %0 = extractelement <1 x i8> %vuqshl1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8>, i32) - -define i16 @test_vqshlh_n_u16(i16 %a) { -; CHECK: test_vqshlh_n_u16 -; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, #15 
-entry: - %vuqshl = insertelement <1 x i16> undef, i16 %a, i32 0 - %vuqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16> %vuqshl, i32 15) - %0 = extractelement <1 x i16> %vuqshl1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16>, i32) - -define i32 @test_vqshls_n_u32(i32 %a) { -; CHECK: test_vqshls_n_u32 -; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31 -entry: - %vuqshl = insertelement <1 x i32> undef, i32 %a, i32 0 - %vuqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32> %vuqshl, i32 31) - %0 = extractelement <1 x i32> %vuqshl1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32>, i32) - -define i64 @test_vqshld_n_u64(i64 %a) { -; CHECK: test_vqshld_n_u64 -; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vuqshl = insertelement <1 x i64> undef, i64 %a, i32 0 - %vuqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64> %vuqshl, i32 63) - %0 = extractelement <1 x i64> %vuqshl1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64>, i32) - -define i8 @test_vqshlub_n_s8(i8 %a) { -; CHECK: test_vqshlub_n_s8 -; CHECK: sqshlu {{b[0-9]+}}, {{b[0-9]+}}, #7 -entry: - %vsqshlu = insertelement <1 x i8> undef, i8 %a, i32 0 - %vsqshlu1 = call <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8> %vsqshlu, i32 7) - %0 = extractelement <1 x i8> %vsqshlu1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8>, i32) - -define i16 @test_vqshluh_n_s16(i16 %a) { -; CHECK: test_vqshluh_n_s16 -; CHECK: sqshlu {{h[0-9]+}}, {{h[0-9]+}}, #15 -entry: - %vsqshlu = insertelement <1 x i16> undef, i16 %a, i32 0 - %vsqshlu1 = call <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16> %vsqshlu, i32 15) - %0 = extractelement <1 x i16> %vsqshlu1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16>, i32) - -define i32 @test_vqshlus_n_s32(i32 %a) { -; CHECK: test_vqshlus_n_s32 -; CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31 -entry: - %vsqshlu = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqshlu1 = call <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32> %vsqshlu, i32 31) - %0 = extractelement <1 x i32> %vsqshlu1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32>, i32) - -define i64 @test_vqshlud_n_s64(i64 %a) { -; CHECK: test_vqshlud_n_s64 -; CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsqshlu = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqshlu1 = call <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64> %vsqshlu, i32 63) - %0 = extractelement <1 x i64> %vsqshlu1, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64>, i32) - -define i64 @test_vsrid_n_s64(i64 %a, i64 %b) { -; CHECK: test_vsrid_n_s64 -; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsri = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63) - %0 = extractelement <1 x i64> %vsri2, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64>, <1 x i64>, i32) - -define i64 @test_vsrid_n_u64(i64 %a, i64 %b) { -; CHECK: test_vsrid_n_u64 -; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsri = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63) - %0 = 
extractelement <1 x i64> %vsri2, i32 0 - ret i64 %0 -} - -define i64 @test_vslid_n_s64(i64 %a, i64 %b) { -; CHECK: test_vslid_n_s64 -; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsli = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63) - %0 = extractelement <1 x i64> %vsli2, i32 0 - ret i64 %0 -} - -declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) - -define i64 @test_vslid_n_u64(i64 %a, i64 %b) { -; CHECK: test_vslid_n_u64 -; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 -entry: - %vsli = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0 - %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63) - %0 = extractelement <1 x i64> %vsli2, i32 0 - ret i64 %0 -} - -define i8 @test_vqshrnh_n_s16(i16 %a) { -; CHECK: test_vqshrnh_n_s16 -; CHECK: sqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 -entry: - %vsqshrn = insertelement <1 x i16> undef, i16 %a, i32 0 - %vsqshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16> %vsqshrn, i32 8) - %0 = extractelement <1 x i8> %vsqshrn1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16>, i32) - -define i16 @test_vqshrns_n_s32(i32 %a) { -; CHECK: test_vqshrns_n_s32 -; CHECK: sqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 -entry: - %vsqshrn = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32> %vsqshrn, i32 16) - %0 = extractelement <1 x i16> %vsqshrn1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32>, i32) - -define i32 @test_vqshrnd_n_s64(i64 %a) { -; CHECK: test_vqshrnd_n_s64 -; CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 -entry: - %vsqshrn = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64> %vsqshrn, i32 32) - %0 = extractelement <1 x i32> %vsqshrn1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64>, i32) - -define i8 @test_vqshrnh_n_u16(i16 %a) { -; CHECK: test_vqshrnh_n_u16 -; CHECK: uqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 -entry: - %vuqshrn = insertelement <1 x i16> undef, i16 %a, i32 0 - %vuqshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16> %vuqshrn, i32 8) - %0 = extractelement <1 x i8> %vuqshrn1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16>, i32) - -define i16 @test_vqshrns_n_u32(i32 %a) { -; CHECK: test_vqshrns_n_u32 -; CHECK: uqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 -entry: - %vuqshrn = insertelement <1 x i32> undef, i32 %a, i32 0 - %vuqshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32> %vuqshrn, i32 16) - %0 = extractelement <1 x i16> %vuqshrn1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32>, i32) - -define i32 @test_vqshrnd_n_u64(i64 %a) { -; CHECK: test_vqshrnd_n_u64 -; CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 -entry: - %vuqshrn = insertelement <1 x i64> undef, i64 %a, i32 0 - %vuqshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64> %vuqshrn, i32 32) - %0 = extractelement <1 x i32> %vuqshrn1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64>, i32) - -define i8 @test_vqrshrnh_n_s16(i16 %a) { -; CHECK: test_vqrshrnh_n_s16 -; CHECK: sqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 -entry: - %vsqrshrn = insertelement <1 x i16> 
undef, i16 %a, i32 0 - %vsqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16> %vsqrshrn, i32 8) - %0 = extractelement <1 x i8> %vsqrshrn1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16>, i32) - -define i16 @test_vqrshrns_n_s32(i32 %a) { -; CHECK: test_vqrshrns_n_s32 -; CHECK: sqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 -entry: - %vsqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32> %vsqrshrn, i32 16) - %0 = extractelement <1 x i16> %vsqrshrn1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32>, i32) - -define i32 @test_vqrshrnd_n_s64(i64 %a) { -; CHECK: test_vqrshrnd_n_s64 -; CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 -entry: - %vsqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64> %vsqrshrn, i32 32) - %0 = extractelement <1 x i32> %vsqrshrn1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64>, i32) - -define i8 @test_vqrshrnh_n_u16(i16 %a) { -; CHECK: test_vqrshrnh_n_u16 -; CHECK: uqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 -entry: - %vuqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0 - %vuqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16> %vuqrshrn, i32 8) - %0 = extractelement <1 x i8> %vuqrshrn1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16>, i32) - -define i16 @test_vqrshrns_n_u32(i32 %a) { -; CHECK: test_vqrshrns_n_u32 -; CHECK: uqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 -entry: - %vuqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0 - %vuqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %vuqrshrn, i32 16) - %0 = extractelement <1 x i16> %vuqrshrn1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32) - -define i32 @test_vqrshrnd_n_u64(i64 %a) { -; CHECK: test_vqrshrnd_n_u64 -; CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 -entry: - %vuqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0 - %vuqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64> %vuqrshrn, i32 32) - %0 = extractelement <1 x i32> %vuqrshrn1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64>, i32) - -define i8 @test_vqshrunh_n_s16(i16 %a) { -; CHECK: test_vqshrunh_n_s16 -; CHECK: sqshrun {{b[0-9]+}}, {{h[0-9]+}}, #8 -entry: - %vsqshrun = insertelement <1 x i16> undef, i16 %a, i32 0 - %vsqshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16> %vsqshrun, i32 8) - %0 = extractelement <1 x i8> %vsqshrun1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16>, i32) - -define i16 @test_vqshruns_n_s32(i32 %a) { -; CHECK: test_vqshruns_n_s32 -; CHECK: sqshrun {{h[0-9]+}}, {{s[0-9]+}}, #16 -entry: - %vsqshrun = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32> %vsqshrun, i32 16) - %0 = extractelement <1 x i16> %vsqshrun1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32>, i32) - -define i32 @test_vqshrund_n_s64(i64 %a) { -; CHECK: test_vqshrund_n_s64 -; CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 -entry: - %vsqshrun = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64> %vsqshrun, i32 32) - %0 = extractelement <1 x i32> %vsqshrun1, i32 0 - ret i32 %0 -} - -declare <1 x i32> 
@llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64>, i32) - -define i8 @test_vqrshrunh_n_s16(i16 %a) { -; CHECK: test_vqrshrunh_n_s16 -; CHECK: sqrshrun {{b[0-9]+}}, {{h[0-9]+}}, #8 -entry: - %vsqrshrun = insertelement <1 x i16> undef, i16 %a, i32 0 - %vsqrshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16> %vsqrshrun, i32 8) - %0 = extractelement <1 x i8> %vsqrshrun1, i32 0 - ret i8 %0 -} - -declare <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16>, i32) - -define i16 @test_vqrshruns_n_s32(i32 %a) { -; CHECK: test_vqrshruns_n_s32 -; CHECK: sqrshrun {{h[0-9]+}}, {{s[0-9]+}}, #16 -entry: - %vsqrshrun = insertelement <1 x i32> undef, i32 %a, i32 0 - %vsqrshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32> %vsqrshrun, i32 16) - %0 = extractelement <1 x i16> %vsqrshrun1, i32 0 - ret i16 %0 -} - -declare <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32>, i32) - -define i32 @test_vqrshrund_n_s64(i64 %a) { -; CHECK: test_vqrshrund_n_s64 -; CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 -entry: - %vsqrshrun = insertelement <1 x i64> undef, i64 %a, i32 0 - %vsqrshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64> %vsqrshrun, i32 32) - %0 = extractelement <1 x i32> %vsqrshrun1, i32 0 - ret i32 %0 -} - -declare <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64>, i32) diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll deleted file mode 100644 index cf3fc0c486a5..000000000000 --- a/test/CodeGen/AArch64/neon-scalar-shift.ll +++ /dev/null @@ -1,237 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; Duplicates existing arm64 tests in vshift.ll and vcmp.ll - -declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_ushl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sshl_v1i64: - %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -declare <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64>, <1 x i64>) -declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>) - -define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_ushl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { -; CHECK: test_sshl_v1i64_aarch64: - %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs) -; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} - ret <1 x i64> %tmp1 -} - -define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: test_vtst_s64 -; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -entry: - %0 = and <1 x i64> %a, %b - %1 = icmp ne <1 x i64> %0, zeroinitializer - %vtst.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vtst.i -} - -define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: test_vtst_u64 -; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -entry: - %0 = and <1 x i64> 
%a, %b - %1 = icmp ne <1 x i64> %0, zeroinitializer - %vtst.i = sext <1 x i1> %1 to <1 x i64> - ret <1 x i64> %vtst.i -} - -define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: test_vsli_n_p64 -; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0 -entry: - %vsli_n2 = tail call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %a, <1 x i64> %b, i32 0) - ret <1 x i64> %vsli_n2 -} - -declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) - -define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsliq_n_p64 -; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 -entry: - %vsli_n2 = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 0) - ret <2 x i64> %vsli_n2 -} - -declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) - -define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) { -; CHECK-LABEL: test_vrsqrte_u32 -; CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s -entry: - %vrsqrte1.i = tail call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a) - ret <2 x i32> %vrsqrte1.i -} - -define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) { -; CHECK-LABEL: test_vrsqrteq_u32 -; CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s -entry: - %vrsqrte1.i = tail call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a) - ret <4 x i32> %vrsqrte1.i -} - -define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) { -; CHECK-LABEL: test_vqshl_n_s8 -; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0 -entry: - %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) - ret <8 x i8> %vqshl_n -} - -declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) - -define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) { -; CHECK-LABEL: test_vqshlq_n_s8 -; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0 -entry: - %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) - ret <16 x i8> %vqshl_n -} - -declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) - -define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) { -; CHECK-LABEL: test_vqshl_n_s16 -; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0 -entry: - %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer) - ret <4 x i16> %vqshl_n1 -} - -declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) - -define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) { -; CHECK-LABEL: test_vqshlq_n_s16 -; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0 -entry: - %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer) - ret <8 x i16> %vqshl_n1 -} - -declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) - -define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) { -; CHECK-LABEL: test_vqshl_n_s32 -; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 -entry: - %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer) - ret <2 x i32> %vqshl_n1 -} - -declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) - -define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) { -; CHECK-LABEL: test_vqshlq_n_s32 -; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 -entry: - %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer) - ret <4 x i32> %vqshl_n1 -} - -declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) - -define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) { -; CHECK-LABEL: test_vqshlq_n_s64 -; CHECK: 
sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0 -entry: - %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer) - ret <2 x i64> %vqshl_n1 -} - -declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) - -define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) { -; CHECK-LABEL: test_vqshl_n_u8 -; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0 -entry: - %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) - ret <8 x i8> %vqshl_n -} - -declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) - -define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) { -; CHECK-LABEL: test_vqshlq_n_u8 -; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0 -entry: - %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) - ret <16 x i8> %vqshl_n -} - -declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) - -define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) { -; CHECK-LABEL: test_vqshl_n_u16 -; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0 -entry: - %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer) - ret <4 x i16> %vqshl_n1 -} - -declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) - -define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) { -; CHECK-LABEL: test_vqshlq_n_u16 -; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0 -entry: - %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer) - ret <8 x i16> %vqshl_n1 -} - -declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) - -define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) { -; CHECK-LABEL: test_vqshl_n_u32 -; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0 -entry: - %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer) - ret <2 x i32> %vqshl_n1 -} - -declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) - -define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) { -; CHECK-LABEL: test_vqshlq_n_u32 -; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0 -entry: - %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer) - ret <4 x i32> %vqshl_n1 -} - -declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) - -define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) { -; CHECK-LABEL: test_vqshlq_n_u64 -; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, -entry: - %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer) - ret <2 x i64> %vqshl_n1 -} - -declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) - -declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) - -declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/neon-select_cc.ll deleted file mode 100644 index 57a819671b60..000000000000 --- a/test/CodeGen/AArch64/neon-select_cc.ll +++ /dev/null @@ -1,202 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; arm64 has separate copy of this test due to different codegen. 
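; [Sketch, not from the patch] The file being deleted here exercises a scalar compare feeding a vector select. Its CHECK lines encode the old AArch64 backend's csetm/dup/bsl lowering; the retained arm64 copy of the test expects different code, which is why this version is dropped rather than merged. A minimal standalone form of the pattern (the function name select_cc_sketch is made up for illustration):
define <2 x i32> @select_cc_sketch(i32 %a, i32 %b, <2 x i32> %c, <2 x i32> %d) {
  ; Compare two scalars; the single i1 result selects between whole vectors.
  %cmp = icmp eq i32 %a, %b
  %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> %d
  ret <2 x i32> %sel
}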
-define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) { -; CHECK-LABEL: test_select_cc_v8i8_i8: -; CHECK: and w0, w0, #0xff -; CHECK-NEXT: cmp w0, w1, uxtb -; CHECK-NEXT: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.8b, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b - %cmp31 = icmp eq i8 %a, %b - %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d - ret <8x i8> %e -} - -define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) { -; CHECK-LABEL: test_select_cc_v8i8_f32: -; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b - %cmp31 = fcmp oeq float %a, %b - %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d - ret <8x i8> %e -} - -define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) { -; CHECK-LABEL: test_select_cc_v8i8_f64: -; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b - %cmp31 = fcmp oeq double %a, %b - %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d - ret <8x i8> %e -} - -define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) { -; CHECK-LABEL: test_select_cc_v16i8_i8: -; CHECK: and w0, w0, #0xff -; CHECK-NEXT: cmp w0, w1, uxtb -; CHECK-NEXT: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.16b, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b - %cmp31 = icmp eq i8 %a, %b - %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d - ret <16x i8> %e -} - -define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) { -; CHECK-LABEL: test_select_cc_v16i8_f32: -; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0] -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b - %cmp31 = fcmp oeq float %a, %b - %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d - ret <16x i8> %e -} - -define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) { -; CHECK-LABEL: test_select_cc_v16i8_f64: -; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d -; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0] -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b - %cmp31 = fcmp oeq double %a, %b - %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d - ret <16x i8> %e -} - -define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) { -; CHECK-LABEL: test_select_cc_v4i16: -; CHECK: and w0, w0, #0xffff -; CHECK-NEXT: cmp w0, w1, uxth -; CHECK-NEXT: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.4h, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b - %cmp31 = icmp eq i16 %a, %b - %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d - ret <4x i16> %e -} - -define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) { -; CHECK-LABEL: test_select_cc_v8i16: -; CHECK: and w0, w0, #0xffff -; CHECK-NEXT: cmp w0, w1, uxth -; CHECK-NEXT: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.8h, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b - %cmp31 = icmp eq i16 %a, %b - %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d - ret <8x i16> %e -} - -define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) { -; CHECK-LABEL: test_select_cc_v2i32: -; CHECK: cmp w0, w1, uxtw -; CHECK-NEXT: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.2s, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b - %cmp31 = icmp eq i32 %a, %b - %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d - ret <2x i32> %e -} - -define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) { -; CHECK-LABEL: 
test_select_cc_v4i32: -; CHECK: cmp w0, w1, uxtw -; CHECK-NEXT: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b - %cmp31 = icmp eq i32 %a, %b - %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d - ret <4x i32> %e -} - -define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) { -; CHECK-LABEL: test_select_cc_v1i64: -; CHECK: cmp x0, x1 -; CHECK-NEXT: csetm x0, eq -; CHECK-NEXT: fmov d{{[0-9]+}}, x0 -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b - %cmp31 = icmp eq i64 %a, %b - %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d - ret <1x i64> %e -} - -define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) { -; CHECK-LABEL: test_select_cc_v2i64: -; CHECK: cmp x0, x1 -; CHECK-NEXT: csetm x0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.2d, x0 -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b - %cmp31 = icmp eq i64 %a, %b - %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d - ret <2x i64> %e -} - -define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) { -; CHECK-LABEL: test_select_cc_v1f32: -; CHECK: fcmp s0, s1 -; CHECK-NEXT: fcsel s0, s2, s3, eq - %cmp31 = fcmp oeq float %a, %b - %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d - ret <1 x float> %e -} - -define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) { -; CHECK-LABEL: test_select_cc_v2f32: -; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0] -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b - %cmp31 = fcmp oeq float %a, %b - %e = select i1 %cmp31, <2 x float> %c, <2 x float> %d - ret <2 x float> %e -} - -define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) { -; CHECK-LABEL: test_select_cc_v4f32: -; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0] -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b - %cmp31 = fcmp oeq float %a, %b - %e = select i1 %cmp31, <4x float> %c, <4x float> %d - ret <4x float> %e -} - -define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x float> %d ) { -; CHECK-LABEL: test_select_cc_v4f32_icmp: -; CHECK: cmp w0, w1, uxtw -; CHECK: csetm w0, eq -; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0 -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b - %cmp31 = icmp eq i32 %a, %b - %e = select i1 %cmp31, <4x float> %c, <4x float> %d - ret <4x float> %e -} - -define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) { -; CHECK-LABEL: test_select_cc_v1f64: -; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b - %cmp31 = fcmp oeq double %a, %b - %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d - ret <1 x double> %e -} - -define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, <1 x double> %d ) { -; CHECK-LABEL: test_select_cc_v1f64_icmp: -; CHECK: cmp x0, x1 -; CHECK-NEXT: csetm x0, eq -; CHECK-NEXT: fmov d{{[0-9]+}}, x0 -; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b - %cmp31 = icmp eq i64 %a, %b - %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d - ret <1 x double> %e -} - -define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) { -; CHECK-LABEL: test_select_cc_v2f64: -; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d -; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0] -; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b - %cmp31 = fcmp oeq double %a, %b - %e = select i1 
%cmp31, <2 x double> %c, <2 x double> %d - ret <2 x double> %e -} diff --git a/test/CodeGen/AArch64/neon-shift-left-long.ll b/test/CodeGen/AArch64/neon-shift-left-long.ll index d16b131559bd..d10d551805a6 100644 --- a/test/CodeGen/AArch64/neon-shift-left-long.ll +++ b/test/CodeGen/AArch64/neon-shift-left-long.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i16> @test_sshll_v8i8(<8 x i8> %a) { ; CHECK: test_sshll_v8i8: diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll deleted file mode 100644 index 088200d972c5..000000000000 --- a/test/CodeGen/AArch64/neon-shift.ll +++ /dev/null @@ -1,172 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 already has these tests: pure intrinsics & trivial shifts. - -declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) -declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) - -define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_uqshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: ushl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { -; CHECK: test_sqshl_v8i8: - %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) -; CHECK: sshl v0.8b, v0.8b, v1.8b - ret <8 x i8> %tmp1 -} - -declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) - -define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_ushl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: ushl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { -; CHECK: test_sshl_v16i8: - %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) -; CHECK: sshl v0.16b, v0.16b, v1.16b - ret <16 x i8> %tmp1 -} - -declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) -declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) - -define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_ushl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: ushl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { -; CHECK: test_sshl_v4i16: - %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) -; CHECK: sshl v0.4h, v0.4h, v1.4h - ret <4 x i16> %tmp1 -} - -declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) - -define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_ushl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: ushl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { -; CHECK: test_sshl_v8i16: - %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) -; CHECK: sshl v0.8h, v0.8h, v1.8h - ret <8 x i16> %tmp1 -} - -declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x 
i32>) -declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) - -define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_ushl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: ushl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { -; CHECK: test_sshl_v2i32: - %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) -; CHECK: sshl v0.2s, v0.2s, v1.2s - ret <2 x i32> %tmp1 -} - -declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) - -define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_ushl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: ushl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK: test_sshl_v4i32: - %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) -; CHECK: sshl v0.4s, v0.4s, v1.4s - ret <4 x i32> %tmp1 -} - -declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) - -define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_ushl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: ushl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - -define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK: test_sshl_v2i64: - %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) -; CHECK: sshl v0.2d, v0.2d, v1.2d - ret <2 x i64> %tmp1 -} - - -define <8 x i8> @test_shl_v8i8(<8 x i8> %a) { -; CHECK: test_shl_v8i8: -; CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %tmp = shl <8 x i8> %a, - ret <8 x i8> %tmp -} - -define <4 x i16> @test_shl_v4i16(<4 x i16> %a) { -; CHECK: test_shl_v4i16: -; CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %tmp = shl <4 x i16> %a, - ret <4 x i16> %tmp -} - -define <2 x i32> @test_shl_v2i32(<2 x i32> %a) { -; CHECK: test_shl_v2i32: -; CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %tmp = shl <2 x i32> %a, - ret <2 x i32> %tmp -} - -define <16 x i8> @test_shl_v16i8(<16 x i8> %a) { -; CHECK: test_shl_v16i8: -; CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %tmp = shl <16 x i8> %a, - ret <16 x i8> %tmp -} - -define <8 x i16> @test_shl_v8i16(<8 x i16> %a) { -; CHECK: test_shl_v8i16: -; CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %tmp = shl <8 x i16> %a, - ret <8 x i16> %tmp -} - -define <4 x i32> @test_shl_v4i32(<4 x i32> %a) { -; CHECK: test_shl_v4i32: -; CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %tmp = shl <4 x i32> %a, - ret <4 x i32> %tmp -} - -define <2 x i64> @test_shl_v2i64(<2 x i64> %a) { -; CHECK: test_shl_v2i64: -; CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63 - %tmp = shl <2 x i64> %a, - ret <2 x i64> %tmp -} - diff --git a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll deleted file mode 100644 index 628a6760c9eb..000000000000 --- a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll +++ /dev/null @@ -1,334 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has all tests not involving v1iN. 
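The checks in this deleted file document how general vector right shifts come out on AArch64 NEON: the ISA has only immediate-form right shifts (ushr/sshr), so a right shift by a vector of amounts is matched as a negation of the amounts followed by ushl or sshl. A condensed sketch of one such case, taken from the deleted checks and using the same RUN line convention as the removed file:

; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s

define <4 x i32> @lshr.v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: lshr.v4i32:
; The shift amounts are negated, then a left shift by a (possibly negative)
; register amount performs the logical right shift.
; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
  %c = lshr <4 x i32> %a, %b
  ret <4 x i32> %c
}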
- -define <8 x i8> @shl.v8i8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: shl.v8i8: -; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %c = shl <8 x i8> %a, %b - ret <8 x i8> %c -} - -define <4 x i16> @shl.v4i16(<4 x i16> %a, <4 x i16> %b) { -; CHECK-LABEL: shl.v4i16: -; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %c = shl <4 x i16> %a, %b - ret <4 x i16> %c -} - -define <2 x i32> @shl.v2i32(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: shl.v2i32: -; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %c = shl <2 x i32> %a, %b - ret <2 x i32> %c -} - -define <1 x i64> @shl.v1i64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: shl.v1i64: -; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %c = shl <1 x i64> %a, %b - ret <1 x i64> %c -} - -define <16 x i8> @shl.v16i8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: shl.v16i8: -; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %c = shl <16 x i8> %a, %b - ret <16 x i8> %c -} - -define <8 x i16> @shl.v8i16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: shl.v8i16: -; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %c = shl <8 x i16> %a, %b - ret <8 x i16> %c -} - -define <4 x i32> @shl.v4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: shl.v4i32: -; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %c = shl <4 x i32> %a, %b - ret <4 x i32> %c -} - -define <2 x i64> @shl.v2i64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: shl.v2i64: -; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %c = shl <2 x i64> %a, %b - ret <2 x i64> %c -} - -define <8 x i8> @lshr.v8i8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: lshr.v8i8: -; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b -; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %c = lshr <8 x i8> %a, %b - ret <8 x i8> %c -} - -define <4 x i16> @lshr.v4i16(<4 x i16> %a, <4 x i16> %b) { -; CHECK-LABEL: lshr.v4i16: -; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h -; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %c = lshr <4 x i16> %a, %b - ret <4 x i16> %c -} - -define <2 x i32> @lshr.v2i32(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: lshr.v2i32: -; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s -; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %c = lshr <2 x i32> %a, %b - ret <2 x i32> %c -} - -define <1 x i64> @lshr.v1i64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: lshr.v1i64: -; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %c = lshr <1 x i64> %a, %b - ret <1 x i64> %c -} - -define <16 x i8> @lshr.v16i8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: lshr.v16i8: -; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %c = lshr <16 x i8> %a, %b - ret <16 x i8> %c -} - -define <8 x i16> @lshr.v8i16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: lshr.v8i16: -; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h -; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %c = lshr <8 x i16> %a, %b - ret <8 x i16> %c -} - -define <4 x i32> @lshr.v4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: lshr.v4i32: -; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s -; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %c = lshr <4 x i32> %a, %b - ret <4 x i32> %c -} - -define <2 x i64> @lshr.v2i64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: lshr.v2i64: -; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d -; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %c = lshr <2 x i64> %a, %b - ret <2 x i64> %c -} 
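Two further behaviours the tail of this deleted file pins down: a shift by a constant selects the dedicated immediate encodings (shl/sshr/ushr #imm) rather than the register forms, and a shift by zero is expected to fold away rather than emit a #0 shift. A condensed sketch of both, drawn from the deleted checks (function names as in the removed file):

define <1 x i32> @ashr.v1i32.imm(<1 x i32> %a) {
; CHECK-LABEL: ashr.v1i32.imm:
; Constant shift amounts use the immediate-form instruction.
; CHECK: sshr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
  %c = ashr <1 x i32> %a, <i32 31>
  ret <1 x i32> %c
}

define <2 x i32> @shl.v2i32.0(<2 x i32> %a) {
; CHECK-LABEL: shl.v2i32.0:
; A shift by zero should be folded away, not emitted as a #0 shift.
; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
  %c = shl <2 x i32> %a, zeroinitializer
  ret <2 x i32> %c
}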
- -define <8 x i8> @ashr.v8i8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: ashr.v8i8: -; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b -; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %c = ashr <8 x i8> %a, %b - ret <8 x i8> %c -} - -define <4 x i16> @ashr.v4i16(<4 x i16> %a, <4 x i16> %b) { -; CHECK-LABEL: ashr.v4i16: -; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h -; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %c = ashr <4 x i16> %a, %b - ret <4 x i16> %c -} - -define <2 x i32> @ashr.v2i32(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: ashr.v2i32: -; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s -; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %c = ashr <2 x i32> %a, %b - ret <2 x i32> %c -} - -define <1 x i64> @ashr.v1i64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-LABEL: ashr.v1i64: -; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %c = ashr <1 x i64> %a, %b - ret <1 x i64> %c -} - -define <16 x i8> @ashr.v16i8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: ashr.v16i8: -; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b -; CHECK: sshl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b - %c = ashr <16 x i8> %a, %b - ret <16 x i8> %c -} - -define <8 x i16> @ashr.v8i16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: ashr.v8i16: -; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h -; CHECK: sshl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h - %c = ashr <8 x i16> %a, %b - ret <8 x i16> %c -} - -define <4 x i32> @ashr.v4i32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: ashr.v4i32: -; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s -; CHECK: sshl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s - %c = ashr <4 x i32> %a, %b - ret <4 x i32> %c -} - -define <2 x i64> @ashr.v2i64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: ashr.v2i64: -; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d -; CHECK: sshl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d - %c = ashr <2 x i64> %a, %b - ret <2 x i64> %c -} - -define <1 x i64> @shl.v1i64.0(<1 x i64> %a) { -; CHECK-LABEL: shl.v1i64.0: -; CHECK-NOT: shl d{{[0-9]+}}, d{{[0-9]+}}, #0 - %c = shl <1 x i64> %a, zeroinitializer - ret <1 x i64> %c -} - -define <2 x i32> @shl.v2i32.0(<2 x i32> %a) { -; CHECK-LABEL: shl.v2i32.0: -; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0 - %c = shl <2 x i32> %a, zeroinitializer - ret <2 x i32> %c -} - -; The following test cases test shl/ashr/lshr with v1i8/v1i16/v1i32 types - -define <1 x i8> @shl.v1i8(<1 x i8> %a, <1 x i8> %b) { -; CHECK-LABEL: shl.v1i8: -; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %c = shl <1 x i8> %a, %b - ret <1 x i8> %c -} - -define <1 x i16> @shl.v1i16(<1 x i16> %a, <1 x i16> %b) { -; CHECK-LABEL: shl.v1i16: -; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %c = shl <1 x i16> %a, %b - ret <1 x i16> %c -} - -define <1 x i32> @shl.v1i32(<1 x i32> %a, <1 x i32> %b) { -; CHECK-LABEL: shl.v1i32: -; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %c = shl <1 x i32> %a, %b - ret <1 x i32> %c -} - -define <1 x i8> @ashr.v1i8(<1 x i8> %a, <1 x i8> %b) { -; CHECK-LABEL: ashr.v1i8: -; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b -; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %c = ashr <1 x i8> %a, %b - ret <1 x i8> %c -} - -define <1 x i16> @ashr.v1i16(<1 x i16> %a, <1 x i16> %b) { -; CHECK-LABEL: ashr.v1i16: -; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h -; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %c = ashr <1 x i16> %a, %b - ret <1 x i16> %c -} - -define <1 x i32> @ashr.v1i32(<1 x i32> %a, 
<1 x i32> %b) { -; CHECK-LABEL: ashr.v1i32: -; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s -; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %c = ashr <1 x i32> %a, %b - ret <1 x i32> %c -} - -define <1 x i8> @lshr.v1i8(<1 x i8> %a, <1 x i8> %b) { -; CHECK-LABEL: lshr.v1i8: -; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b -; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %c = lshr <1 x i8> %a, %b - ret <1 x i8> %c -} - -define <1 x i16> @lshr.v1i16(<1 x i16> %a, <1 x i16> %b) { -; CHECK-LABEL: lshr.v1i16: -; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h -; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h - %c = lshr <1 x i16> %a, %b - ret <1 x i16> %c -} - -define <1 x i32> @lshr.v1i32(<1 x i32> %a, <1 x i32> %b) { -; CHECK-LABEL: lshr.v1i32: -; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s -; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s - %c = lshr <1 x i32> %a, %b - ret <1 x i32> %c -} - -define <1 x i8> @shl.v1i8.imm(<1 x i8> %a) { -; CHECK-LABEL: shl.v1i8.imm: -; CHECK: shl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3 - %c = shl <1 x i8> %a, - ret <1 x i8> %c -} - -define <1 x i16> @shl.v1i16.imm(<1 x i16> %a) { -; CHECK-LABEL: shl.v1i16.imm: -; CHECK: shl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #5 - %c = shl <1 x i16> %a, - ret <1 x i16> %c -} - -define <1 x i32> @shl.v1i32.imm(<1 x i32> %a) { -; CHECK-LABEL: shl.v1i32.imm: -; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0 - %c = shl <1 x i32> %a, zeroinitializer - ret <1 x i32> %c -} - -define <1 x i8> @ashr.v1i8.imm(<1 x i8> %a) { -; CHECK-LABEL: ashr.v1i8.imm: -; CHECK: sshr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3 - %c = ashr <1 x i8> %a, - ret <1 x i8> %c -} - -define <1 x i16> @ashr.v1i16.imm(<1 x i16> %a) { -; CHECK-LABEL: ashr.v1i16.imm: -; CHECK: sshr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10 - %c = ashr <1 x i16> %a, - ret <1 x i16> %c -} - -define <1 x i32> @ashr.v1i32.imm(<1 x i32> %a) { -; CHECK-LABEL: ashr.v1i32.imm: -; CHECK: sshr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31 - %c = ashr <1 x i32> %a, - ret <1 x i32> %c -} - -define <1 x i8> @lshr.v1i8.imm(<1 x i8> %a) { -; CHECK-LABEL: lshr.v1i8.imm: -; CHECK: ushr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3 - %c = lshr <1 x i8> %a, - ret <1 x i8> %c -} - -define <1 x i16> @lshr.v1i16.imm(<1 x i16> %a) { -; CHECK-LABEL: lshr.v1i16.imm: -; CHECK: ushr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10 - %c = lshr <1 x i16> %a, - ret <1 x i16> %c -} - -define <1 x i32> @lshr.v1i32.imm(<1 x i32> %a) { -; CHECK-LABEL: lshr.v1i32.imm: -; CHECK: ushr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31 - %c = lshr <1 x i32> %a, - ret <1 x i32> %c -} diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll deleted file mode 100644 index a3b160413f5e..000000000000 --- a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll +++ /dev/null @@ -1,2317 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s - -; arm64 already has these. 
Essentially just a copy/paste from Clang output from -; arm_neon.h - -define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) { -; CHECK-LABEL: test_ldst1_v16i8: -; CHECK: ld1 { v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}] - %tmp = load <16 x i8>* %ptr - store <16 x i8> %tmp, <16 x i8>* %ptr2 - ret void -} - -define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) { -; CHECK-LABEL: test_ldst1_v8i16: -; CHECK: ld1 { v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}] - %tmp = load <8 x i16>* %ptr - store <8 x i16> %tmp, <8 x i16>* %ptr2 - ret void -} - -define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) { -; CHECK-LABEL: test_ldst1_v4i32: -; CHECK: ld1 { v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %tmp = load <4 x i32>* %ptr - store <4 x i32> %tmp, <4 x i32>* %ptr2 - ret void -} - -define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) { -; CHECK-LABEL: test_ldst1_v2i64: -; CHECK: ld1 { v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %tmp = load <2 x i64>* %ptr - store <2 x i64> %tmp, <2 x i64>* %ptr2 - ret void -} - -define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) { -; CHECK-LABEL: test_ldst1_v8i8: -; CHECK: ld1 { v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] - %tmp = load <8 x i8>* %ptr - store <8 x i8> %tmp, <8 x i8>* %ptr2 - ret void -} - -define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) { -; CHECK-LABEL: test_ldst1_v4i16: -; CHECK: ld1 { v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] - %tmp = load <4 x i16>* %ptr - store <4 x i16> %tmp, <4 x i16>* %ptr2 - ret void -} - -define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) { -; CHECK-LABEL: test_ldst1_v2i32: -; CHECK: ld1 { v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %tmp = load <2 x i32>* %ptr - store <2 x i32> %tmp, <2 x i32>* %ptr2 - ret void -} - -define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) { -; CHECK-LABEL: test_ldst1_v1i64: -; CHECK: ld1 { v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] -; CHECK: st1 { v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %tmp = load <1 x i64>* %ptr - store <1 x i64> %tmp, <1 x i64>* %ptr2 - ret void -} - -%struct.int8x16x2_t = type { [2 x <16 x i8>] } -%struct.int16x8x2_t = type { [2 x <8 x i16>] } -%struct.int32x4x2_t = type { [2 x <4 x i32>] } -%struct.int64x2x2_t = type { [2 x <2 x i64>] } -%struct.float32x4x2_t = type { [2 x <4 x float>] } -%struct.float64x2x2_t = type { [2 x <2 x double>] } -%struct.int8x8x2_t = type { [2 x <8 x i8>] } -%struct.int16x4x2_t = type { [2 x <4 x i16>] } -%struct.int32x2x2_t = type { [2 x <2 x i32>] } -%struct.int64x1x2_t = type { [2 x <1 x i64>] } -%struct.float32x2x2_t = type { [2 x <2 x float>] } -%struct.float64x1x2_t = type { [2 x <1 x double>] } -%struct.int8x16x3_t = type { [3 x <16 x i8>] } -%struct.int16x8x3_t = type { [3 x <8 x i16>] } -%struct.int32x4x3_t = type { [3 x <4 x i32>] } -%struct.int64x2x3_t = type { [3 x <2 x i64>] } -%struct.float32x4x3_t = type { [3 x <4 x float>] } -%struct.float64x2x3_t = type { [3 x <2 x double>] } -%struct.int8x8x3_t = type { [3 x <8 x i8>] } -%struct.int16x4x3_t = type { [3 x <4 x i16>] } -%struct.int32x2x3_t = type { [3 x <2 x i32>] } -%struct.int64x1x3_t = type { [3 x <1 x i64>] } -%struct.float32x2x3_t = type { [3 x <2 x float>] } -%struct.float64x1x3_t = type { 
[3 x <1 x double>] } -%struct.int8x16x4_t = type { [4 x <16 x i8>] } -%struct.int16x8x4_t = type { [4 x <8 x i16>] } -%struct.int32x4x4_t = type { [4 x <4 x i32>] } -%struct.int64x2x4_t = type { [4 x <2 x i64>] } -%struct.float32x4x4_t = type { [4 x <4 x float>] } -%struct.float64x2x4_t = type { [4 x <2 x double>] } -%struct.int8x8x4_t = type { [4 x <8 x i8>] } -%struct.int16x4x4_t = type { [4 x <4 x i16>] } -%struct.int32x2x4_t = type { [4 x <2 x i32>] } -%struct.int64x1x4_t = type { [4 x <1 x i64>] } -%struct.float32x2x4_t = type { [4 x <2 x float>] } -%struct.float64x1x4_t = type { [4 x <1 x double>] } - - -define <16 x i8> @test_vld1q_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld1q_s8 -; CHECK: ld1 { v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}] - %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1) - ret <16 x i8> %vld1 -} - -define <8 x i16> @test_vld1q_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld1q_s16 -; CHECK: ld1 { v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2) - ret <8 x i16> %vld1 -} - -define <4 x i32> @test_vld1q_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld1q_s32 -; CHECK: ld1 { v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4) - ret <4 x i32> %vld1 -} - -define <2 x i64> @test_vld1q_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld1q_s64 -; CHECK: ld1 { v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8) - ret <2 x i64> %vld1 -} - -define <4 x float> @test_vld1q_f32(float* readonly %a) { -; CHECK-LABEL: test_vld1q_f32 -; CHECK: ld1 { v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4) - ret <4 x float> %vld1 -} - -define <2 x double> @test_vld1q_f64(double* readonly %a) { -; CHECK-LABEL: test_vld1q_f64 -; CHECK: ld1 { v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8) - ret <2 x double> %vld1 -} - -define <8 x i8> @test_vld1_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld1_s8 -; CHECK: ld1 { v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] - %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1) - ret <8 x i8> %vld1 -} - -define <4 x i16> @test_vld1_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld1_s16 -; CHECK: ld1 { v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2) - ret <4 x i16> %vld1 -} - -define <2 x i32> @test_vld1_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld1_s32 -; CHECK: ld1 { v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4) - ret <2 x i32> %vld1 -} - -define <1 x i64> @test_vld1_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld1_s64 -; CHECK: ld1 { v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8) - ret <1 x i64> %vld1 -} - -define <2 x float> @test_vld1_f32(float* readonly %a) { -; CHECK-LABEL: test_vld1_f32 -; CHECK: ld1 { v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4) - ret <2 x float> %vld1 -} - -define <1 x double> @test_vld1_f64(double* readonly %a) { -; CHECK-LABEL: 
test_vld1_f64 -; CHECK: ld1 { v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8) - ret <1 x double> %vld1 -} - -define <8 x i8> @test_vld1_p8(i8* readonly %a) { -; CHECK-LABEL: test_vld1_p8 -; CHECK: ld1 { v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] - %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1) - ret <8 x i8> %vld1 -} - -define <4 x i16> @test_vld1_p16(i16* readonly %a) { -; CHECK-LABEL: test_vld1_p16 -; CHECK: ld1 { v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2) - ret <4 x i16> %vld1 -} - -define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld2q_s8 -; CHECK: ld2 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}] - %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1) - %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1 - ret %struct.int8x16x2_t %.fca.0.1.insert -} - -define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld2q_s16 -; CHECK: ld2 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2) - %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1 - ret %struct.int16x8x2_t %.fca.0.1.insert -} - -define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld2q_s32 -; CHECK: ld2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4) - %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1 - ret %struct.int32x4x2_t %.fca.0.1.insert -} - -define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld2q_s64 -; CHECK: ld2 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8) - %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1 - ret %struct.int64x2x2_t %.fca.0.1.insert -} - -define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) { -; CHECK-LABEL: test_vld2q_f32 -; CHECK: ld2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld2 = tail 
call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) - %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1 - ret %struct.float32x4x2_t %.fca.0.1.insert -} - -define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) { -; CHECK-LABEL: test_vld2q_f64 -; CHECK: ld2 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8) - %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1 - ret %struct.float64x2x2_t %.fca.0.1.insert -} - -define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld2_s8 -; CHECK: ld2 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] - %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1) - %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1 - ret %struct.int8x8x2_t %.fca.0.1.insert -} - -define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld2_s16 -; CHECK: ld2 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2) - %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1 - ret %struct.int16x4x2_t %.fca.0.1.insert -} - -define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld2_s32 -; CHECK: ld2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4) - %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1 - ret %struct.int32x2x2_t %.fca.0.1.insert -} - -define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld2_s64 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8) - %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <1 x i64>, 
<1 x i64> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1 - ret %struct.int64x1x2_t %.fca.0.1.insert -} - -define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) { -; CHECK-LABEL: test_vld2_f32 -; CHECK: ld2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4) - %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1 - ret %struct.float32x2x2_t %.fca.0.1.insert -} - -define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) { -; CHECK-LABEL: test_vld2_f64 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8) - %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1 - %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1 - ret %struct.float64x1x2_t %.fca.0.1.insert -} - -define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld3q_s8 -; CHECK: ld3 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}] - %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1) - %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2 - ret %struct.int8x16x3_t %.fca.0.2.insert -} - -define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld3q_s16 -; CHECK: ld3 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2) - %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2 - ret %struct.int16x8x3_t %.fca.0.2.insert -} - -define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) { -; CHECK-LABEL: 
test_vld3q_s32 -; CHECK: ld3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4) - %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2 - ret %struct.int32x4x3_t %.fca.0.2.insert -} - -define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld3q_s64 -; CHECK: ld3 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8) - %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2 - ret %struct.int64x2x3_t %.fca.0.2.insert -} - -define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) { -; CHECK-LABEL: test_vld3q_f32 -; CHECK: ld3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4) - %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2 - ret %struct.float32x4x3_t %.fca.0.2.insert -} - -define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) { -; CHECK-LABEL: test_vld3q_f64 -; CHECK: ld3 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8) - %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x2x3_t 
%.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2 - ret %struct.float64x2x3_t %.fca.0.2.insert -} - -define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld3_s8 -; CHECK: ld3 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] - %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1) - %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2 - ret %struct.int8x8x3_t %.fca.0.2.insert -} - -define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld3_s16 -; CHECK: ld3 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2) - %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2 - ret %struct.int16x4x3_t %.fca.0.2.insert -} - -define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld3_s32 -; CHECK: ld3 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4) - %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2 - ret %struct.int32x2x3_t %.fca.0.2.insert -} - -define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld3_s64 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8) - %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = 
insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2 - ret %struct.int64x1x3_t %.fca.0.2.insert -} - -define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) { -; CHECK-LABEL: test_vld3_f32 -; CHECK: ld3 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4) - %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2 - ret %struct.float32x2x3_t %.fca.0.2.insert -} - -define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) { -; CHECK-LABEL: test_vld3_f64 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8) - %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2 - %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2 - ret %struct.float64x1x3_t %.fca.0.2.insert -} - -define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld4q_s8 -; CHECK: ld4 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}] - %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1) - %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3 - ret %struct.int8x16x4_t %.fca.0.3.insert -} - -define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld4q_s16 -; CHECK: ld4 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2) - %vld4.fca.0.extract = 
extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3 - ret %struct.int16x8x4_t %.fca.0.3.insert -} - -define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld4q_s32 -; CHECK: ld4 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4) - %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3 - ret %struct.int32x4x4_t %.fca.0.3.insert -} - -define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld4q_s64 -; CHECK: ld4 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8) - %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3 - ret %struct.int64x2x4_t %.fca.0.3.insert -} - -define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) { -; CHECK-LABEL: test_vld4q_f32 -; CHECK: ld4 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4) - %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x 
float>, <4 x float>, <4 x float> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3 - ret %struct.float32x4x4_t %.fca.0.3.insert -} - -define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) { -; CHECK-LABEL: test_vld4q_f64 -; CHECK: ld4 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8) - %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3 - ret %struct.float64x2x4_t %.fca.0.3.insert -} - -define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) { -; CHECK-LABEL: test_vld4_s8 -; CHECK: ld4 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}] - %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1) - %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3 - ret %struct.int8x8x4_t %.fca.0.3.insert -} - -define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) { -; CHECK-LABEL: test_vld4_s16 -; CHECK: ld4 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2) - %vld4.fca.0.extract = 
extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3 - ret %struct.int16x4x4_t %.fca.0.3.insert -} - -define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) { -; CHECK-LABEL: test_vld4_s32 -; CHECK: ld4 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4) - %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3 - ret %struct.int32x2x4_t %.fca.0.3.insert -} - -define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) { -; CHECK-LABEL: test_vld4_s64 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8) - %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3 - ret %struct.int64x1x4_t %.fca.0.3.insert -} - -define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) { -; CHECK-LABEL: test_vld4_f32 -; CHECK: ld4 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4) - %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x 
float>, <2 x float>, <2 x float> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3 - ret %struct.float32x2x4_t %.fca.0.3.insert -} - -define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) { -; CHECK-LABEL: test_vld4_f64 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8) - %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3 - %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3 - ret %struct.float64x1x4_t %.fca.0.3.insert -} - -declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) -declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) -declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) -declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) -declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) -declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32) -declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) -declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) -declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) -declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) -declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) -declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) -declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32) -declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32) -declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32) -declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) -declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32) -declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32) -declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32) -declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32) -declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) -declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32) -declare { <1 x 
double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32) -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32) -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32) -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32) -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32) -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32) -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32) -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) - -define void @test_vst1q_s8(i8* %a, <16 x i8> %b) { -; CHECK-LABEL: test_vst1q_s8 -; CHECK: st1 { v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1) - ret void -} - -define void @test_vst1q_s16(i16* %a, <8 x i16> %b) { -; CHECK-LABEL: test_vst1q_s16 -; CHECK: st1 { v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2) - ret void -} - -define void @test_vst1q_s32(i32* %a, <4 x i32> %b) { -; CHECK-LABEL: test_vst1q_s32 -; CHECK: st1 { v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4) - ret void -} - -define void @test_vst1q_s64(i64* %a, <2 x i64> %b) { -; CHECK-LABEL: test_vst1q_s64 -; CHECK: st1 { v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8) - ret void -} - -define void @test_vst1q_f32(float* %a, <4 x float> %b) { -; CHECK-LABEL: test_vst1q_f32 -; CHECK: st1 { v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4) - ret void -} - -define void @test_vst1q_f64(double* %a, <2 x double> %b) { -; CHECK-LABEL: test_vst1q_f64 -; CHECK: st1 { 
v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8) - ret void -} - -define void @test_vst1_s8(i8* %a, <8 x i8> %b) { -; CHECK-LABEL: test_vst1_s8 -; CHECK: st1 { v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1) - ret void -} - -define void @test_vst1_s16(i16* %a, <4 x i16> %b) { -; CHECK-LABEL: test_vst1_s16 -; CHECK: st1 { v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2) - ret void -} - -define void @test_vst1_s32(i32* %a, <2 x i32> %b) { -; CHECK-LABEL: test_vst1_s32 -; CHECK: st1 { v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4) - ret void -} - -define void @test_vst1_s64(i64* %a, <1 x i64> %b) { -; CHECK-LABEL: test_vst1_s64 -; CHECK: st1 { v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8) - ret void -} - -define void @test_vst1_f32(float* %a, <2 x float> %b) { -; CHECK-LABEL: test_vst1_f32 -; CHECK: st1 { v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4) - ret void -} - -define void @test_vst1_f64(double* %a, <1 x double> %b) { -; CHECK-LABEL: test_vst1_f64 -; CHECK: st1 { v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8) - ret void -} - -define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst2q_s8 -; CHECK: st2 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1 - tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1) - ret void -} - -define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst2q_s16 -; CHECK: st2 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2) - ret void -} - -define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst2q_s32 -; CHECK: st2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4) - ret void -} - -define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst2q_s64 -; CHECK: st2 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8) - ret void -} - -define void 
@test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vst2q_f32 -; CHECK: st2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4) - ret void -} - -define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vst2q_f64 -; CHECK: st2 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8) - ret void -} - -define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst2_s8 -; CHECK: st2 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 - tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1) - ret void -} - -define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst2_s16 -; CHECK: st2 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2) - ret void -} - -define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst2_s32 -; CHECK: st2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4) - ret void -} - -define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst2_s64 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8) - ret void -} - -define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vst2_f32 -; CHECK: st2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4) - ret void -} - -define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vst2_f64 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [2 x 
<1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8) - ret void -} - -define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst3q_s8 -; CHECK: st3 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2 - tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1) - ret void -} - -define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst3q_s16 -; CHECK: st3 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2) - ret void -} - -define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst3q_s32 -; CHECK: st3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4) - ret void -} - -define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst3q_s64 -; CHECK: st3 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8) - ret void -} - -define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vst3q_f32 -; CHECK: st3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4) - ret void -} - -define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vst3q_f64 -; CHECK: st3 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 - 
%b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8) - ret void -} - -define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst3_s8 -; CHECK: st3 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 - tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1) - ret void -} - -define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst3_s16 -; CHECK: st3 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2) - ret void -} - -define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst3_s32 -; CHECK: st3 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4) - ret void -} - -define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst3_s64 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8) - ret void -} - -define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vst3_f32 -; CHECK: st3 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4) - ret void -} - -define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vst3_f64 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = 
extractvalue [3 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8) - ret void -} - -define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst4q_s8 -; CHECK: st4 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 - tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1) - ret void -} - -define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst4q_s16 -; CHECK: st4 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2) - ret void -} - -define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst4q_s32 -; CHECK: st4 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4) - ret void -} - -define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst4q_s64 -; CHECK: st4 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8) - ret void -} - -define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vst4q_f32 -; CHECK: st4 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0 - 
%b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4) - ret void -} - -define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vst4q_f64 -; CHECK: st4 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8) - ret void -} - -define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst4_s8 -; CHECK: st4 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 - tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1) - ret void -} - -define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst4_s16 -; CHECK: st4 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 - %1 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2) - ret void -} - -define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst4_s32 -; CHECK: st4 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 - %1 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4) - ret void -} - -define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst4_s64 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - 
%b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 - %1 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8) - ret void -} - -define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vst4_f32 -; CHECK: st4 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 - %1 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4) - ret void -} - -define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vst4_f64 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 - %1 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8) - ret void -} - -declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) -declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) -declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) -declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) -declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) -declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32) -declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) -declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) -declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) -declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) -declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) -declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32) -declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) -declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) -declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) -declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32) -declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) -declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32) -declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) -declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) -declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) -declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) -declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, 
i32) -declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32) -declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) -declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) -declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) -declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32) -declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) -declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32) -declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) -declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) -declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) -declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) -declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) -declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32) -declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) -declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) -declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) -declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32) -declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) -declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) -declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) -declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) -declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) -declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) -declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) -declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32) - -define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) { -; CHECK-LABEL: test_vld1q_s8_x2 -; CHECK: ld1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1) - %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 - %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 - %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0 - %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1 - ret %struct.int8x16x2_t %5 -} - -define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) { -; CHECK-LABEL: test_vld1q_s16_x2 -; CHECK: ld1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2) - %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0 - %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1 - %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0 - %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1 - ret %struct.int16x8x2_t %6 -} - -define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) { -; CHECK-LABEL: test_vld1q_s32_x2 -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %2 = tail call { <4 x i32>, <4 x i32> } 
@llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4) - %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0 - %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1 - %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0 - %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1 - ret %struct.int32x4x2_t %6 -} - -define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) { -; CHECK-LABEL: test_vld1q_s64_x2 -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8) - %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0 - %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1 - %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0 - %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1 - ret %struct.int64x2x2_t %6 -} - -define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) { -; CHECK-LABEL: test_vld1q_f32_x2 -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4) - %3 = extractvalue { <4 x float>, <4 x float> } %2, 0 - %4 = extractvalue { <4 x float>, <4 x float> } %2, 1 - %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0 - %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1 - ret %struct.float32x4x2_t %6 -} - - -define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) { -; CHECK-LABEL: test_vld1q_f64_x2 -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8) - %3 = extractvalue { <2 x double>, <2 x double> } %2, 0 - %4 = extractvalue { <2 x double>, <2 x double> } %2, 1 - %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0 - %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1 - ret %struct.float64x2x2_t %6 -} - -define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) { -; CHECK-LABEL: test_vld1_s8_x2 -; CHECK: ld1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1) - %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0 - %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1 - %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0 - %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1 - ret %struct.int8x8x2_t %5 -} - -define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) { -; CHECK-LABEL: test_vld1_s16_x2 -; CHECK: ld1 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2) - %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0 - %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1 - %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0 - %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1 - ret %struct.int16x4x2_t %6 -} - -define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) { -; CHECK-LABEL: test_vld1_s32_x2 -; CHECK: ld1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4) - %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0 - %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1 - %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0 - %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1 - ret 
%struct.int32x2x2_t %6 -} - -define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) { -; CHECK-LABEL: test_vld1_s64_x2 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8) - %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0 - %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1 - %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0 - %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1 - ret %struct.int64x1x2_t %6 -} - -define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) { -; CHECK-LABEL: test_vld1_f32_x2 -; CHECK: ld1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4) - %3 = extractvalue { <2 x float>, <2 x float> } %2, 0 - %4 = extractvalue { <2 x float>, <2 x float> } %2, 1 - %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0 - %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1 - ret %struct.float32x2x2_t %6 -} - -define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) { -; CHECK-LABEL: test_vld1_f64_x2 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8) - %3 = extractvalue { <1 x double>, <1 x double> } %2, 0 - %4 = extractvalue { <1 x double>, <1 x double> } %2, 1 - %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0 - %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1 - ret %struct.float64x1x2_t %6 -} - -define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) { -; CHECK-LABEL: test_vld1q_s8_x3 -; CHECK: ld1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, -; [{{x[0-9]+|sp}}] - %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1) - %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0 - %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1 - %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2 - %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0 - %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1 - %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2 - ret %struct.int8x16x3_t %7 -} - -define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) { -; CHECK-LABEL: test_vld1q_s16_x3 -; CHECK: ld1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, -; [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2) - %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0 - %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1 - %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2 - %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0 - %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1 - %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2 - ret %struct.int16x8x3_t %8 -} - -define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) { -; CHECK-LABEL: test_vld1q_s32_x3 -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, -; [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4) - %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0 - %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1 - %5 = 
extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 2 - %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0 - %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1 - %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2 - ret %struct.int32x4x3_t %8 -} - -define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) { -; CHECK-LABEL: test_vld1q_s64_x3 -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, -; [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8) - %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0 - %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1 - %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2 - %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0 - %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1 - %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2 - ret %struct.int64x2x3_t %8 -} - -define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) { -; CHECK-LABEL: test_vld1q_f32_x3 -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, -; [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4) - %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0 - %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1 - %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2 - %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0 - %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1 - %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2 - ret %struct.float32x4x3_t %8 -} - - -define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) { -; CHECK-LABEL: test_vld1q_f64_x3 -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, -; [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8) - %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0 - %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1 - %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2 - %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0 - %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1 - %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2 - ret %struct.float64x2x3_t %8 -} - -define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) { -; CHECK-LABEL: test_vld1_s8_x3 -; CHECK: ld1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, -; [{{x[0-9]+|sp}}] - %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1) - %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 - %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 - %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 - %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0 - %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1 - %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2 - ret %struct.int8x8x3_t %7 -} - -define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) { -; CHECK-LABEL: test_vld1_s16_x3 -; CHECK: ld1 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, -; [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2) - %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x 
i16> } %2, 0 - %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 - %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 - %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0 - %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1 - %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2 - ret %struct.int16x4x3_t %8 -} - -define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) { - %1 = bitcast i32* %a to i8* -; CHECK-LABEL: test_vld1_s32_x3 -; CHECK: ld1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, -; [{{x[0-9]+|sp}}] - %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4) - %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 - %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 - %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 - %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0 - %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1 - %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2 - ret %struct.int32x2x3_t %8 -} - -define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) { -; CHECK-LABEL: test_vld1_s64_x3 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, -; [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8) - %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0 - %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1 - %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2 - %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0 - %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1 - %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2 - ret %struct.int64x1x3_t %8 -} - -define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) { -; CHECK-LABEL: test_vld1_f32_x3 -; CHECK: ld1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, -; [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4) - %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0 - %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1 - %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2 - %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0 - %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1 - %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2 - ret %struct.float32x2x3_t %8 -} - - -define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) { -; CHECK-LABEL: test_vld1_f64_x3 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, -; [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8) - %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0 - %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1 - %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2 - %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0 - %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1 - %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2 - ret %struct.float64x1x3_t %8 -} - -define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) { -; CHECK-LABEL: test_vld1q_s8_x4 -; CHECK: ld1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, -; v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - %1 = tail call { <16 x i8>, 
<16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1) - %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0 - %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1 - %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2 - %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3 - %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0 - %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1 - %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2 - %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3 - ret %struct.int8x16x4_t %9 -} - -define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) { -; CHECK-LABEL: test_vld1q_s16_x4 -; CHECK: ld1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, -; v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2) - %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0 - %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1 - %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2 - %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3 - %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0 - %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1 - %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2 - %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3 - ret %struct.int16x8x4_t %10 -} - -define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) { -; CHECK-LABEL: test_vld1q_s32_x4 -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, -; v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4) - %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0 - %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1 - %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2 - %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3 - %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0 - %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1 - %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2 - %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3 - ret %struct.int32x4x4_t %10 -} - -define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) { -; CHECK-LABEL: test_vld1q_s64_x4 -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, -; v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8) - %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0 - %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1 - %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2 - %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3 - %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0 - %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1 - %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2 - %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3 - ret %struct.int64x2x4_t %10 -} - -define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) { -; CHECK-LABEL: test_vld1q_f32_x4 -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, 
v{{[0-9]+}}.4s, -; v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4) - %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 - %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1 - %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2 - %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3 - %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0 - %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1 - %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2 - %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3 - ret %struct.float32x4x4_t %10 -} - -define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) { -; CHECK-LABEL: test_vld1q_f64_x4 -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, -; v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8) - %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0 - %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1 - %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2 - %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3 - %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0 - %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1 - %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2 - %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3 - ret %struct.float64x2x4_t %10 -} - -define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) { -; CHECK-LABEL: test_vld1_s8_x4 -; CHECK: ld1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, -; v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1) - %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 - %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 - %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 - %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3 - %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0 - %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1 - %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2 - %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3 - ret %struct.int8x8x4_t %9 -} - -define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) { -; CHECK-LABEL: test_vld1_s16_x4 -; CHECK: ld1 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, -; v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %1 = bitcast i16* %a to i8* - %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2) - %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 - %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 - %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 - %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3 - %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0 - %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1 - %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2 - %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> 
%6, 0, 3 - ret %struct.int16x4x4_t %10 -} - -define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) { -; CHECK-LABEL: test_vld1_s32_x4 -; CHECK: ld1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, -; v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = bitcast i32* %a to i8* - %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4) - %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 - %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 - %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 - %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3 - %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0 - %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1 - %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2 - %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3 - ret %struct.int32x2x4_t %10 -} - -define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) { -; CHECK-LABEL: test_vld1_s64_x4 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, -; v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = bitcast i64* %a to i8* - %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8) - %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0 - %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1 - %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2 - %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3 - %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0 - %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1 - %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2 - %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3 - ret %struct.int64x1x4_t %10 -} - -define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) { -; CHECK-LABEL: test_vld1_f32_x4 -; CHECK: ld1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, -; v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = bitcast float* %a to i8* - %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4) - %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0 - %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1 - %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2 - %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3 - %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0 - %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1 - %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2 - %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3 - ret %struct.float32x2x4_t %10 -} - - -define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) { -; CHECK-LABEL: test_vld1_f64_x4 -; CHECK: ld1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, -; v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = bitcast double* %a to i8* - %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8) - %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0 - %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1 - %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2 - %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3 - %7 = 
insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 0 - %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1 - %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2 - %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3 - ret %struct.float64x1x4_t %10 -} - -define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b) { -; CHECK-LABEL: test_vst1q_s8_x2 -; CHECK: st1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <16 x i8>] %b, 0 - %2 = extractvalue [2 x <16 x i8>] %b, 1 - tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1) - ret void -} - -define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b) { -; CHECK-LABEL: test_vst1q_s16_x2 -; CHECK: st1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <8 x i16>] %b, 0 - %2 = extractvalue [2 x <8 x i16>] %b, 1 - %3 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2) - ret void -} - -define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b) { -; CHECK-LABEL: test_vst1q_s32_x2 -; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <4 x i32>] %b, 0 - %2 = extractvalue [2 x <4 x i32>] %b, 1 - %3 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %3, <4 x i32> %1, <4 x i32> %2, i32 4) - ret void -} - -define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b) { -; CHECK-LABEL: test_vst1q_s64_x2 -; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <2 x i64>] %b, 0 - %2 = extractvalue [2 x <2 x i64>] %b, 1 - %3 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %3, <2 x i64> %1, <2 x i64> %2, i32 8) - ret void -} - -define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b) { -; CHECK-LABEL: test_vst1q_f32_x2 -; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <4 x float>] %b, 0 - %2 = extractvalue [2 x <4 x float>] %b, 1 - %3 = bitcast float* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %3, <4 x float> %1, <4 x float> %2, i32 4) - ret void -} - - -define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b) { -; CHECK-LABEL: test_vst1q_f64_x2 -; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <2 x double>] %b, 0 - %2 = extractvalue [2 x <2 x double>] %b, 1 - %3 = bitcast double* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %3, <2 x double> %1, <2 x double> %2, i32 8) - ret void -} - -define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b) { -; CHECK-LABEL: test_vst1_s8_x2 -; CHECK: st1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <8 x i8>] %b, 0 - %2 = extractvalue [2 x <8 x i8>] %b, 1 - tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 1) - ret void -} - -define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b) { -; CHECK-LABEL: test_vst1_s16_x2 -; CHECK: st1 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <4 x i16>] %b, 0 - %2 = extractvalue [2 x <4 x i16>] %b, 1 - %3 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %3, <4 x i16> %1, <4 x i16> %2, i32 2) - ret void -} - -define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b) { -; CHECK-LABEL: test_vst1_s32_x2 -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <2 x i32>] %b, 
0 - %2 = extractvalue [2 x <2 x i32>] %b, 1 - %3 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 4) - ret void -} - -define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b) { -; CHECK-LABEL: test_vst1_s64_x2 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <1 x i64>] %b, 0 - %2 = extractvalue [2 x <1 x i64>] %b, 1 - %3 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %3, <1 x i64> %1, <1 x i64> %2, i32 8) - ret void -} - -define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b) { -; CHECK-LABEL: test_vst1_f32_x2 -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <2 x float>] %b, 0 - %2 = extractvalue [2 x <2 x float>] %b, 1 - %3 = bitcast float* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %3, <2 x float> %1, <2 x float> %2, i32 4) - ret void -} - -define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b) { -; CHECK-LABEL: test_vst1_f64_x2 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [2 x <1 x double>] %b, 0 - %2 = extractvalue [2 x <1 x double>] %b, 1 - %3 = bitcast double* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %3, <1 x double> %1, <1 x double> %2, i32 8) - ret void -} - -define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b) { -; CHECK-LABEL: test_vst1q_s8_x3 -; CHECK: st1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <16 x i8>] %b, 0 - %2 = extractvalue [3 x <16 x i8>] %b, 1 - %3 = extractvalue [3 x <16 x i8>] %b, 2 - tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1) - ret void -} - -define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b) { -; CHECK-LABEL: test_vst1q_s16_x3 -; CHECK: st1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <8 x i16>] %b, 0 - %2 = extractvalue [3 x <8 x i16>] %b, 1 - %3 = extractvalue [3 x <8 x i16>] %b, 2 - %4 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2) - ret void -} - -define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b) { -; CHECK-LABEL: test_vst1q_s32_x3 -; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <4 x i32>] %b, 0 - %2 = extractvalue [3 x <4 x i32>] %b, 1 - %3 = extractvalue [3 x <4 x i32>] %b, 2 - %4 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4) - ret void -} - -define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b) { -; CHECK-LABEL: test_vst1q_s64_x3 -; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <2 x i64>] %b, 0 - %2 = extractvalue [3 x <2 x i64>] %b, 1 - %3 = extractvalue [3 x <2 x i64>] %b, 2 - %4 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8) - ret void -} - -define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b) { -; CHECK-LABEL: test_vst1q_f32_x3 -; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <4 x float>] %b, 0 - %2 = extractvalue [3 x <4 x float>] %b, 1 - %3 = extractvalue [3 x <4 x float>] %b, 2 - %4 = bitcast float* %a to i8* - tail call void 
@llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4) - ret void -} - -define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b) { -; CHECK-LABEL: test_vst1q_f64_x3 -; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <2 x double>] %b, 0 - %2 = extractvalue [3 x <2 x double>] %b, 1 - %3 = extractvalue [3 x <2 x double>] %b, 2 - %4 = bitcast double* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8) - ret void -} - -define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b) { -; CHECK-LABEL: test_vst1_s8_x3 -; CHECK: st1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <8 x i8>] %b, 0 - %2 = extractvalue [3 x <8 x i8>] %b, 1 - %3 = extractvalue [3 x <8 x i8>] %b, 2 - tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1) - ret void -} - -define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b) { -; CHECK-LABEL: test_vst1_s16_x3 -; CHECK: st1 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <4 x i16>] %b, 0 - %2 = extractvalue [3 x <4 x i16>] %b, 1 - %3 = extractvalue [3 x <4 x i16>] %b, 2 - %4 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2) - ret void -} - -define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b) { -; CHECK-LABEL: test_vst1_s32_x3 -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <2 x i32>] %b, 0 - %2 = extractvalue [3 x <2 x i32>] %b, 1 - %3 = extractvalue [3 x <2 x i32>] %b, 2 - %4 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4) - ret void -} - -define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b) { -; CHECK-LABEL: test_vst1_s64_x3 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <1 x i64>] %b, 0 - %2 = extractvalue [3 x <1 x i64>] %b, 1 - %3 = extractvalue [3 x <1 x i64>] %b, 2 - %4 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8) - ret void -} - -define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b) { -; CHECK-LABEL: test_vst1_f32_x3 -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <2 x float>] %b, 0 - %2 = extractvalue [3 x <2 x float>] %b, 1 - %3 = extractvalue [3 x <2 x float>] %b, 2 - %4 = bitcast float* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4) - ret void -} - -define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b) { -; CHECK-LABEL: test_vst1_f64_x3 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, -; [{{x[0-9]+|sp}}] - %1 = extractvalue [3 x <1 x double>] %b, 0 - %2 = extractvalue [3 x <1 x double>] %b, 1 - %3 = extractvalue [3 x <1 x double>] %b, 2 - %4 = bitcast double* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8) - ret void -} - -define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b) { -; CHECK-LABEL: test_vst1q_s8_x4 -; CHECK: st1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, -; v{{[0-9]+}}.16b }, 
[{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <16 x i8>] %b, 0 - %2 = extractvalue [4 x <16 x i8>] %b, 1 - %3 = extractvalue [4 x <16 x i8>] %b, 2 - %4 = extractvalue [4 x <16 x i8>] %b, 3 - tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1) - ret void -} - -define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b) { -; CHECK-LABEL: test_vst1q_s16_x4 -; CHECK: st1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, -; v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <8 x i16>] %b, 0 - %2 = extractvalue [4 x <8 x i16>] %b, 1 - %3 = extractvalue [4 x <8 x i16>] %b, 2 - %4 = extractvalue [4 x <8 x i16>] %b, 3 - %5 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2) - ret void -} - -define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b) { -; CHECK-LABEL: test_vst1q_s32_x4 -; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, -; v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <4 x i32>] %b, 0 - %2 = extractvalue [4 x <4 x i32>] %b, 1 - %3 = extractvalue [4 x <4 x i32>] %b, 2 - %4 = extractvalue [4 x <4 x i32>] %b, 3 - %5 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4) - ret void -} - -define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b) { -; CHECK-LABEL: test_vst1q_s64_x4 -; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, -; v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <2 x i64>] %b, 0 - %2 = extractvalue [4 x <2 x i64>] %b, 1 - %3 = extractvalue [4 x <2 x i64>] %b, 2 - %4 = extractvalue [4 x <2 x i64>] %b, 3 - %5 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8) - ret void -} - -define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b) { -; CHECK-LABEL: test_vst1q_f32_x4 -; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, -; v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <4 x float>] %b, 0 - %2 = extractvalue [4 x <4 x float>] %b, 1 - %3 = extractvalue [4 x <4 x float>] %b, 2 - %4 = extractvalue [4 x <4 x float>] %b, 3 - %5 = bitcast float* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4) - ret void -} - -define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b) { -; CHECK-LABEL: test_vst1q_f64_x4 -; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, -; v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <2 x double>] %b, 0 - %2 = extractvalue [4 x <2 x double>] %b, 1 - %3 = extractvalue [4 x <2 x double>] %b, 2 - %4 = extractvalue [4 x <2 x double>] %b, 3 - %5 = bitcast double* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8) - ret void -} - -define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b) { -; CHECK-LABEL: test_vst1_s8_x4 -; CHECK: st1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, -; v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <8 x i8>] %b, 0 - %2 = extractvalue [4 x <8 x i8>] %b, 1 - %3 = extractvalue [4 x <8 x i8>] %b, 2 - %4 = extractvalue [4 x <8 x i8>] %b, 3 - tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1) - ret void -} - -define void 
@test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b) { -; CHECK-LABEL: test_vst1_s16_x4 -; CHECK: st1 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, -; v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <4 x i16>] %b, 0 - %2 = extractvalue [4 x <4 x i16>] %b, 1 - %3 = extractvalue [4 x <4 x i16>] %b, 2 - %4 = extractvalue [4 x <4 x i16>] %b, 3 - %5 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2) - ret void -} - -define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b) { -; CHECK-LABEL: test_vst1_s32_x4 -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, -; v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <2 x i32>] %b, 0 - %2 = extractvalue [4 x <2 x i32>] %b, 1 - %3 = extractvalue [4 x <2 x i32>] %b, 2 - %4 = extractvalue [4 x <2 x i32>] %b, 3 - %5 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4) - ret void -} - -define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b) { -; CHECK-LABEL: test_vst1_s64_x4 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, -; v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <1 x i64>] %b, 0 - %2 = extractvalue [4 x <1 x i64>] %b, 1 - %3 = extractvalue [4 x <1 x i64>] %b, 2 - %4 = extractvalue [4 x <1 x i64>] %b, 3 - %5 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8) - ret void -} - -define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b) { -; CHECK-LABEL: test_vst1_f32_x4 -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, -; v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <2 x float>] %b, 0 - %2 = extractvalue [4 x <2 x float>] %b, 1 - %3 = extractvalue [4 x <2 x float>] %b, 2 - %4 = extractvalue [4 x <2 x float>] %b, 3 - %5 = bitcast float* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4) - ret void -} - -define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b) { -; CHECK-LABEL: test_vst1_f64_x4 -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, -; v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}] - %1 = extractvalue [4 x <1 x double>] %b, 0 - %2 = extractvalue [4 x <1 x double>] %b, 1 - %3 = extractvalue [4 x <1 x double>] %b, 2 - %4 = extractvalue [4 x <1 x double>] %b, 3 - %5 = bitcast double* %a to i8* - tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8) - ret void -} - -declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32) -declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32) -declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32) -declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32) -declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32) -declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32) -declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32) -declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32) -declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32) -declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32) -declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32) -declare { 
<1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32) -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32) -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32) -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32) -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32) -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32) -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32) -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32) -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32) -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32) -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32) -declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) -declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) -declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) -declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32) -declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32) -declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32) -declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) -declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) -declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) -declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) -declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32) -declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32) -declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) -declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) -declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x 
i32>, <4 x i32>, i32) -declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32) -declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) -declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32) -declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) -declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) -declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) -declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) -declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) -declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32) -declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) -declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) -declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) -declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32) -declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) -declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) -declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) -declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) -declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) -declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) -declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) -declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32) diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll deleted file mode 100644 index 75c2a82ab57e..000000000000 --- a/test/CodeGen/AArch64/neon-simd-ldst-one.ll +++ /dev/null @@ -1,2300 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; interesting parts copied into arm64 directory as aarch64-neon-simd-ldst-one.ll - -%struct.uint8x16x2_t = type { [2 x <16 x i8>] } -%struct.poly8x16x2_t = type { [2 x <16 x i8>] } -%struct.uint8x16x3_t = type { [3 x <16 x i8>] } -%struct.int8x16x2_t = type { [2 x <16 x i8>] } -%struct.int16x8x2_t = type { [2 x <8 x i16>] } -%struct.int32x4x2_t = type { [2 x <4 x i32>] } -%struct.int64x2x2_t = type { [2 x <2 x i64>] } -%struct.float32x4x2_t = type { [2 x <4 x float>] } -%struct.float64x2x2_t = type { [2 x <2 x double>] } -%struct.int8x8x2_t = type { [2 x <8 x i8>] } -%struct.int16x4x2_t = type { [2 x <4 x i16>] } -%struct.int32x2x2_t = type { [2 x <2 x i32>] } -%struct.int64x1x2_t = type { [2 x <1 x i64>] } -%struct.float32x2x2_t = type { [2 x <2 x float>] } -%struct.float64x1x2_t = type { [2 x <1 x double>] } -%struct.int8x16x3_t = type { [3 x <16 x i8>] } -%struct.int16x8x3_t = type { [3 x <8 x i16>] } -%struct.int32x4x3_t = type { [3 x <4 x i32>] } -%struct.int64x2x3_t = type { [3 x <2 x i64>] } -%struct.float32x4x3_t = type { [3 x <4 x float>] } -%struct.float64x2x3_t = type { [3 x <2 x double>] } 
-%struct.int8x8x3_t = type { [3 x <8 x i8>] } -%struct.int16x4x3_t = type { [3 x <4 x i16>] } -%struct.int32x2x3_t = type { [3 x <2 x i32>] } -%struct.int64x1x3_t = type { [3 x <1 x i64>] } -%struct.float32x2x3_t = type { [3 x <2 x float>] } -%struct.float64x1x3_t = type { [3 x <1 x double>] } -%struct.int8x16x4_t = type { [4 x <16 x i8>] } -%struct.int16x8x4_t = type { [4 x <8 x i16>] } -%struct.int32x4x4_t = type { [4 x <4 x i32>] } -%struct.int64x2x4_t = type { [4 x <2 x i64>] } -%struct.float32x4x4_t = type { [4 x <4 x float>] } -%struct.float64x2x4_t = type { [4 x <2 x double>] } -%struct.int8x8x4_t = type { [4 x <8 x i8>] } -%struct.int16x4x4_t = type { [4 x <4 x i16>] } -%struct.int32x2x4_t = type { [4 x <2 x i32>] } -%struct.int64x1x4_t = type { [4 x <1 x i64>] } -%struct.float32x2x4_t = type { [4 x <2 x float>] } -%struct.float64x1x4_t = type { [4 x <1 x double>] } - -define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) { -; CHECK-LABEL: test_ld_from_poll_v16i8 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <16 x i8> %a, - ret <16 x i8> %b -} - -define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) { -; CHECK-LABEL: test_ld_from_poll_v8i16 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <8 x i16> %a, - ret <8 x i16> %b -} - -define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) { -; CHECK-LABEL: test_ld_from_poll_v4i32 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <4 x i32> %a, - ret <4 x i32> %b -} - -define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) { -; CHECK-LABEL: test_ld_from_poll_v2i64 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <2 x i64> %a, - ret <2 x i64> %b -} - -define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) { -; CHECK-LABEL: test_ld_from_poll_v4f32 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = fadd <4 x float> %a, - ret <4 x float> %b -} - -define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_ld_from_poll_v2f64 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = fadd <2 x double> %a, - ret <2 x double> %b -} - -define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) { -; CHECK-LABEL: test_ld_from_poll_v8i8 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <8 x i8> %a, - ret <8 x i8> %b -} - -define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) { -; CHECK-LABEL: test_ld_from_poll_v4i16 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <4 x i16> %a, - ret <4 x i16> %b -} - -define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) { -; CHECK-LABEL: test_ld_from_poll_v2i32 -; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}} -; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}] -entry: - %b = add <2 x i32> %a, - ret <2 x i32> %b -} - -define <16 x i8> @test_vld1q_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld1q_dup_s8 -; CHECK: ld1r { {{v[0-9]+}}.16b }, [x0] -entry: - %0 = load i8* %a, align 1 - %1 = insertelement <16 x i8> undef, i8 %0, i32 0 - %lane = 
shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %lane -} - -define <8 x i16> @test_vld1q_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld1q_dup_s16 -; CHECK: ld1r { {{v[0-9]+}}.8h }, [x0] -entry: - %0 = load i16* %a, align 2 - %1 = insertelement <8 x i16> undef, i16 %0, i32 0 - %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %lane -} - -define <4 x i32> @test_vld1q_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld1q_dup_s32 -; CHECK: ld1r { {{v[0-9]+}}.4s }, [x0] -entry: - %0 = load i32* %a, align 4 - %1 = insertelement <4 x i32> undef, i32 %0, i32 0 - %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer - ret <4 x i32> %lane -} - -define <2 x i64> @test_vld1q_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld1q_dup_s64 -; CHECK: ld1r { {{v[0-9]+}}.2d }, [x0] -entry: - %0 = load i64* %a, align 8 - %1 = insertelement <2 x i64> undef, i64 %0, i32 0 - %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer - ret <2 x i64> %lane -} - -define <4 x float> @test_vld1q_dup_f32(float* %a) { -; CHECK-LABEL: test_vld1q_dup_f32 -; CHECK: ld1r { {{v[0-9]+}}.4s }, [x0] -entry: - %0 = load float* %a, align 4 - %1 = insertelement <4 x float> undef, float %0, i32 0 - %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %lane -} - -define <2 x double> @test_vld1q_dup_f64(double* %a) { -; CHECK-LABEL: test_vld1q_dup_f64 -; CHECK: ld1r { {{v[0-9]+}}.2d }, [x0] -entry: - %0 = load double* %a, align 8 - %1 = insertelement <2 x double> undef, double %0, i32 0 - %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer - ret <2 x double> %lane -} - -define <8 x i8> @test_vld1_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld1_dup_s8 -; CHECK: ld1r { {{v[0-9]+}}.8b }, [x0] -entry: - %0 = load i8* %a, align 1 - %1 = insertelement <8 x i8> undef, i8 %0, i32 0 - %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer - ret <8 x i8> %lane -} - -define <4 x i16> @test_vld1_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld1_dup_s16 -; CHECK: ld1r { {{v[0-9]+}}.4h }, [x0] -entry: - %0 = load i16* %a, align 2 - %1 = insertelement <4 x i16> undef, i16 %0, i32 0 - %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer - ret <4 x i16> %lane -} - -define <2 x i32> @test_vld1_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld1_dup_s32 -; CHECK: ld1r { {{v[0-9]+}}.2s }, [x0] -entry: - %0 = load i32* %a, align 4 - %1 = insertelement <2 x i32> undef, i32 %0, i32 0 - %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer - ret <2 x i32> %lane -} - -define <1 x i64> @test_vld1_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld1_dup_s64 -; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0] -entry: - %0 = load i64* %a, align 8 - %1 = insertelement <1 x i64> undef, i64 %0, i32 0 - ret <1 x i64> %1 -} - -define <2 x float> @test_vld1_dup_f32(float* %a) { -; CHECK-LABEL: test_vld1_dup_f32 -; CHECK: ld1r { {{v[0-9]+}}.2s }, [x0] -entry: - %0 = load float* %a, align 4 - %1 = insertelement <2 x float> undef, float %0, i32 0 - %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer - ret <2 x float> %lane -} - -define <1 x double> @test_vld1_dup_f64(double* %a) { -; CHECK-LABEL: test_vld1_dup_f64 -; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0] -entry: - %0 = load double* %a, align 8 - %1 = insertelement <1 x double> undef, double %0, i32 0 - ret <1 x double> %1 -} - -define <1 x i64> @testDUP.v1i64(i64* 
%a, i64* %b) #0 { -; As there is a store operation depending on %1, LD1R pattern can't be selected. -; So LDR and FMOV should be emitted. -; CHECK-LABEL: testDUP.v1i64 -; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}] -; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} -; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}] - %1 = load i64* %a, align 8 - store i64 %1, i64* %b, align 8 - %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0 - ret <1 x i64> %vecinit.i -} - -define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 { -; As there is a store operation depending on %1, LD1R pattern can't be selected. -; So LDR and FMOV should be emitted. -; CHECK-LABEL: testDUP.v1f64 -; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}] -; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] - %1 = load double* %a, align 8 - store double %1, double* %b, align 8 - %vecinit.i = insertelement <1 x double> undef, double %1, i32 0 - ret <1 x double> %vecinit.i -} - -define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld2q_dup_s8 -; CHECK: ld2r { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, [x0] -entry: - %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) - %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0 - %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer - %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1 - %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 - ret %struct.int8x16x2_t %.fca.0.1.insert -} - -define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld2q_dup_s16 -; CHECK: ld2r { {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }, [x0] -entry: - %0 = bitcast i16* %a to i8* - %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) - %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0 - %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer - %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1 - %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 - ret %struct.int16x8x2_t %.fca.0.1.insert -} - -define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld2q_dup_s32 -; CHECK: ld2r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0] -entry: - %0 = bitcast i32* %a to i8* - %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) - %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0 - %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1 - %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 - ret %struct.int32x4x2_t %.fca.0.1.insert -} - -define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld2q_dup_s64 -; CHECK: ld2r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0] -entry: - %0 = bitcast i64* %a to i8* - %vld_dup = 
tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) - %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0 - %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1 - %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 - ret %struct.int64x2x2_t %.fca.0.1.insert -} - -define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) { -; CHECK-LABEL: test_vld2q_dup_f32 -; CHECK: ld2r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0] -entry: - %0 = bitcast float* %a to i8* - %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4) - %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0 - %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1 - %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 - ret %struct.float32x4x2_t %.fca.0.1.insert -} - -define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) { -; CHECK-LABEL: test_vld2q_dup_f64 -; CHECK: ld2r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0] -entry: - %0 = bitcast double* %a to i8* - %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8) - %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0 - %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1 - %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 - ret %struct.float64x2x2_t %.fca.0.1.insert -} - -define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld2_dup_s8 -; CHECK: ld2r { {{v[0-9]+}}.8b, {{v[0-9]+}}.8b }, [x0] -entry: - %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) - %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0 - %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer - %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1 - %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 - ret %struct.int8x8x2_t %.fca.0.1.insert -} - -define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld2_dup_s16 -; CHECK: ld2r { {{v[0-9]+}}.4h, {{v[0-9]+}}.4h }, [x0] -entry: - %0 = bitcast i16* %a to i8* - %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) - %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0 - %lane = shufflevector <4 x i16> %1, <4 x 
i16> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1 - %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 - ret %struct.int16x4x2_t %.fca.0.1.insert -} - -define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld2_dup_s32 -; CHECK: ld2r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0] -entry: - %0 = bitcast i32* %a to i8* - %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) - %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0 - %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1 - %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 - ret %struct.int32x2x2_t %.fca.0.1.insert -} - -define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld2_dup_s64 -; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0] -entry: - %0 = bitcast i64* %a to i8* - %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8) - %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0 - %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1 - %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 - ret %struct.int64x1x2_t %.fca.0.1.insert -} - -define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) { -; CHECK-LABEL: test_vld2_dup_f32 -; CHECK: ld2r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0] -entry: - %0 = bitcast float* %a to i8* - %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4) - %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0 - %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1 - %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 - ret %struct.float32x2x2_t %.fca.0.1.insert -} - -define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) { -; CHECK-LABEL: test_vld2_dup_f64 -; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0] -entry: - %0 = bitcast double* %a to i8* - %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8) - %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0 - %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1 - %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 - ret %struct.float64x1x2_t %.fca.0.1.insert -} - -define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) { -; 
CHECK-LABEL: test_vld3q_dup_s8 -; CHECK: ld3r { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, [x0] -entry: - %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) - %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0 - %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer - %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1 - %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer - %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2 - %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2 - ret %struct.int8x16x3_t %.fca.0.2.insert -} - -define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld3q_dup_s16 -; CHECK: ld3r { {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }, [x0] -entry: - %0 = bitcast i16* %a to i8* - %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) - %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0 - %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer - %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1 - %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer - %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2 - %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2 - ret %struct.int16x8x3_t %.fca.0.2.insert -} - -define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld3q_dup_s32 -; CHECK: ld3r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0] -entry: - %0 = bitcast i32* %a to i8* - %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) - %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0 - %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1 - %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer - %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2 - %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2 - ret %struct.int32x4x3_t %.fca.0.2.insert -} - -define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld3q_dup_s64 -; CHECK: ld3r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0] -entry: - %0 = bitcast i64* %a to i8* - %vld_dup = tail call { <2 x i64>, <2 
x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) - %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0 - %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1 - %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2 - %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2 - ret %struct.int64x2x3_t %.fca.0.2.insert -} - -define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) { -; CHECK-LABEL: test_vld3q_dup_f32 -; CHECK: ld3r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0] -entry: - %0 = bitcast float* %a to i8* - %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4) - %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0 - %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1 - %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer - %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2 - %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2 - ret %struct.float32x4x3_t %.fca.0.2.insert -} - -define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) { -; CHECK-LABEL: test_vld3q_dup_f64 -; CHECK: ld3r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0] -entry: - %0 = bitcast double* %a to i8* - %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) - %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0 - %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1 - %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2 - %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2 - ret %struct.float64x2x3_t %.fca.0.2.insert -} - -define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld3_dup_s8 -; CHECK: ld3r { {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b }, [x0] -entry: - %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } 
@llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) - %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0 - %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer - %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1 - %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer - %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2 - %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2 - ret %struct.int8x8x3_t %.fca.0.2.insert -} - -define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld3_dup_s16 -; CHECK: ld3r { {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h }, [x0] -entry: - %0 = bitcast i16* %a to i8* - %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) - %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0 - %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1 - %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer - %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2 - %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2 - ret %struct.int16x4x3_t %.fca.0.2.insert -} - -define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld3_dup_s32 -; CHECK: ld3r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0] -entry: - %0 = bitcast i32* %a to i8* - %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) - %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0 - %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1 - %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2 - %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2 - ret %struct.int32x2x3_t %.fca.0.2.insert -} - -define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld3_dup_s64 -; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0] -entry: - %0 = bitcast i64* %a to i8* - %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8) - %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0 - %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x 
i64>, <1 x i64> } %vld_dup, 1 - %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2 - %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2 - ret %struct.int64x1x3_t %.fca.0.2.insert -} - -define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) { -; CHECK-LABEL: test_vld3_dup_f32 -; CHECK: ld3r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0] -entry: - %0 = bitcast float* %a to i8* - %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4) - %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0 - %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1 - %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2 - %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2 - ret %struct.float32x2x3_t %.fca.0.2.insert -} - -define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) { -; CHECK-LABEL: test_vld3_dup_f64 -; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0] -entry: - %0 = bitcast double* %a to i8* - %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8) - %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0 - %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1 - %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2 - %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2 - ret %struct.float64x1x3_t %.fca.0.2.insert -} - -define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld4q_dup_s8 -; CHECK: ld4r { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, [x0] -entry: - %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) - %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0 - %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer - %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1 - %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer - %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2 - %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer - %3 = 
extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3 - %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3 - ret %struct.int8x16x4_t %.fca.0.3.insert -} - -define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld4q_dup_s16 -; CHECK: ld4r { {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }, [x0] -entry: - %0 = bitcast i16* %a to i8* - %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) - %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0 - %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer - %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1 - %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer - %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2 - %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer - %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3 - %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3 - ret %struct.int16x8x4_t %.fca.0.3.insert -} - -define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld4q_dup_s32 -; CHECK: ld4r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0] -entry: - %0 = bitcast i32* %a to i8* - %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) - %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0 - %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1 - %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer - %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2 - %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer - %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3 - %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3 - ret %struct.int32x4x4_t %.fca.0.3.insert -} - -define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) 
{ -; CHECK-LABEL: test_vld4q_dup_s64 -; CHECK: ld4r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0] -entry: - %0 = bitcast i64* %a to i8* - %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) - %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0 - %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1 - %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2 - %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer - %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3 - %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3 - ret %struct.int64x2x4_t %.fca.0.3.insert -} - -define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) { -; CHECK-LABEL: test_vld4q_dup_f32 -; CHECK: ld4r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0] -entry: - %0 = bitcast float* %a to i8* - %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4) - %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0 - %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1 - %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer - %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2 - %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer - %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3 - %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3 - ret %struct.float32x4x4_t %.fca.0.3.insert -} - -define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) { -; CHECK-LABEL: test_vld4q_dup_f64 -; CHECK: ld4r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0] -entry: - %0 = bitcast double* %a to i8* - %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) - %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0 - %lane = shufflevector <2 x double> %1, <2 x double> undef, 
<2 x i32> zeroinitializer - %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1 - %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2 - %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer - %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3 - %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3 - ret %struct.float64x2x4_t %.fca.0.3.insert -} - -define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) { -; CHECK-LABEL: test_vld4_dup_s8 -; CHECK: ld4r { {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b }, [x0] -entry: - %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) - %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0 - %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer - %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1 - %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer - %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2 - %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer - %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3 - %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3 - ret %struct.int8x8x4_t %.fca.0.3.insert -} - -define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) { -; CHECK-LABEL: test_vld4_dup_s16 -; CHECK: ld4r { {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h }, [x0] -entry: - %0 = bitcast i16* %a to i8* - %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) - %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0 - %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer - %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1 - %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer - %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2 - %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer - %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3 - %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0 - %.fca.0.1.insert = insertvalue 
%struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3 - ret %struct.int16x4x4_t %.fca.0.3.insert -} - -define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) { -; CHECK-LABEL: test_vld4_dup_s32 -; CHECK: ld4r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0] -entry: - %0 = bitcast i32* %a to i8* - %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) - %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0 - %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer - %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1 - %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2 - %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer - %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3 - %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3 - ret %struct.int32x2x4_t %.fca.0.3.insert -} - -define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) { -; CHECK-LABEL: test_vld4_dup_s64 -; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0] -entry: - %0 = bitcast i64* %a to i8* - %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8) - %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0 - %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1 - %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2 - %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3 - %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3 - ret %struct.int64x1x4_t %.fca.0.3.insert -} - -define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) { -; CHECK-LABEL: test_vld4_dup_f32 -; CHECK: ld4r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0] -entry: - %0 = bitcast float* %a to i8* - %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4) - %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0 - %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> 
zeroinitializer - %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1 - %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer - %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2 - %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer - %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3 - %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer - %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3 - ret %struct.float32x2x4_t %.fca.0.3.insert -} - -define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) { -; CHECK-LABEL: test_vld4_dup_f64 -; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0] -entry: - %0 = bitcast double* %a to i8* - %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8) - %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0 - %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1 - %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2 - %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3 - %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3 - ret %struct.float64x1x4_t %.fca.0.3.insert -} - -define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) { -; CHECK-LABEL: test_vld1q_lane_s8 -; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %0 = load i8* %a, align 1 - %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15 - ret <16 x i8> %vld1_lane -} - -define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) { -; CHECK-LABEL: test_vld1q_lane_s16 -; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %0 = load i16* %a, align 2 - %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7 - ret <8 x i16> %vld1_lane -} - -define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) { -; CHECK-LABEL: test_vld1q_lane_s32 -; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = load i32* %a, align 4 - %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3 - ret <4 x i32> %vld1_lane -} - -define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) { -; CHECK-LABEL: test_vld1q_lane_s64 -; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %0 = load i64* %a, align 8 - %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1 - ret <2 x i64> %vld1_lane -} - -define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) { -; CHECK-LABEL: test_vld1q_lane_f32 -; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = load float* %a, align 4 - 
%vld1_lane = insertelement <4 x float> %b, float %0, i32 3 - ret <4 x float> %vld1_lane -} - -define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) { -; CHECK-LABEL: test_vld1q_lane_f64 -; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %0 = load double* %a, align 8 - %vld1_lane = insertelement <2 x double> %b, double %0, i32 1 - ret <2 x double> %vld1_lane -} - -define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) { -; CHECK-LABEL: test_vld1_lane_s8 -; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %0 = load i8* %a, align 1 - %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7 - ret <8 x i8> %vld1_lane -} - -define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) { -; CHECK-LABEL: test_vld1_lane_s16 -; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %0 = load i16* %a, align 2 - %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3 - ret <4 x i16> %vld1_lane -} - -define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) { -; CHECK-LABEL: test_vld1_lane_s32 -; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = load i32* %a, align 4 - %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1 - ret <2 x i32> %vld1_lane -} - -define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) { -; CHECK-LABEL: test_vld1_lane_s64 -; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0] -entry: - %0 = load i64* %a, align 8 - %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0 - ret <1 x i64> %vld1_lane -} - -define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) { -; CHECK-LABEL: test_vld1_lane_f32 -; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = load float* %a, align 4 - %vld1_lane = insertelement <2 x float> %b, float %0, i32 1 - ret <2 x float> %vld1_lane -} - -define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) { -; CHECK-LABEL: test_vld1_lane_f64 -; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0] -entry: - %0 = load double* %a, align 8 - %vld1_lane = insertelement <1 x double> undef, double %0, i32 0 - ret <1 x double> %vld1_lane -} - -define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vld2q_lane_s16 -; CHECK: ld2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 - %0 = bitcast i16* %a to i8* - %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2) - %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int16x8x2_t %.fca.0.1.insert -} - -define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vld2q_lane_s32 -; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 - %0 = bitcast i32* %a to i8* - %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4) - 
%vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int32x4x2_t %.fca.0.1.insert -} - -define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vld2q_lane_s64 -; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 - %0 = bitcast i64* %a to i8* - %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8) - %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int64x2x2_t %.fca.0.1.insert -} - -define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vld2q_lane_f32 -; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 - %0 = bitcast float* %a to i8* - %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4) - %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.float32x4x2_t %.fca.0.1.insert -} - -define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vld2q_lane_f64 -; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 - %0 = bitcast double* %a to i8* - %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8) - %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.float64x2x2_t %.fca.0.1.insert -} - -define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vld2_lane_s8 -; CHECK: ld2 { {{v[0-9]+}}.b, 
{{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 - %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1) - %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int8x8x2_t %.fca.0.1.insert -} - -define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vld2_lane_s16 -; CHECK: ld2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 - %0 = bitcast i16* %a to i8* - %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2) - %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int16x4x2_t %.fca.0.1.insert -} - -define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vld2_lane_s32 -; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 - %0 = bitcast i32* %a to i8* - %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4) - %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int32x2x2_t %.fca.0.1.insert -} - -define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vld2_lane_s64 -; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 - %0 = bitcast i64* %a to i8* - %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8) - %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x 
i64> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int64x1x2_t %.fca.0.1.insert -} - -define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vld2_lane_f32 -; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 - %0 = bitcast float* %a to i8* - %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4) - %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.float32x2x2_t %.fca.0.1.insert -} - -define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vld2_lane_f64 -; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 - %0 = bitcast double* %a to i8* - %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8) - %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.float64x1x2_t %.fca.0.1.insert -} - -define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vld3q_lane_s16 -; CHECK: ld3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 - %0 = bitcast i16* %a to i8* - %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2) - %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int16x8x3_t %.fca.0.2.insert -} - -define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vld3q_lane_s32 -; CHECK: ld3 { 
{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 - %0 = bitcast i32* %a to i8* - %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int32x4x3_t %.fca.0.2.insert -} - -define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vld3q_lane_s64 -; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 - %0 = bitcast i64* %a to i8* - %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int64x2x3_t %.fca.0.2.insert -} - -define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vld3q_lane_f32 -; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 - %0 = bitcast float* %a to i8* - %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x 
float> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.float32x4x3_t %.fca.0.2.insert -} - -define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vld3q_lane_f64 -; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 - %0 = bitcast double* %a to i8* - %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.float64x2x3_t %.fca.0.2.insert -} - -define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vld3_lane_s8 -; CHECK: ld3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 - %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1) - %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int8x8x3_t %.fca.0.2.insert -} - -define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vld3_lane_s16 -; CHECK: ld3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 - %0 = bitcast i16* %a to i8* - %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> 
%b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2) - %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int16x4x3_t %.fca.0.2.insert -} - -define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vld3_lane_s32 -; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 - %0 = bitcast i32* %a to i8* - %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int32x2x3_t %.fca.0.2.insert -} - -define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vld3_lane_s64 -; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 - %0 = bitcast i64* %a to i8* - %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.int64x1x3_t %.fca.0.2.insert -} - -define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vld3_lane_f32 -; CHECK: ld3 { {{v[0-9]+}}.s, 
{{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 - %0 = bitcast float* %a to i8* - %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.float32x2x3_t %.fca.0.2.insert -} - -define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vld3_lane_f64 -; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 - %0 = bitcast double* %a to i8* - %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2 - %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2 - ret %struct.float64x1x3_t %.fca.0.2.insert -} - -define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vld4q_lane_s8 -; CHECK: ld4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1) - %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 
x i8>, <16 x i8> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int8x16x4_t %.fca.0.3.insert -} - -define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vld4q_lane_s16 -; CHECK: ld4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 - %0 = bitcast i16* %a to i8* - %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2) - %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int16x8x4_t %.fca.0.3.insert -} - -define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vld4q_lane_s32 -; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 - %0 = bitcast i32* %a to i8* - %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2 - 
%vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int32x4x4_t %.fca.0.3.insert -} - -define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vld4q_lane_s64 -; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 - %0 = bitcast i64* %a to i8* - %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int64x2x4_t %.fca.0.3.insert -} - -define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vld4q_lane_f32 -; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 - %0 = bitcast float* %a to i8* - %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } 
%vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.float32x4x4_t %.fca.0.3.insert -} - -define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vld4q_lane_f64 -; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 - %0 = bitcast double* %a to i8* - %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.float64x2x4_t %.fca.0.3.insert -} - -define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vld4_lane_s8 -; CHECK: ld4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 - %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1) - %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> 
%vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int8x8x4_t %.fca.0.3.insert -} - -define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vld4_lane_s16 -; CHECK: ld4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 - %0 = bitcast i16* %a to i8* - %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2) - %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int16x4x4_t %.fca.0.3.insert -} - -define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vld4_lane_s32 -; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 - %0 = bitcast i32* %a to i8* - %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = 
insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int32x2x4_t %.fca.0.3.insert -} - -define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vld4_lane_s64 -; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 - %0 = bitcast i64* %a to i8* - %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.int64x1x4_t %.fca.0.3.insert -} - -define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vld4_lane_f32 -; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 - %0 = bitcast float* %a to i8* - %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4) - %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 
2 - %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.float32x2x4_t %.fca.0.3.insert -} - -define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vld4_lane_f64 -; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 - %0 = bitcast double* %a to i8* - %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) - %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0 - %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1 - %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2 - %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3 - %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1 - %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2 - %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3 - ret %struct.float64x1x4_t %.fca.0.3.insert -} - -define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) { -; CHECK-LABEL: test_vst1q_lane_s8 -; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <16 x i8> %b, i32 15 - store i8 %0, i8* %a, align 1 - ret void -} - -define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) { -; CHECK-LABEL: test_vst1q_lane_s16 -; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <8 x i16> %b, i32 7 - store i16 %0, i16* %a, align 2 - ret void -} - -define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) { -; CHECK-LABEL: test_vst1q_lane_s32 -; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <4 x i32> %b, i32 3 - store i32 %0, i32* %a, align 4 - ret void -} - -define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) { -; CHECK-LABEL: test_vst1q_lane_s64 -; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <2 x i64> %b, i32 1 - store i64 %0, i64* %a, align 8 - ret void -} - -define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) { -; CHECK-LABEL: test_vst1q_lane_f32 -; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <4 x float> %b, i32 3 - store float %0, float* %a, align 4 - ret void -} - -define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) { -; CHECK-LABEL: test_vst1q_lane_f64 -; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <2 x double> %b, i32 1 - store double %0, double* %a, align 8 - ret void -} - -define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) { -; CHECK-LABEL: test_vst1_lane_s8 
-; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <8 x i8> %b, i32 7 - store i8 %0, i8* %a, align 1 - ret void -} - -define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) { -; CHECK-LABEL: test_vst1_lane_s16 -; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <4 x i16> %b, i32 3 - store i16 %0, i16* %a, align 2 - ret void -} - -define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) { -; CHECK-LABEL: test_vst1_lane_s32 -; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <2 x i32> %b, i32 1 - store i32 %0, i32* %a, align 4 - ret void -} - -define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) { -; CHECK-LABEL: test_vst1_lane_s64 -; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <1 x i64> %b, i32 0 - store i64 %0, i64* %a, align 8 - ret void -} - -define void @test_vst1_lane_f32(float* %a, <2 x float> %b) { -; CHECK-LABEL: test_vst1_lane_f32 -; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <2 x float> %b, i32 1 - store float %0, float* %a, align 4 - ret void -} - -define void @test_vst1_lane_f64(double* %a, <1 x double> %b) { -; CHECK-LABEL: test_vst1_lane_f64 -; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %0 = extractelement <1 x double> %b, i32 0 - store double %0, double* %a, align 8 - ret void -} - -define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst2q_lane_s8 -; CHECK: st2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1 - tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1) - ret void -} - -define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst2q_lane_s16 -; CHECK: st2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2) - ret void -} - -define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst2q_lane_s32 -; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 - %0 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4) - ret void -} - -define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst2q_lane_s64 -; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 - %0 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8) - ret void -} - -define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vst2q_lane_f32 -; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] 
-entry: - %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 - %0 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4) - ret void -} - -define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vst2q_lane_f64 -; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 - %0 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8) - ret void -} - -define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst2_lane_s8 -; CHECK: st2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 - tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1) - ret void -} - -define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst2_lane_s16 -; CHECK: st2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2) - ret void -} - -define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst2_lane_s32 -; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 - %0 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4) - ret void -} - -define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst2_lane_s64 -; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 - %0 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8) - ret void -} - -define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vst2_lane_f32 -; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 - %0 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4) - ret void -} - -define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vst2_lane_f64 -; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - 
%b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 - %0 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8) - ret void -} - -define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst3q_lane_s8 -; CHECK: st3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2 - tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1) - ret void -} - -define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst3q_lane_s16 -; CHECK: st3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2) - ret void -} - -define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst3q_lane_s32 -; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 - %0 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4) - ret void -} - -define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst3q_lane_s64 -; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 - %0 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8) - ret void -} - -define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vst3q_lane_f32 -; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 - %0 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4) - ret void -} - -define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { -; CHECK-LABEL: 
test_vst3q_lane_f64 -; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 - %0 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8) - ret void -} - -define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst3_lane_s8 -; CHECK: st3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 - tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1) - ret void -} - -define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst3_lane_s16 -; CHECK: st3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2) - ret void -} - -define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst3_lane_s32 -; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 - %0 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4) - ret void -} - -define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst3_lane_s64 -; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 - %0 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8) - ret void -} - -define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vst3_lane_f32 -; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 - %0 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 
x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4) - ret void -} - -define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vst3_lane_f64 -; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 - %0 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8) - ret void -} - -define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst4q_lane_s8 -; CHECK: st4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2) - ret void -} - -define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst4q_lane_s16 -; CHECK: st4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2) - ret void -} - -define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst4q_lane_s32 -; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 - %0 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4) - ret void -} - -define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst4q_lane_s64 -; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 - %0 = bitcast i64* %a to i8* - tail call void 
@llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8) - ret void -} - -define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { -; CHECK-LABEL: test_vst4q_lane_f32 -; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 - %0 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4) - ret void -} - -define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { -; CHECK-LABEL: test_vst4q_lane_f64 -; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 - %0 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8) - ret void -} - -define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { -; CHECK-LABEL: test_vst4_lane_s8 -; CHECK: st4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 - tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1) - ret void -} - -define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { -; CHECK-LABEL: test_vst4_lane_s16 -; CHECK: st4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 - %0 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2) - ret void -} - -define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { -; CHECK-LABEL: test_vst4_lane_s32 -; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 - 
%b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 - %0 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4) - ret void -} - -define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { -; CHECK-LABEL: test_vst4_lane_s64 -; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 - %0 = bitcast i64* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8) - ret void -} - -define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { -; CHECK-LABEL: test_vst4_lane_f32 -; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 - %0 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4) - ret void -} - -define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { -; CHECK-LABEL: test_vst4_lane_f64 -; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] -entry: - %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 - %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 - %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 - %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 - %0 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) - ret void -} - -declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) -declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) -declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) -declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) -declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) -declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) -declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) -declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) -declare { <2 x i32>, <2 x i32> } 
@llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) -declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) -declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) -declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) -declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) -declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) -declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) -declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) -declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) -declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) 
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) -declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) -declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) -declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) -declare 
void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) - -define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) { -; CHECK-LABEL: test_vld2q_lane_s8 -; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0] -entry: - %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0 - %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1 - %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1) - %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.int8x16x2_t %.fca.0.1.insert -} - -define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) { -; CHECK-LABEL: test_vld2q_lane_u8 -; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0] -entry: - %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0 - %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1 - %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1) - %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.uint8x16x2_t %.fca.0.1.insert -} - -define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) { -; CHECK-LABEL: test_vld2q_lane_p8 -; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0] -entry: - %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0 - %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1 - %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1) - %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0 - %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1 - %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0 - %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1 - ret %struct.poly8x16x2_t %.fca.0.1.insert -} - -define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) { -; CHECK-LABEL: test_vld3q_lane_s8 -; CHECK: ld3 { {{v[0-9]+}}.b, 
{{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
- %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld3q_lane_u8
-; CHECK: ld3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
- %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.uint8x16x3_t %.fca.0.2.insert
-}
-
diff --git a/test/CodeGen/AArch64/neon-simd-ldst.ll b/test/CodeGen/AArch64/neon-simd-ldst.ll
deleted file mode 100644
index 7c78b6933426..000000000000
--- a/test/CodeGen/AArch64/neon-simd-ldst.ll
+++ /dev/null
@@ -1,165 +0,0 @@
-; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-; Just intrinsic mashing. Duplicates existing arm64 tests.
- -define void @test_ldstq_4v(i8* noalias %io, i32 %count) { -; CHECK-LABEL: test_ldstq_4v -; CHECK: ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0] -; CHECK: st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0] -entry: - %tobool62 = icmp eq i32 %count, 0 - br i1 %tobool62, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %count.addr.063 = phi i32 [ %dec, %while.body ], [ %count, %entry ] - %dec = add i32 %count.addr.063, -1 - %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %io, i32 1) - %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3 - tail call void @llvm.arm.neon.vst4.v16i8(i8* %io, <16 x i8> %vld4.fca.0.extract, <16 x i8> %vld4.fca.1.extract, <16 x i8> %vld4.fca.2.extract, <16 x i8> %vld4.fca.3.extract, i32 1) - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body, %entry - ret void -} - -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) - -declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) - -define void @test_ldstq_3v(i8* noalias %io, i32 %count) { -; CHECK-LABEL: test_ldstq_3v -; CHECK: ld3 { v0.16b, v1.16b, v2.16b }, [x0] -; CHECK: st3 { v0.16b, v1.16b, v2.16b }, [x0] -entry: - %tobool47 = icmp eq i32 %count, 0 - br i1 %tobool47, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %count.addr.048 = phi i32 [ %dec, %while.body ], [ %count, %entry ] - %dec = add i32 %count.addr.048, -1 - %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %io, i32 1) - %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2 - tail call void @llvm.arm.neon.vst3.v16i8(i8* %io, <16 x i8> %vld3.fca.0.extract, <16 x i8> %vld3.fca.1.extract, <16 x i8> %vld3.fca.2.extract, i32 1) - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body, %entry - ret void -} - -declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) - -declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) - -define void @test_ldstq_2v(i8* noalias %io, i32 %count) { -; CHECK-LABEL: test_ldstq_2v -; CHECK: ld2 { v0.16b, v1.16b }, [x0] -; CHECK: st2 { v0.16b, v1.16b }, [x0] -entry: - %tobool22 = icmp eq i32 %count, 0 - br i1 %tobool22, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ] - %dec = add i32 %count.addr.023, -1 - %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %io, i32 1) - %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1 - tail call void @llvm.arm.neon.vst2.v16i8(i8* %io, <16 x i8> %vld2.fca.0.extract, <16 x i8> %vld2.fca.1.extract, i32 1) - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - 
-while.end: ; preds = %while.body, %entry - ret void -} - -declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) - -declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) - -define void @test_ldst_4v(i8* noalias %io, i32 %count) { -; CHECK-LABEL: test_ldst_4v -; CHECK: ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0] -; CHECK: st4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0] -entry: - %tobool42 = icmp eq i32 %count, 0 - br i1 %tobool42, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %count.addr.043 = phi i32 [ %dec, %while.body ], [ %count, %entry ] - %dec = add i32 %count.addr.043, -1 - %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %io, i32 1) - %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0 - %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1 - %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2 - %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3 - tail call void @llvm.arm.neon.vst4.v8i8(i8* %io, <8 x i8> %vld4.fca.0.extract, <8 x i8> %vld4.fca.1.extract, <8 x i8> %vld4.fca.2.extract, <8 x i8> %vld4.fca.3.extract, i32 1) - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body, %entry - ret void -} - -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) - -declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) - -define void @test_ldst_3v(i8* noalias %io, i32 %count) { -; CHECK-LABEL: test_ldst_3v -; CHECK: ld3 { v0.8b, v1.8b, v2.8b }, [x0] -; CHECK: st3 { v0.8b, v1.8b, v2.8b }, [x0] -entry: - %tobool32 = icmp eq i32 %count, 0 - br i1 %tobool32, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %count.addr.033 = phi i32 [ %dec, %while.body ], [ %count, %entry ] - %dec = add i32 %count.addr.033, -1 - %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %io, i32 1) - %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0 - %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1 - %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2 - tail call void @llvm.arm.neon.vst3.v8i8(i8* %io, <8 x i8> %vld3.fca.0.extract, <8 x i8> %vld3.fca.1.extract, <8 x i8> %vld3.fca.2.extract, i32 1) - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %while.end, label %while.body - -while.end: ; preds = %while.body, %entry - ret void -} - -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32) - -declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) - -define void @test_ldst_2v(i8* noalias %io, i32 %count) { -; CHECK-LABEL: test_ldst_2v -; CHECK: ld2 { v0.8b, v1.8b }, [x0] -; CHECK: st2 { v0.8b, v1.8b }, [x0] -entry: - %tobool22 = icmp eq i32 %count, 0 - br i1 %tobool22, label %while.end, label %while.body - -while.body: ; preds = %entry, %while.body - %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ] - %dec = add i32 %count.addr.023, -1 - %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %io, i32 1) - %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0 - %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1 - tail call void @llvm.arm.neon.vst2.v8i8(i8* %io, <8 x i8> %vld2.fca.0.extract, <8 x i8> 
%vld2.fca.1.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end:                                        ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
deleted file mode 100644
index 8acf6b792eb1..000000000000
--- a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
+++ /dev/null
@@ -1,354 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-;Check for a post-increment updating load.
-define <4 x i16> @test_vld1_fx_update(i16** %ptr) nounwind {
-; CHECK: test_vld1_fx_update
-; CHECK: ld1 { v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}], #8
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 2)
- %tmp2 = getelementptr i16* %A, i32 4
- store i16* %tmp2, i16** %ptr
- ret <4 x i16> %tmp1
-}
-
-;Check for a post-increment updating load with register increment.
-define <2 x i32> @test_vld1_reg_update(i32** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld1_reg_update
-; CHECK: ld1 { v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 4)
- %tmp2 = getelementptr i32* %A, i32 %inc
- store i32* %tmp2, i32** %ptr
- ret <2 x i32> %tmp1
-}
-
-define <2 x float> @test_vld2_fx_update(float** %ptr) nounwind {
-; CHECK: test_vld2_fx_update
-; CHECK: ld2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}], #16
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- %tmp1 = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 4)
- %tmp2 = extractvalue { <2 x float>, <2 x float> } %tmp1, 0
- %tmp3 = getelementptr float* %A, i32 4
- store float* %tmp3, float** %ptr
- ret <2 x float> %tmp2
-}
-
-define <16 x i8> @test_vld2_reg_update(i8** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld2_reg_update
-; CHECK: ld2 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- %tmp0 = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1)
- %tmp1 = extractvalue { <16 x i8>, <16 x i8> } %tmp0, 0
- %tmp2 = getelementptr i8* %A, i32 %inc
- store i8* %tmp2, i8** %ptr
- ret <16 x i8> %tmp1
-}
-
-define <4 x i32> @test_vld3_fx_update(i32** %ptr) nounwind {
-; CHECK: test_vld3_fx_update
-; CHECK: ld3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}], #48
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- %tmp1 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 4)
- %tmp2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %tmp1, 0
- %tmp3 = getelementptr i32* %A, i32 12
- store i32* %tmp3, i32** %ptr
- ret <4 x i32> %tmp2
-}
-
-define <4 x i16> @test_vld3_reg_update(i16** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld3_reg_update
-; CHECK: ld3 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 2)
- %tmp2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %tmp1, 0
- %tmp3 = getelementptr i16* %A, i32 %inc
- store i16* %tmp3, i16** %ptr
- ret <4 x i16> %tmp2
-}
-
-define <8 x i16> @test_vld4_fx_update(i16** %ptr)
nounwind { -; CHECK: test_vld4_fx_update -; CHECK: ld4 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}], #64 - %A = load i16** %ptr - %tmp0 = bitcast i16* %A to i8* - %tmp1 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8) - %tmp2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %tmp1, 0 - %tmp3 = getelementptr i16* %A, i32 32 - store i16* %tmp3, i16** %ptr - ret <8 x i16> %tmp2 -} - -define <8 x i8> @test_vld4_reg_update(i8** %ptr, i32 %inc) nounwind { -; CHECK: test_vld4_reg_update -; CHECK: ld4 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %A = load i8** %ptr - %tmp0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1) - %tmp1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %tmp0, 0 - %tmp2 = getelementptr i8* %A, i32 %inc - store i8* %tmp2, i8** %ptr - ret <8 x i8> %tmp1 -} - -define void @test_vst1_fx_update(float** %ptr, <2 x float> %B) nounwind { -; CHECK: test_vst1_fx_update -; CHECK: st1 { v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}], #8 - %A = load float** %ptr - %tmp0 = bitcast float* %A to i8* - call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %B, i32 4) - %tmp2 = getelementptr float* %A, i32 2 - store float* %tmp2, float** %ptr - ret void -} - -define void @test_vst1_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind { -; CHECK: test_vst1_reg_update -; CHECK: st1 { v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}], x{{[0-9]+}} - %A = load i16** %ptr - %tmp0 = bitcast i16* %A to i8* - call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %B, i32 2) - %tmp1 = getelementptr i16* %A, i32 %inc - store i16* %tmp1, i16** %ptr - ret void -} - -define void @test_vst2_fx_update(i64** %ptr, <1 x i64> %B) nounwind { -; CHECK: test_vst2_fx_update -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [{{x[0-9]+|sp}}], #16 - %A = load i64** %ptr - %tmp0 = bitcast i64* %A to i8* - call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %B, <1 x i64> %B, i32 8) - %tmp1 = getelementptr i64* %A, i32 2 - store i64* %tmp1, i64** %ptr - ret void -} - -define void @test_vst2_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind { -; CHECK: test_vst2_reg_update -; CHECK: st2 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}], x{{[0-9]+}} - %A = load i8** %ptr - call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, i32 4) - %tmp0 = getelementptr i8* %A, i32 %inc - store i8* %tmp0, i8** %ptr - ret void -} - -define void @test_vst3_fx_update(i32** %ptr, <2 x i32> %B) nounwind { -; CHECK: test_vst3_fx_update -; CHECK: st3 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}], #24 - %A = load i32** %ptr - %tmp0 = bitcast i32* %A to i8* - call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %B, <2 x i32> %B, <2 x i32> %B, i32 4) - %tmp1 = getelementptr i32* %A, i32 6 - store i32* %tmp1, i32** %ptr - ret void -} - -define void @test_vst3_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind { -; CHECK: test_vst3_reg_update -; CHECK: st3 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [{{x[0-9]+|sp}}], x{{[0-9]+}} - %A = load i16** %ptr - %tmp0 = bitcast i16* %A to i8* - call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %B, <8 x i16> %B, <8 x i16> %B, i32 2) - %tmp1 = getelementptr i16* %A, i32 %inc - store i16* %tmp1, i16** %ptr - ret void -} - -define void @test_vst4_fx_update(float** %ptr, <4 x float> %B) nounwind { -; CHECK: test_vst4_fx_update -; CHECK: st4 { 
v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}], #64 - %A = load float** %ptr - %tmp0 = bitcast float* %A to i8* - call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %B, <4 x float> %B, <4 x float> %B, <4 x float> %B, i32 4) - %tmp1 = getelementptr float* %A, i32 16 - store float* %tmp1, float** %ptr - ret void -} - -define void @test_vst4_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind { -; CHECK: test_vst4_reg_update -; CHECK: st4 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [{{x[0-9]+|sp}}], x{{[0-9]+}} - %A = load i8** %ptr - call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, i32 1) - %tmp0 = getelementptr i8* %A, i32 %inc - store i8* %tmp0, i8** %ptr - ret void -} - - -declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) -declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) -declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) -declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) -declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32) -declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) - -declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) -declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) -declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) -declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) -declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) -declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) -declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) -declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) - -define <16 x i8> @test_vld1x2_fx_update(i8* %a, i8** %ptr) { -; CHECK: test_vld1x2_fx_update -; CHECK: ld1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}], #32 - %1 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1) - %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 - %tmp1 = getelementptr i8* %a, i32 32 - store i8* %tmp1, i8** %ptr - ret <16 x i8> %2 -} - -define <8 x i16> @test_vld1x2_reg_update(i16* %a, i16** %ptr, i32 %inc) { -; CHECK: test_vld1x2_reg_update -; CHECK: ld1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = bitcast i16* %a to i8* - %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2) - %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0 - %tmp1 = getelementptr i16* %a, i32 %inc - store i16* %tmp1, i16** %ptr - ret <8 x i16> %3 -} - -define <2 x i64> @test_vld1x3_fx_update(i64* %a, i64** %ptr) { -; CHECK: test_vld1x3_fx_update -; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}], #48 - %1 = bitcast i64* %a to i8* - %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8) - %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0 - %tmp1 = getelementptr i64* %a, i32 6 - store i64* %tmp1, i64** %ptr - ret <2 x i64> %3 -} - -define <8 x i16> @test_vld1x3_reg_update(i16* %a, i16** %ptr, i32 %inc) { -; CHECK: test_vld1x3_reg_update -; CHECK: ld1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = 
bitcast i16* %a to i8* - %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2) - %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0 - %tmp1 = getelementptr i16* %a, i32 %inc - store i16* %tmp1, i16** %ptr - ret <8 x i16> %3 -} - -define <4 x float> @test_vld1x4_fx_update(float* %a, float** %ptr) { -; CHECK: test_vld1x4_fx_update -; CHECK: ld1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}], #64 - %1 = bitcast float* %a to i8* - %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4) - %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 - %tmp1 = getelementptr float* %a, i32 16 - store float* %tmp1, float** %ptr - ret <4 x float> %3 -} - -define <8 x i8> @test_vld1x4_reg_update(i8* readonly %a, i8** %ptr, i32 %inc) #0 { -; CHECK: test_vld1x4_reg_update -; CHECK: ld1 { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1) - %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 - %tmp1 = getelementptr i8* %a, i32 %inc - store i8* %tmp1, i8** %ptr - ret <8 x i8> %2 -} - -define void @test_vst1x2_fx_update(i8* %a, [2 x <16 x i8>] %b.coerce, i8** %ptr) #2 { -; CHECK: test_vst1x2_fx_update -; CHECK: st1 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}], #32 - %1 = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %2 = extractvalue [2 x <16 x i8>] %b.coerce, 1 - tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1) - %tmp1 = getelementptr i8* %a, i32 32 - store i8* %tmp1, i8** %ptr - ret void -} - -define void @test_vst1x2_reg_update(i16* %a, [2 x <8 x i16>] %b.coerce, i16** %ptr, i32 %inc) #2 { -; CHECK: test_vst1x2_reg_update -; CHECK: st1 { v{{[0-9]+}}.8h, v{{[0-9]+}}.8h }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [2 x <8 x i16>] %b.coerce, 0 - %2 = extractvalue [2 x <8 x i16>] %b.coerce, 1 - %3 = bitcast i16* %a to i8* - tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2) - %tmp1 = getelementptr i16* %a, i32 %inc - store i16* %tmp1, i16** %ptr - ret void -} - -define void @test_vst1x3_fx_update(i32* %a, [3 x <2 x i32>] %b.coerce, i32** %ptr) #2 { -; CHECK: test_vst1x3_fx_update -; CHECK: st1 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}], #24 - %1 = extractvalue [3 x <2 x i32>] %b.coerce, 0 - %2 = extractvalue [3 x <2 x i32>] %b.coerce, 1 - %3 = extractvalue [3 x <2 x i32>] %b.coerce, 2 - %4 = bitcast i32* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4) - %tmp1 = getelementptr i32* %a, i32 6 - store i32* %tmp1, i32** %ptr - ret void -} - -define void @test_vst1x3_reg_update(i64* %a, [3 x <1 x i64>] %b.coerce, i64** %ptr, i32 %inc) #2 { -; CHECK: test_vst1x3_reg_update -; CHECK: st1 { v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [3 x <1 x i64>] %b.coerce, 0 - %2 = extractvalue [3 x <1 x i64>] %b.coerce, 1 - %3 = extractvalue [3 x <1 x i64>] %b.coerce, 2 - %4 = bitcast i64* %a to i8* - tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8) - %tmp1 = getelementptr i64* %a, i32 %inc - store i64* %tmp1, i64** %ptr - ret void -} - -define void @test_vst1x4_fx_update(float* %a, [4 x <4 x float>] 
%b.coerce, float** %ptr) #2 {
-; CHECK: test_vst1x4_fx_update
-; CHECK: st1 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}], #64
- %1 = extractvalue [4 x <4 x float>] %b.coerce, 0
- %2 = extractvalue [4 x <4 x float>] %b.coerce, 1
- %3 = extractvalue [4 x <4 x float>] %b.coerce, 2
- %4 = extractvalue [4 x <4 x float>] %b.coerce, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
- %tmp1 = getelementptr float* %a, i32 16
- store float* %tmp1, float** %ptr
- ret void
-}
-
-define void @test_vst1x4_reg_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x4_reg_update
-; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
-declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #3
-declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) #3
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
deleted file mode 100644
index e53d6cb339b0..000000000000
--- a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
+++ /dev/null
@@ -1,319 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
-; CHECK-LABEL: test_vld2q_dup_fx_update
-; CHECK: ld2r { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}], #2
- %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
- %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
- %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0
- %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1,
i8** %ptr - ret { [2 x <16 x i8>] } %7 -} - -define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) { -; CHECK-LABEL: test_vld2q_dup_reg_update -; CHECK: ld2r { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = bitcast i32* %a to i8* - %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) - %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0 - %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer - %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1 - %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer - %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0 - %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1 - %tmp1 = getelementptr i32* %a, i32 %inc - store i32* %tmp1, i32** %ptr - ret { [2 x <4 x i32>] } %8 -} - -define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) { -; CHECK-LABEL: test_vld3_dup_fx_update -; CHECK: ld3r { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}], #6 - %1 = bitcast i16* %a to i8* - %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) - %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 - %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer - %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 - %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer - %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 - %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer - %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0 - %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1 - %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 - %tmp1 = getelementptr i16* %a, i32 3 - store i16* %tmp1, i16** %ptr - ret { [3 x <4 x i16>] } %11 -} - -define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) { -; CHECK-LABEL: test_vld3_dup_reg_update -; CHECK: ld3r { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) - %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 - %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer - %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 - %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer - %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 - %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer - %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0 - %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1 - %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2 - %tmp1 = getelementptr i8* %a, i32 %inc - store i8* %tmp1, i8** %ptr - ret { [3 x <8 x i8>] }%10 -} - -define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 { -; CHECK-LABEL: test_vld4_dup_fx_update -; CHECK: ld4r { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}], #16 - %1 = bitcast i32* %a to i8* - %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) - %3 = extractvalue { <2 x i32>, <2 x 
i32>, <2 x i32>, <2 x i32> } %2, 0 - %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer - %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 - %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer - %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 - %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer - %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3 - %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer - %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0 - %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1 - %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2 - %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 - %tmp1 = getelementptr i32* %a, i32 4 - store i32* %tmp1, i32** %ptr - ret { [4 x <2 x i32>] } %14 -} - -define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) { -; CHECK-LABEL: test_vld4_dup_reg_update -; CHECK: ld4r { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = bitcast double* %a to i8* - %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) - %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0 - %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer - %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1 - %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer - %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2 - %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer - %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3 - %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer - %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0 - %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1 - %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2 - %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 - %tmp1 = getelementptr double* %a, i32 %inc - store double* %tmp1, double** %ptr - ret { [4 x <2 x double>] } %14 -} - -define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { -; CHECK-LABEL: test_vld2_lane_fx_update -; CHECK: ld2 { v{{[0-9]+}}.b, v{{[0-9]+}}.b }[7], [x{{[0-9]+|sp}}], #2 - %1 = extractvalue [2 x <8 x i8>] %b, 0 - %2 = extractvalue [2 x <8 x i8>] %b, 1 - %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) - %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 - %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 - %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 - %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 - %tmp1 = getelementptr i8* %a, i32 2 - store i8* %tmp1, i8** %ptr - ret { [2 x <8 x i8>] } %7 -} - -define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) { -; CHECK-LABEL: test_vld2_lane_reg_update -; CHECK: ld2 { v{{[0-9]+}}.b, v{{[0-9]+}}.b }[6], [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [2 x <8 x i8>] %b, 0 - %2 = extractvalue [2 x <8 x i8>] %b, 1 - %3 = tail call 
{ <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1) - %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 - %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 - %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 - %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 - %tmp1 = getelementptr i8* %a, i32 %inc - store i8* %tmp1, i8** %ptr - ret { [2 x <8 x i8>] } %7 -} - -define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) { -; CHECK-LABEL: test_vld3_lane_fx_update -; CHECK: ld3 { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], #12 - %1 = extractvalue [3 x <2 x float>] %b, 0 - %2 = extractvalue [3 x <2 x float>] %b, 1 - %3 = extractvalue [3 x <2 x float>] %b, 2 - %4 = bitcast float* %a to i8* - %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4) - %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0 - %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1 - %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2 - %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0 - %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1 - %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2 - %tmp1 = getelementptr float* %a, i32 3 - store float* %tmp1, float** %ptr - ret { [3 x <2 x float>] } %11 -} - -define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { -; CHECK-LABEL: test_vld3_lane_reg_update -; CHECK: ld3 { v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h }[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [3 x <4 x i16>] %b, 0 - %2 = extractvalue [3 x <4 x i16>] %b, 1 - %3 = extractvalue [3 x <4 x i16>] %b, 2 - %4 = bitcast i16* %a to i8* - %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) - %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0 - %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1 - %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2 - %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0 - %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1 - %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 - %tmp1 = getelementptr i16* %a, i32 %inc - store i16* %tmp1, i16** %ptr - ret { [3 x <4 x i16>] } %11 -} - -define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) { -; CHECK-LABEL: test_vld4_lane_fx_update -; CHECK: ld4 { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], #16 - %1 = extractvalue [4 x <2 x i32>] %b, 0 - %2 = extractvalue [4 x <2 x i32>] %b, 1 - %3 = extractvalue [4 x <2 x i32>] %b, 2 - %4 = extractvalue [4 x <2 x i32>] %b, 3 - %5 = bitcast i32* %a to i8* - %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4) - %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0 - %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1 - %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2 - %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3 - %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0 - %12 = insertvalue { [4 x <2 x i32>] 
} %11, <2 x i32> %8, 0, 1 - %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2 - %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 - %tmp1 = getelementptr i32* %a, i32 4 - store i32* %tmp1, i32** %ptr - ret { [4 x <2 x i32>] } %14 -} - -define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) { -; CHECK-LABEL: test_vld4_lane_reg_update -; CHECK: ld4 { v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d }[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [4 x <2 x double>] %b, 0 - %2 = extractvalue [4 x <2 x double>] %b, 1 - %3 = extractvalue [4 x <2 x double>] %b, 2 - %4 = extractvalue [4 x <2 x double>] %b, 3 - %5 = bitcast double* %a to i8* - %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) - %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0 - %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1 - %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2 - %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3 - %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0 - %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1 - %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2 - %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 - %tmp1 = getelementptr double* %a, i32 %inc - store double* %tmp1, double** %ptr - ret { [4 x <2 x double>] } %14 -} - -define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { -; CHECK-LABEL: test_vst2_lane_fx_update -; CHECK: st2 { v{{[0-9]+}}.b, v{{[0-9]+}}.b }[7], [x{{[0-9]+|sp}}], #2 - %1 = extractvalue [2 x <8 x i8>] %b, 0 - %2 = extractvalue [2 x <8 x i8>] %b, 1 - call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) - %tmp1 = getelementptr i8* %a, i32 2 - store i8* %tmp1, i8** %ptr - ret void -} - -define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) { -; CHECK-LABEL: test_vst2_lane_reg_update -; CHECK: st2 { v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0 - %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1 - %3 = bitcast i32* %a to i8* - tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4) - %tmp1 = getelementptr i32* %a, i32 %inc - store i32* %tmp1, i32** %ptr - ret void -} - -define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) { -; CHECK-LABEL: test_vst3_lane_fx_update -; CHECK: st3 { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[3], [x{{[0-9]+|sp}}], #12 - %1 = extractvalue [3 x <4 x float>] %b, 0 - %2 = extractvalue [3 x <4 x float>] %b, 1 - %3 = extractvalue [3 x <4 x float>] %b, 2 - %4 = bitcast float* %a to i8* - call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4) - %tmp1 = getelementptr float* %a, i32 3 - store float* %tmp1, float** %ptr - ret void -} - -; Function Attrs: nounwind -define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { -; CHECK-LABEL: test_vst3_lane_reg_update -; CHECK: st3 { v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h }[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [3 x <4 x 
i16>] %b, 0 - %2 = extractvalue [3 x <4 x i16>] %b, 1 - %3 = extractvalue [3 x <4 x i16>] %b, 2 - %4 = bitcast i16* %a to i8* - tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) - %tmp1 = getelementptr i16* %a, i32 %inc - store i16* %tmp1, i16** %ptr - ret void -} - -define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) { -; CHECK-LABEL: test_vst4_lane_fx_update -; CHECK: st4 { v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d }[1], [x{{[0-9]+|sp}}], #32 - %1 = extractvalue [4 x <2 x double>] %b.coerce, 0 - %2 = extractvalue [4 x <2 x double>] %b.coerce, 1 - %3 = extractvalue [4 x <2 x double>] %b.coerce, 2 - %4 = extractvalue [4 x <2 x double>] %b.coerce, 3 - %5 = bitcast double* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) - %tmp1 = getelementptr double* %a, i32 4 - store double* %tmp1, double** %ptr - ret void -} - - -define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) { -; CHECK-LABEL: test_vst4_lane_reg_update -; CHECK: st4 { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} - %1 = extractvalue [4 x <2 x float>] %b.coerce, 0 - %2 = extractvalue [4 x <2 x float>] %b.coerce, 1 - %3 = extractvalue [4 x <2 x float>] %b.coerce, 2 - %4 = extractvalue [4 x <2 x float>] %b.coerce, 3 - %5 = bitcast float* %a to i8* - tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4) - %tmp1 = getelementptr float* %a, i32 %inc - store float* %tmp1, float** %ptr - ret void -} - -declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) -declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) -declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) -declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) -declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) -declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) -declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) diff --git a/test/CodeGen/AArch64/neon-simd-shift.ll b/test/CodeGen/AArch64/neon-simd-shift.ll deleted file mode 100644 index 5615e3c8361b..000000000000 --- a/test/CodeGen/AArch64/neon-simd-shift.ll +++ /dev/null @@ -1,1557 
+0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has separate copy of parts that aren't pure intrinsic wrangling. - -define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) { -; CHECK: test_vshr_n_s8 -; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vshr_n = ashr <8 x i8> %a, - ret <8 x i8> %vshr_n -} - -define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) { -; CHECK: test_vshr_n_s16 -; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vshr_n = ashr <4 x i16> %a, - ret <4 x i16> %vshr_n -} - -define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) { -; CHECK: test_vshr_n_s32 -; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vshr_n = ashr <2 x i32> %a, - ret <2 x i32> %vshr_n -} - -define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) { -; CHECK: test_vshrq_n_s8 -; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vshr_n = ashr <16 x i8> %a, - ret <16 x i8> %vshr_n -} - -define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) { -; CHECK: test_vshrq_n_s16 -; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vshr_n = ashr <8 x i16> %a, - ret <8 x i16> %vshr_n -} - -define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) { -; CHECK: test_vshrq_n_s32 -; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vshr_n = ashr <4 x i32> %a, - ret <4 x i32> %vshr_n -} - -define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) { -; CHECK: test_vshrq_n_s64 -; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vshr_n = ashr <2 x i64> %a, - ret <2 x i64> %vshr_n -} - -define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) { -; CHECK: test_vshr_n_u8 -; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vshr_n = lshr <8 x i8> %a, - ret <8 x i8> %vshr_n -} - -define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) { -; CHECK: test_vshr_n_u16 -; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vshr_n = lshr <4 x i16> %a, - ret <4 x i16> %vshr_n -} - -define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) { -; CHECK: test_vshr_n_u32 -; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vshr_n = lshr <2 x i32> %a, - ret <2 x i32> %vshr_n -} - -define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) { -; CHECK: test_vshrq_n_u8 -; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vshr_n = lshr <16 x i8> %a, - ret <16 x i8> %vshr_n -} - -define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) { -; CHECK: test_vshrq_n_u16 -; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vshr_n = lshr <8 x i16> %a, - ret <8 x i16> %vshr_n -} - -define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) { -; CHECK: test_vshrq_n_u32 -; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vshr_n = lshr <4 x i32> %a, - ret <4 x i32> %vshr_n -} - -define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) { -; CHECK: test_vshrq_n_u64 -; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vshr_n = lshr <2 x i64> %a, - ret <2 x i64> %vshr_n -} - -define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsra_n_s8 -; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vsra_n = ashr <8 x i8> %b, - %1 = add <8 x i8> %vsra_n, %a - ret <8 x i8> %1 -} - -define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsra_n_s16 -; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vsra_n = ashr <4 x i16> %b, - %1 = add <4 x i16> %vsra_n, %a - ret <4 x i16> %1 -} - -define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vsra_n_s32 -; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vsra_n = ashr <2 x i32> %b, - %1 = add <2 x i32> %vsra_n, %a - ret <2 x i32> %1 -} - -define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) { -; 
CHECK: test_vsraq_n_s8 -; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vsra_n = ashr <16 x i8> %b, - %1 = add <16 x i8> %vsra_n, %a - ret <16 x i8> %1 -} - -define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsraq_n_s16 -; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vsra_n = ashr <8 x i16> %b, - %1 = add <8 x i16> %vsra_n, %a - ret <8 x i16> %1 -} - -define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsraq_n_s32 -; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vsra_n = ashr <4 x i32> %b, - %1 = add <4 x i32> %vsra_n, %a - ret <4 x i32> %1 -} - -define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsraq_n_s64 -; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vsra_n = ashr <2 x i64> %b, - %1 = add <2 x i64> %vsra_n, %a - ret <2 x i64> %1 -} - -define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsra_n_u8 -; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vsra_n = lshr <8 x i8> %b, - %1 = add <8 x i8> %vsra_n, %a - ret <8 x i8> %1 -} - -define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsra_n_u16 -; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vsra_n = lshr <4 x i16> %b, - %1 = add <4 x i16> %vsra_n, %a - ret <4 x i16> %1 -} - -define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vsra_n_u32 -; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vsra_n = lshr <2 x i32> %b, - %1 = add <2 x i32> %vsra_n, %a - ret <2 x i32> %1 -} - -define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsraq_n_u8 -; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vsra_n = lshr <16 x i8> %b, - %1 = add <16 x i8> %vsra_n, %a - ret <16 x i8> %1 -} - -define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsraq_n_u16 -; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vsra_n = lshr <8 x i16> %b, - %1 = add <8 x i16> %vsra_n, %a - ret <8 x i16> %1 -} - -define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsraq_n_u32 -; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vsra_n = lshr <4 x i32> %b, - %1 = add <4 x i32> %vsra_n, %a - ret <4 x i32> %1 -} - -define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsraq_n_u64 -; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vsra_n = lshr <2 x i64> %b, - %1 = add <2 x i64> %vsra_n, %a - ret <2 x i64> %1 -} - -define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) { -; CHECK: test_vrshr_n_s8 -; CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %a, i32 3) - ret <8 x i8> %vrshr_n -} - - -define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) { -; CHECK: test_vrshr_n_s16 -; CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %a, i32 3) - ret <4 x i16> %vrshr_n -} - - -define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) { -; CHECK: test_vrshr_n_s32 -; CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %a, i32 3) - ret <2 x i32> %vrshr_n -} - - -define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) { -; CHECK: test_vrshrq_n_s8 -; CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %a, i32 3) - ret <16 x i8> %vrshr_n -} - - -define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) { -; CHECK: test_vrshrq_n_s16 -; CHECK: srshr 
{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %a, i32 3) - ret <8 x i16> %vrshr_n -} - - -define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) { -; CHECK: test_vrshrq_n_s32 -; CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %a, i32 3) - ret <4 x i32> %vrshr_n -} - - -define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) { -; CHECK: test_vrshrq_n_s64 -; CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %a, i32 3) - ret <2 x i64> %vrshr_n -} - - -define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) { -; CHECK: test_vrshr_n_u8 -; CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %a, i32 3) - ret <8 x i8> %vrshr_n -} - - -define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) { -; CHECK: test_vrshr_n_u16 -; CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %a, i32 3) - ret <4 x i16> %vrshr_n -} - - -define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) { -; CHECK: test_vrshr_n_u32 -; CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %a, i32 3) - ret <2 x i32> %vrshr_n -} - - -define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) { -; CHECK: test_vrshrq_n_u8 -; CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %a, i32 3) - ret <16 x i8> %vrshr_n -} - - -define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) { -; CHECK: test_vrshrq_n_u16 -; CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %a, i32 3) - ret <8 x i16> %vrshr_n -} - - -define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) { -; CHECK: test_vrshrq_n_u32 -; CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %a, i32 3) - ret <4 x i32> %vrshr_n -} - - -define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) { -; CHECK: test_vrshrq_n_u64 -; CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %a, i32 3) - ret <2 x i64> %vrshr_n -} - - -define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vrsra_n_s8 -; CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %1 = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %b, i32 3) - %vrsra_n = add <8 x i8> %1, %a - ret <8 x i8> %vrsra_n -} - -define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vrsra_n_s16 -; CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %1 = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %b, i32 3) - %vrsra_n = add <4 x i16> %1, %a - ret <4 x i16> %vrsra_n -} - -define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vrsra_n_s32 -; CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %1 = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %b, i32 3) - %vrsra_n = add <2 x i32> %1, %a - ret <2 x i32> %vrsra_n -} - -define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vrsraq_n_s8 -; CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %1 = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %b, i32 3) - %vrsra_n = add <16 x i8> %1, %a - ret <16 x i8> %vrsra_n -} - -define <8 x i16> 
@test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vrsraq_n_s16 -; CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %1 = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %b, i32 3) - %vrsra_n = add <8 x i16> %1, %a - ret <8 x i16> %vrsra_n -} - -define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vrsraq_n_s32 -; CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %1 = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %b, i32 3) - %vrsra_n = add <4 x i32> %1, %a - ret <4 x i32> %vrsra_n -} - -define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vrsraq_n_s64 -; CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %1 = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %b, i32 3) - %vrsra_n = add <2 x i64> %1, %a - ret <2 x i64> %vrsra_n -} - -define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vrsra_n_u8 -; CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %1 = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %b, i32 3) - %vrsra_n = add <8 x i8> %1, %a - ret <8 x i8> %vrsra_n -} - -define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vrsra_n_u16 -; CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %1 = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %b, i32 3) - %vrsra_n = add <4 x i16> %1, %a - ret <4 x i16> %vrsra_n -} - -define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vrsra_n_u32 -; CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %1 = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %b, i32 3) - %vrsra_n = add <2 x i32> %1, %a - ret <2 x i32> %vrsra_n -} - -define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vrsraq_n_u8 -; CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %1 = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %b, i32 3) - %vrsra_n = add <16 x i8> %1, %a - ret <16 x i8> %vrsra_n -} - -define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vrsraq_n_u16 -; CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %1 = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %b, i32 3) - %vrsra_n = add <8 x i16> %1, %a - ret <8 x i16> %vrsra_n -} - -define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vrsraq_n_u32 -; CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %1 = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %b, i32 3) - %vrsra_n = add <4 x i32> %1, %a - ret <4 x i32> %vrsra_n -} - -define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vrsraq_n_u64 -; CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %1 = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %b, i32 3) - %vrsra_n = add <2 x i64> %1, %a - ret <2 x i64> %vrsra_n -} - -define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsri_n_s8 -; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) - ret <8 x i8> %vsri_n -} - - -define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsri_n_s16 -; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3) - ret <4 x i16> %vsri -} - - -define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vsri_n_s32 -; CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vsri = tail call <2 x 
i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3) - ret <2 x i32> %vsri -} - - -define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsriq_n_s8 -; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) - ret <16 x i8> %vsri_n -} - - -define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsriq_n_s16 -; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3) - ret <8 x i16> %vsri -} - - -define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsriq_n_s32 -; CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vsri = tail call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3) - ret <4 x i32> %vsri -} - - -define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsriq_n_s64 -; CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vsri = tail call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3) - ret <2 x i64> %vsri -} - -define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsri_n_p8 -; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) - ret <8 x i8> %vsri_n -} - -define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsri_n_p16 -; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 - %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15) - ret <4 x i16> %vsri -} - -define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsriq_n_p8 -; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) - ret <16 x i8> %vsri_n -} - -define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsriq_n_p16 -; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 - %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15) - ret <8 x i16> %vsri -} - -define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsli_n_s8 -; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) - ret <8 x i8> %vsli_n -} - -define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsli_n_s16 -; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3) - ret <4 x i16> %vsli -} - -define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) { -; CHECK: test_vsli_n_s32 -; CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vsli = tail call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3) - ret <2 x i32> %vsli -} - -define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsliq_n_s8 -; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) - ret <16 x i8> %vsli_n -} - -define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsliq_n_s16 -; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3) - ret <8 x i16> %vsli -} - -define <4 x i32> 
@test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK: test_vsliq_n_s32 -; CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vsli = tail call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3) - ret <4 x i32> %vsli -} - -define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK: test_vsliq_n_s64 -; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vsli = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3) - ret <2 x i64> %vsli -} - -define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vsli_n_p8 -; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) - ret <8 x i8> %vsli_n -} - -define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) { -; CHECK: test_vsli_n_p16 -; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 - %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15) - ret <4 x i16> %vsli -} - -define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vsliq_n_p8 -; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) - ret <16 x i8> %vsli_n -} - -define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) { -; CHECK: test_vsliq_n_p16 -; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 - %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15) - ret <8 x i16> %vsli -} - -define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) { -; CHECK: test_vqshl_n_s8 -; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vqshl = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> ) - ret <8 x i8> %vqshl -} - - -define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) { -; CHECK: test_vqshl_n_s16 -; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> ) - ret <4 x i16> %vqshl -} - - -define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) { -; CHECK: test_vqshl_n_s32 -; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> ) - ret <2 x i32> %vqshl -} - - -define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) { -; CHECK: test_vqshlq_n_s8 -; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> ) - ret <16 x i8> %vqshl_n -} - - -define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) { -; CHECK: test_vqshlq_n_s16 -; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> ) - ret <8 x i16> %vqshl -} - - -define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) { -; CHECK: test_vqshlq_n_s32 -; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> ) - ret <4 x i32> %vqshl -} - - -define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) { -; CHECK: test_vqshlq_n_s64 -; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> ) - ret <2 x i64> %vqshl -} - - -define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) { -; CHECK: test_vqshl_n_u8 -; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> ) - ret <8 x i8> %vqshl_n -} - - -define <4 x 
i16> @test_vqshl_n_u16(<4 x i16> %a) { -; CHECK: test_vqshl_n_u16 -; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> ) - ret <4 x i16> %vqshl -} - - -define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) { -; CHECK: test_vqshl_n_u32 -; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> ) - ret <2 x i32> %vqshl -} - - -define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) { -; CHECK: test_vqshlq_n_u8 -; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> ) - ret <16 x i8> %vqshl_n -} - - -define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) { -; CHECK: test_vqshlq_n_u16 -; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> ) - ret <8 x i16> %vqshl -} - - -define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) { -; CHECK: test_vqshlq_n_u32 -; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> ) - ret <4 x i32> %vqshl -} - - -define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) { -; CHECK: test_vqshlq_n_u64 -; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> ) - ret <2 x i64> %vqshl -} - -define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) { -; CHECK: test_vqshlu_n_s8 -; CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 - %vqshlu = tail call <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8> %a, i32 3) - ret <8 x i8> %vqshlu -} - - -define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) { -; CHECK: test_vqshlu_n_s16 -; CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 - %vqshlu = tail call <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16> %a, i32 3) - ret <4 x i16> %vqshlu -} - - -define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) { -; CHECK: test_vqshlu_n_s32 -; CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 - %vqshlu = tail call <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32> %a, i32 3) - ret <2 x i32> %vqshlu -} - - -define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) { -; CHECK: test_vqshluq_n_s8 -; CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 - %vqshlu = tail call <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8> %a, i32 3) - ret <16 x i8> %vqshlu -} - - -define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) { -; CHECK: test_vqshluq_n_s16 -; CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 - %vqshlu = tail call <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16> %a, i32 3) - ret <8 x i16> %vqshlu -} - - -define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) { -; CHECK: test_vqshluq_n_s32 -; CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 - %vqshlu = tail call <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32> %a, i32 3) - ret <4 x i32> %vqshlu -} - - -define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) { -; CHECK: test_vqshluq_n_s64 -; CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 - %vqshlu = tail call <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64> %a, i32 3) - ret <2 x i64> %vqshlu -} - - -define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) { -; CHECK: test_vshrn_n_s16 -; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %1 = ashr <8 x i16> %a, - %vshrn_n = trunc <8 x i16> %1 to <8 x i8> - ret <8 x i8> %vshrn_n -} - -define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) { -; CHECK: test_vshrn_n_s32 -; CHECK: shrn 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %1 = ashr <4 x i32> %a, - %vshrn_n = trunc <4 x i32> %1 to <4 x i16> - ret <4 x i16> %vshrn_n -} - -define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) { -; CHECK: test_vshrn_n_s64 -; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %1 = ashr <2 x i64> %a, - %vshrn_n = trunc <2 x i64> %1 to <2 x i32> - ret <2 x i32> %vshrn_n -} - -define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) { -; CHECK: test_vshrn_n_u16 -; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %1 = lshr <8 x i16> %a, - %vshrn_n = trunc <8 x i16> %1 to <8 x i8> - ret <8 x i8> %vshrn_n -} - -define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) { -; CHECK: test_vshrn_n_u32 -; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %1 = lshr <4 x i32> %a, - %vshrn_n = trunc <4 x i32> %1 to <4 x i16> - ret <4 x i16> %vshrn_n -} - -define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) { -; CHECK: test_vshrn_n_u64 -; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %1 = lshr <2 x i64> %a, - %vshrn_n = trunc <2 x i64> %1 to <2 x i32> - ret <2 x i32> %vshrn_n -} - -define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vshrn_high_n_s16 -; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %1 = ashr <8 x i16> %b, - %vshrn_n = trunc <8 x i16> %1 to <8 x i8> - %2 = bitcast <8 x i8> %a to <1 x i64> - %3 = bitcast <8 x i8> %vshrn_n to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> - %4 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %4 -} - -define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vshrn_high_n_s32 -; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %1 = ashr <4 x i32> %b, - %vshrn_n = trunc <4 x i32> %1 to <4 x i16> - %2 = bitcast <4 x i16> %a to <1 x i64> - %3 = bitcast <4 x i16> %vshrn_n to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> - %4 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %4 -} - -define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vshrn_high_n_s64 -; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %2 = ashr <2 x i64> %b, - %vshrn_n = trunc <2 x i64> %2 to <2 x i32> - %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> - %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %4 -} - -define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vshrn_high_n_u16 -; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %1 = lshr <8 x i16> %b, - %vshrn_n = trunc <8 x i16> %1 to <8 x i8> - %2 = bitcast <8 x i8> %a to <1 x i64> - %3 = bitcast <8 x i8> %vshrn_n to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> - %4 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %4 -} - -define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vshrn_high_n_u32 -; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %1 = lshr <4 x i32> %b, - %vshrn_n = trunc <4 x i32> %1 to <4 x i16> - %2 = bitcast <4 x i16> %a to <1 x i64> - %3 = bitcast <4 x i16> %vshrn_n to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> - %4 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %4 -} - -define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vshrn_high_n_u64 -; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %2 = lshr <2 x i64> %b, - %vshrn_n = 
trunc <2 x i64> %2 to <2 x i32> - %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> - %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %4 -} - -define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) { -; CHECK: test_vqshrun_n_s16 -; CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vqshrun -} - - -define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) { -; CHECK: test_vqshrun_n_s32 -; CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vqshrun -} - -define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) { -; CHECK: test_vqshrun_n_s64 -; CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vqshrun -} - -define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vqshrun_high_n_s16 -; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vqshrun to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vqshrun_high_n_s32 -; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vqshrun to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vqshrun_high_n_s64 -; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vqshrun to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) { -; CHECK: test_vrshrn_n_s16 -; CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vrshrn -} - - -define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) { -; CHECK: test_vrshrn_n_s32 -; CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vrshrn -} - - -define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) { -; CHECK: test_vrshrn_n_s64 -; CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vrshrn -} - -define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vrshrn_high_n_s16 -; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = 
bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vrshrn_high_n_s32 -; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vrshrn_high_n_s64 -; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) { -; CHECK: test_vqrshrun_n_s16 -; CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vqrshrun -} - -define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) { -; CHECK: test_vqrshrun_n_s32 -; CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vqrshrun -} - -define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) { -; CHECK: test_vqrshrun_n_s64 -; CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vqrshrun -} - -define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vqrshrun_high_n_s16 -; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vqrshrun to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vqrshrun_high_n_s32 -; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vqrshrun to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vqrshrun_high_n_s64 -; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vqrshrun to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) { -; CHECK: test_vqshrn_n_s16 -; CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vqshrn -} - - -define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) { -; CHECK: 
test_vqshrn_n_s32 -; CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vqshrn -} - - -define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) { -; CHECK: test_vqshrn_n_s64 -; CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vqshrn -} - - -define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) { -; CHECK: test_vqshrn_n_u16 -; CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vqshrn -} - - -define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) { -; CHECK: test_vqshrn_n_u32 -; CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vqshrn -} - - -define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) { -; CHECK: test_vqshrn_n_u64 -; CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vqshrn -} - - -define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vqshrn_high_n_s16 -; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vqshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vqshrn_high_n_s32 -; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vqshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vqshrn_high_n_s64 -; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vqshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vqshrn_high_n_u16 -; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vqshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vqshrn_high_n_u32 -; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vqshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> 
@test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vqshrn_high_n_u64 -; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vqshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) { -; CHECK: test_vqrshrn_n_s16 -; CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vqrshrn -} - - -define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) { -; CHECK: test_vqrshrn_n_s32 -; CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vqrshrn -} - - -define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) { -; CHECK: test_vqrshrn_n_s64 -; CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vqrshrn -} - - -define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) { -; CHECK: test_vqrshrn_n_u16 -; CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 - %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %a, i32 3) - ret <8 x i8> %vqrshrn -} - - -define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) { -; CHECK: test_vqrshrn_n_u32 -; CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 - %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %a, i32 9) - ret <4 x i16> %vqrshrn -} - - -define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) { -; CHECK: test_vqrshrn_n_u64 -; CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 - %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %a, i32 19) - ret <2 x i32> %vqrshrn -} - - -define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vqrshrn_high_n_s16 -; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vqrshrn_high_n_s32 -; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vqrshrn_high_n_s64 -; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { -; CHECK: test_vqrshrn_high_n_u16 -; CHECK: uqrshrn2 
{{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 - %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %b, i32 3) - %1 = bitcast <8 x i8> %a to <1 x i64> - %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { -; CHECK: test_vqrshrn_high_n_u32 -; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 - %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %b, i32 9) - %1 = bitcast <4 x i16> %a to <1 x i64> - %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> - ret <8 x i16> %3 -} - -define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { -; CHECK: test_vqrshrn_high_n_u64 -; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 - %1 = bitcast <2 x i32> %a to <1 x i64> - %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %b, i32 19) - %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> - %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> - %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> - ret <4 x i32> %3 -} - -define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) { -; CHECK: test_vcvt_n_f32_s32 -; CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 - %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 31) - ret <2 x float> %vcvt -} - -define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) { -; CHECK: test_vcvtq_n_f32_s32 -; CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 - %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 31) - ret <4 x float> %vcvt -} - -define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) { -; CHECK: test_vcvtq_n_f64_s64 -; CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 - %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 50) - ret <2 x double> %vcvt -} - -define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) { -; CHECK: test_vcvt_n_f32_u32 -; CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 - %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 31) - ret <2 x float> %vcvt -} - -define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) { -; CHECK: test_vcvtq_n_f32_u32 -; CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 - %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 31) - ret <4 x float> %vcvt -} - -define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) { -; CHECK: test_vcvtq_n_f64_u64 -; CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 - %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 50) - ret <2 x double> %vcvt -} - -define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) { -; CHECK: test_vcvt_n_s32_f32 -; CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 - %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 31) - ret <2 x i32> %vcvt -} - -define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) { -; CHECK: test_vcvtq_n_s32_f32 -; CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 - %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 31) - ret <4 x i32> %vcvt -} - -define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) { -; CHECK: test_vcvtq_n_s64_f64 -; CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 - %vcvt = 
tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %a, i32 50) - ret <2 x i64> %vcvt -} - -define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) { -; CHECK: test_vcvt_n_u32_f32 -; CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 - %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 31) - ret <2 x i32> %vcvt -} - -define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) { -; CHECK: test_vcvt_n_u32_f32 -; CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 - %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 31) - ret <4 x i32> %vcvt -} - -define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) { -; CHECK: test_vcvtq_n_u64_f64 -; CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 - %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %a, i32 50) - ret <2 x i64> %vcvt -} - -declare <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32>, i32) - -declare <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8>, i32) - -declare <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16>, i32) - -declare <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32>, i32) - -declare <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32>, i32) - -declare <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8>, i32) - -declare <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16>, i32) - -declare <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32>, i32) - -declare <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8>, <8 x i8>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16>, <4 x i16>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32>, <2 x i32>, i32) - -declare <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8>, <16 x i8>, i32) - -declare <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16>, <8 x i16>, i32) - -declare <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32>, <4 x i32>, i32) - -declare <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64>, <2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) - -declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) - -declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) - -declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) - -declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32>, i32) - -declare <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8>, i32) - -declare <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16>, i32) - -declare <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32>, i32) - -declare <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64>, i32) - -declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) - -declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 
x i16>) - -declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) - -declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) - -declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) - -declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) - -declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) - -declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) - -declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) - -declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) - -declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) - -declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) - -declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) - -declare <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64>, i32) - -declare <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16>, i32) - -declare <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32>, i32) - -declare <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64>, i32) - -declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) - -declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) - -declare <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) - -declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) - -declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) - -declare <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) - -declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) - -declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) - -declare <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) - -declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) - -declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) - -declare <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) - -define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvt_n_s64_f64 -; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64) - ret <1 x i64> %1 -} - -define <1 x i64> 
@test_vcvt_n_u64_f64(<1 x double> %a) { -; CHECK-LABEL: test_vcvt_n_u64_f64 -; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64) - ret <1 x i64> %1 -} - -define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) { -; CHECK-LABEL: test_vcvt_n_f64_s64 -; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64) - ret <1 x double> %1 -} - -define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) { -; CHECK-LABEL: test_vcvt_n_f64_u64 -; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 - %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64) - ret <1 x double> %1 -} - -declare <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32) -declare <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32) -declare <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32) -declare <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32) diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll deleted file mode 100644 index 53924923f795..000000000000 --- a/test/CodeGen/AArch64/neon-simd-tbl.ll +++ /dev/null @@ -1,829 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; This test is just intrinsic pumping. arm64 has its own tbl/tbx tests. - -declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8>, <8 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - -declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8>, <16 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) - -declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) - -define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vtbl1_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) - ret <8 x i8> %vtbl11.i -} - -define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) { -; CHECK: test_vqtbl1_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b 
-entry: - %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b) - ret <8 x i8> %vtbl1.i -} - -define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl2_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 - %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) - ret <8 x i8> %vtbl17.i -} - -define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl2_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl2.i -} - -define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl3_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 - %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) - ret <8 x i8> %vtbl212.i -} - -define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl3_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl3.i -} - -define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl4_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 - %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> - %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) - ret <8 x i8> %vtbl216.i -} - -define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, 
<8 x i8> %b) { -; CHECK: test_vqtbl4_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl4.i -} - -define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vqtbl1q_s8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %vtbl1.i -} - -define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl2q_s8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl2.i -} - -define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl3q_s8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl3.i -} - -define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl4q_s8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl4.i -} - -define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vtbx1_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c) - %0 = icmp uge <8 x i8> %c, - %1 = sext <8 x i1> %0 to <8 x i8> - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> 
@test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx2_s8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 - %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) - ret <8 x i8> %vtbx17.i -} - -define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx3_s8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 - %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) - %0 = icmp uge <8 x i8> %c, - %1 = sext <8 x i1> %0 to <8 x i8> - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx4_s8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 - %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> - %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) - ret <8 x i8> %vtbx216.i -} - -define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { -; CHECK: test_vqtbx1_s8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) - ret <8 x i8> %vtbx1.i -} - -define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx2_s8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx2.i -} - -define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx3_s8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] 
%b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx3.i -} - -define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx4_s8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx4.i -} - -define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vqtbx1q_s8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %vtbx1.i -} - -define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx2q_s8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx2.i -} - -define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx3q_s8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx3.i -} - -define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx4q_s8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x 
i8> %c) - ret <16 x i8> %vtbx4.i -} - -define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vtbl1_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) - ret <8 x i8> %vtbl11.i -} - -define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) { -; CHECK: test_vqtbl1_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b) - ret <8 x i8> %vtbl1.i -} - -define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl2_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 - %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) - ret <8 x i8> %vtbl17.i -} - -define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl2_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl2.i -} - -define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl3_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 - %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) - ret <8 x i8> %vtbl212.i -} - -define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl3_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl3.i -} - -define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl4_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = 
extractvalue [4 x <8 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 - %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> - %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) - ret <8 x i8> %vtbl216.i -} - -define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl4_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl4.i -} - -define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vqtbl1q_u8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %vtbl1.i -} - -define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl2q_u8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl2.i -} - -define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl3q_u8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl3.i -} - -define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl4q_u8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl4.i -} - 
-define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vtbx1_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c) - %0 = icmp uge <8 x i8> %c, - %1 = sext <8 x i1> %0 to <8 x i8> - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx2_u8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 - %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) - ret <8 x i8> %vtbx17.i -} - -define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx3_u8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 - %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) - %0 = icmp uge <8 x i8> %c, - %1 = sext <8 x i1> %0 to <8 x i8> - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx4_u8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 - %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> - %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) - ret <8 x i8> %vtbx216.i -} - -define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { -; CHECK: test_vqtbx1_u8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) - ret <8 x i8> %vtbx1.i -} - -define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx2_u8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 - 
%__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx2.i -} - -define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx3_u8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx3.i -} - -define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx4_u8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx4.i -} - -define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vqtbx1q_u8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %vtbx1.i -} - -define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx2q_u8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx2.i -} - -define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx3q_u8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx3.i -} - -define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx4q_u8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, 
{{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx4.i -} - -define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) { -; CHECK: test_vtbl1_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) - ret <8 x i8> %vtbl11.i -} - -define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) { -; CHECK: test_vqtbl1_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b) - ret <8 x i8> %vtbl1.i -} - -define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl2_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 - %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b) - ret <8 x i8> %vtbl17.i -} - -define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl2_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl2.i -} - -define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl3_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 - %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) - ret <8 x i8> %vtbl212.i -} - -define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl3_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <8 x 
i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl3.i -} - -define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vtbl4_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 - %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> - %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> - %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) - ret <8 x i8> %vtbl216.i -} - -define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) { -; CHECK: test_vqtbl4_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) - ret <8 x i8> %vtbl4.i -} - -define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) { -; CHECK: test_vqtbl1q_p8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %vtbl1.i -} - -define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl2q_p8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 - %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl2.i -} - -define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl3q_p8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 - %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl3.i -} - -define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { -; CHECK: test_vqtbl4q_p8: -; CHECK: tbl {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - 
%__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 - %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 - %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 - %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 - %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) - ret <16 x i8> %vtbl4.i -} - -define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK: test_vtbx1_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> - %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c) - %0 = icmp uge <8 x i8> %c, - %1 = sext <8 x i1> %0 to <8 x i8> - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx2_p8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 - %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) - ret <8 x i8> %vtbx17.i -} - -define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx3_p8: -; CHECK: tbl {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 - %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> - %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) - %0 = icmp uge <8 x i8> %c, - %1 = sext <8 x i1> %0 to <8 x i8> - %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) - ret <8 x i8> %vbsl.i -} - -define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vtbx4_p8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 - %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> - %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> - %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) - ret <8 x i8> %vtbx216.i -} - -define <8 x i8> 
@test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { -; CHECK: test_vqtbx1_p8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) - ret <8 x i8> %vtbx1.i -} - -define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx2_p8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx2.i -} - -define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx3_p8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx3.i -} - -define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) { -; CHECK: test_vqtbx4_p8: -; CHECK: tbx {{v[0-9]+}}.8b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.8b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) - ret <8 x i8> %vtbx4.i -} - -define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { -; CHECK: test_vqtbx1q_p8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %vtbx1.i -} - -define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx2q_p8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 - %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx2.i -} - -define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx3q_p8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [3 x 
<16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 - %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx3.i -} - -define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { -; CHECK: test_vqtbx4q_p8: -; CHECK: tbx {{v[0-9]+}}.16b, { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, {{v[0-9]+}}.16b -entry: - %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 - %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 - %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 - %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 - %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) - ret <16 x i8> %vtbx4.i -} - diff --git a/test/CodeGen/AArch64/neon-simd-vget.ll b/test/CodeGen/AArch64/neon-simd-vget.ll deleted file mode 100644 index 93d5e2ad3455..000000000000 --- a/test/CodeGen/AArch64/neon-simd-vget.ll +++ /dev/null @@ -1,226 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; arm64 has its own copy: aarch64-neon-simd-vget.ll - -define <8 x i8> @test_vget_high_s8(<16 x i8> %a) { -; CHECK-LABEL: test_vget_high_s8: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vget_high_s16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_high_s16: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x i32> @test_vget_high_s32(<4 x i32> %a) { -; CHECK-LABEL: test_vget_high_s32: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle.i -} - -define <1 x i64> @test_vget_high_s64(<2 x i64> %a) { -; CHECK-LABEL: test_vget_high_s64: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> - ret <1 x i64> %shuffle.i -} - -define <8 x i8> @test_vget_high_u8(<16 x i8> %a) { -; CHECK-LABEL: test_vget_high_u8: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vget_high_u16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_high_u16: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x i32> @test_vget_high_u32(<4 x i32> %a) { -; CHECK-LABEL: test_vget_high_u32: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle.i -} - -define <1 x i64> @test_vget_high_u64(<2 x i64> %a) { -; CHECK-LABEL: test_vget_high_u64: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> - ret <1 x i64> %shuffle.i -} - -define <1 x i64> @test_vget_high_p64(<2 x i64> %a) { -; CHECK-LABEL: test_vget_high_p64: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - 
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> - ret <1 x i64> %shuffle.i -} - -define <4 x i16> @test_vget_high_f16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_high_f16: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x float> @test_vget_high_f32(<4 x float> %a) { -; CHECK-LABEL: test_vget_high_f32: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> - ret <2 x float> %shuffle.i -} - -define <8 x i8> @test_vget_high_p8(<16 x i8> %a) { -; CHECK-LABEL: test_vget_high_p8: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vget_high_p16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_high_p16: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <1 x double> @test_vget_high_f64(<2 x double> %a) { -; CHECK-LABEL: test_vget_high_f64: -; CHECK: dup d0, {{v[0-9]+}}.d[1] -entry: - %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> - ret <1 x double> %shuffle.i -} - -define <8 x i8> @test_vget_low_s8(<16 x i8> %a) { -; CHECK-LABEL: test_vget_low_s8: -; CHECK: ret -entry: - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vget_low_s16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_low_s16: -; CHECK: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x i32> @test_vget_low_s32(<4 x i32> %a) { -; CHECK-LABEL: test_vget_low_s32: -; CHECK: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle.i -} - -define <1 x i64> @test_vget_low_s64(<2 x i64> %a) { -; CHECK-LABEL: test_vget_low_s64: -; CHECK: ret -entry: - %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer - ret <1 x i64> %shuffle.i -} - -define <8 x i8> @test_vget_low_u8(<16 x i8> %a) { -; CHECK-LABEL: test_vget_low_u8: -; CHECK: ret -entry: - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vget_low_u16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_low_u16: -; CHECK: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x i32> @test_vget_low_u32(<4 x i32> %a) { -; CHECK-LABEL: test_vget_low_u32: -; CHECK: ret -entry: - %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - ret <2 x i32> %shuffle.i -} - -define <1 x i64> @test_vget_low_u64(<2 x i64> %a) { -; CHECK-LABEL: test_vget_low_u64: -; CHECK: ret -entry: - %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer - ret <1 x i64> %shuffle.i -} - -define <1 x i64> @test_vget_low_p64(<2 x i64> %a) { -; CHECK-LABEL: test_vget_low_p64: -; CHECK: ret -entry: - %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer - ret <1 x i64> %shuffle.i -} - -define <4 x i16> @test_vget_low_f16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_low_f16: -; CHECK: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <2 x float> @test_vget_low_f32(<4 x float> %a) { -; CHECK-LABEL: test_vget_low_f32: -; 
CHECK: ret -entry: - %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> - ret <2 x float> %shuffle.i -} - -define <8 x i8> @test_vget_low_p8(<16 x i8> %a) { -; CHECK-LABEL: test_vget_low_p8: -; CHECK: ret -entry: - %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - ret <8 x i8> %shuffle.i -} - -define <4 x i16> @test_vget_low_p16(<8 x i16> %a) { -; CHECK-LABEL: test_vget_low_p16: -; CHECK: ret -entry: - %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - ret <4 x i16> %shuffle.i -} - -define <1 x double> @test_vget_low_f64(<2 x double> %a) { -; CHECK-LABEL: test_vget_low_f64: -; CHECK: ret -entry: - %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> zeroinitializer - ret <1 x double> %shuffle.i -} diff --git a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll deleted file mode 100644 index 142b0a8bd537..000000000000 --- a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s -; not relevant for arm64: <1 x iN> isn't legal - -; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the -; allocator to keep the value live until it's needed. - -%bigtype_v1i8 = type [20 x <1 x i8>] - -define void @spill_fpr8(%bigtype_v1i8* %addr) { -; CHECK-LABEL: spill_fpr8: -; CHECK: 1-byte Folded Spill -; CHECK: 1-byte Folded Reload - %val1 = load volatile %bigtype_v1i8* %addr - %val2 = load volatile %bigtype_v1i8* %addr - store volatile %bigtype_v1i8 %val1, %bigtype_v1i8* %addr - store volatile %bigtype_v1i8 %val2, %bigtype_v1i8* %addr - ret void -} - -%bigtype_v1i16 = type [20 x <1 x i16>] - -define void @spill_fpr16(%bigtype_v1i16* %addr) { -; CHECK-LABEL: spill_fpr16: -; CHECK: 2-byte Folded Spill -; CHECK: 2-byte Folded Reload - %val1 = load volatile %bigtype_v1i16* %addr - %val2 = load volatile %bigtype_v1i16* %addr - store volatile %bigtype_v1i16 %val1, %bigtype_v1i16* %addr - store volatile %bigtype_v1i16 %val2, %bigtype_v1i16* %addr - ret void -} diff --git a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll index dbaccacdf554..1df3719c8867 100644 --- a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll +++ b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s ; A vector TruncStore can not be selected. ; Test a trunc IR and a vector store IR can be selected correctly. diff --git a/test/CodeGen/AArch64/neon-v1i1-setcc.ll b/test/CodeGen/AArch64/neon-v1i1-setcc.ll deleted file mode 100644 index 114e44ac8bf0..000000000000 --- a/test/CodeGen/AArch64/neon-v1i1-setcc.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; arm64 has a separate copy as aarch64-neon-v1i1-setcc.ll - -; This file test the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type -; is illegal in AArch64 backend, the legalizer tries to scalarize this node. -; As the v1i64 operands of SETCC are legal types, they will not be scalarized. -; Currently the type legalizer will have an assertion failure as it assumes all -; operands of SETCC have been legalized. 
-; FIXME: If the algorithm of type scalarization is improved and can legaize -; "v1i1 SETCC" correctly, these test cases are not needed. - -define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) { -; CHECK-LABEL: test_sext_extr_cmp_0: -; CHECK: cmge d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = icmp sge <1 x i64> %v1, %v2 - %2 = extractelement <1 x i1> %1, i32 0 - %vget_lane = sext i1 %2 to i64 - ret i64 %vget_lane -} - -define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) { -; CHECK-LABEL: test_sext_extr_cmp_1: -; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} - %1 = fcmp oeq <1 x double> %v1, %v2 - %2 = extractelement <1 x i1> %1, i32 0 - %vget_lane = sext i1 %2 to i64 - ret i64 %vget_lane -} - -define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { -; CHECK-LABEL: test_select_v1i1_0: -; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %1 = icmp eq <1 x i64> %v1, %v2 - %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 - ret <1 x i64> %res -} - -define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) { -; CHECK-LABEL: test_select_v1i1_1: -; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %1 = fcmp oeq <1 x double> %v1, %v2 - %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 - ret <1 x i64> %res -} - -define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) { -; CHECK-LABEL: test_select_v1i1_2: -; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b - %1 = icmp eq <1 x i64> %v1, %v2 - %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3 - ret <1 x double> %res -} - -define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) { -; CHECK-LABEL: test_br_extr_cmp: -; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}} - %1 = icmp eq <1 x i64> %v1, %v2 - %2 = extractelement <1 x i1> %1, i32 0 - br i1 %2, label %if.end, label %if.then - -if.then: - ret i32 0; - -if.end: - ret i32 1; -} diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/neon-vector-list-spill.ll deleted file mode 100644 index 5df0aacb38af..000000000000 --- a/test/CodeGen/AArch64/neon-vector-list-spill.ll +++ /dev/null @@ -1,176 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast -; arm64 has separate copy as aarch64-neon-vector-list-spill.ll - -; FIXME: We should not generate ld/st for such register spill/fill, because the -; test case seems very simple and the register pressure is not high. If the -; spill/fill algorithm is optimized, this test case may not be triggered. And -; then we can delete it. 
-define i32 @spill.DPairReg(i8* %arg1, i32 %arg2) { -; CHECK-LABEL: spill.DPairReg: -; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -entry: - %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %arg1, i32 4) - %cmp = icmp eq i32 %arg2, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: - tail call void @foo() - br label %if.end - -if.end: - %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0 - %res = extractelement <2 x i32> %vld.extract, i32 1 - ret i32 %res -} - -define i16 @spill.DTripleReg(i8* %arg1, i32 %arg2) { -; CHECK-LABEL: spill.DTripleReg: -; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -entry: - %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %arg1, i32 4) - %cmp = icmp eq i32 %arg2, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: - tail call void @foo() - br label %if.end - -if.end: - %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0 - %res = extractelement <4 x i16> %vld.extract, i32 1 - ret i16 %res -} - -define i16 @spill.DQuadReg(i8* %arg1, i32 %arg2) { -; CHECK-LABEL: spill.DQuadReg: -; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -entry: - %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %arg1, i32 4) - %cmp = icmp eq i32 %arg2, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: - tail call void @foo() - br label %if.end - -if.end: - %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0 - %res = extractelement <4 x i16> %vld.extract, i32 0 - ret i16 %res -} - -define i32 @spill.QPairReg(i8* %arg1, i32 %arg2) { -; CHECK-LABEL: spill.QPairReg: -; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -entry: - %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %arg1, i32 4) - %cmp = icmp eq i32 %arg2, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: - tail call void @foo() - br label %if.end - -if.end: - %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0 - %res = extractelement <4 x i32> %vld.extract, i32 1 - ret i32 %res -} - -define float @spill.QTripleReg(i8* %arg1, i32 %arg2) { -; CHECK-LABEL: spill.QTripleReg: -; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -entry: - %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %arg1, i32 4) - %cmp = icmp eq i32 %arg2, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: - tail call void @foo() - br label %if.end - -if.end: - %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0 - %res = extractelement <4 x float> 
%vld3.extract, i32 1 - ret float %res -} - -define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) { -; CHECK-LABEL: spill.QQuadReg: -; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] -entry: - %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %arg1, i32 4) - %cmp = icmp eq i32 %arg2, 0 - br i1 %cmp, label %if.then, label %if.end - -if.then: - tail call void @foo() - br label %if.end - -if.end: - %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0 - %res = extractelement <16 x i8> %vld.extract, i32 1 - ret i8 %res -} - -declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) -declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32) -declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32) -declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32) -declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) - -declare void @foo() - -; FIXME: We should not generate ld/st for such register spill/fill, because the -; test case seems very simple and the register pressure is not high. If the -; spill/fill algorithm is optimized, this test case may not be triggered. And -; then we can delete it. -; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo -define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { - tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) - tail call void @foo() - %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> - %1 = bitcast <2 x i64> %sv to <8 x i16> - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> - %3 = mul <8 x i16> %2, %2 - ret <8 x i16> %3 -} - -; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo -define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { - tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) - tail call void @foo() - %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> - %1 = bitcast <2 x i64> %sv to <8 x i16> - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> - %3 = mul <8 x i16> %2, %2 - ret <8 x i16> %3 -} - -; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo -define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) { - tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8) - tail call void @foo() - %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> - %1 = bitcast <2 x i64> %sv to <8 x i16> - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> - %3 = mul <8 x i16> %2, %2 - ret <8 x i16> %3 -} - -declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) -declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) -declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) diff --git 
a/test/CodeGen/AArch64/nzcv-save.ll b/test/CodeGen/AArch64/nzcv-save.ll new file mode 100644 index 000000000000..32baff3dbe64 --- /dev/null +++ b/test/CodeGen/AArch64/nzcv-save.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=aarch64 < %s | FileCheck %s + +; CHECK: mrs [[NZCV_SAVE:x[0-9]+]], NZCV +; CHECK: msr NZCV, [[NZCV_SAVE]] + +; DAG ends up with two uses for the flags from an ADCS node, which means they +; must be saved for later. +define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp { +entry: + %c = load i256* %cc + %d = load i256* %dd + %add = add nsw i256 %c, %d + store i256 %add, i256* %a, align 8 + %or = or i256 %c, 1606938044258990275541962092341162602522202993782792835301376 + %add6 = add nsw i256 %or, %d + store i256 %add6, i256* %b, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll index 399d1c1123fb..e8c762504fc9 100644 --- a/test/CodeGen/AArch64/pic-eh-stubs.ll +++ b/test/CodeGen/AArch64/pic-eh-stubs.ll @@ -1,6 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s ; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s ; Make sure exception-handling PIC code can be linked correctly. An alternative diff --git a/test/CodeGen/AArch64/ragreedy-csr.ll b/test/CodeGen/AArch64/ragreedy-csr.ll index 20e1b30d74d8..de29b1baa8d5 100644 --- a/test/CodeGen/AArch64/ragreedy-csr.ll +++ b/test/CodeGen/AArch64/ragreedy-csr.ll @@ -1,4 +1,3 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s ; This testing case is reduced from 197.parser prune_match function. diff --git a/test/CodeGen/AArch64/regress-bitcast-formals.ll b/test/CodeGen/AArch64/regress-bitcast-formals.ll index 7f3ba7276b50..58e0542d84f5 100644 --- a/test/CodeGen/AArch64/regress-bitcast-formals.ll +++ b/test/CodeGen/AArch64/regress-bitcast-formals.ll @@ -1,4 +1,3 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=arm64-apple-ios7.0 -verify-machineinstrs < %s | FileCheck %s ; CallingConv.td requires a bitcast for vector arguments. Make sure we're diff --git a/test/CodeGen/AArch64/regress-f128csel-flags.ll b/test/CodeGen/AArch64/regress-f128csel-flags.ll index a7352d6815ee..25b5e0c5f776 100644 --- a/test/CodeGen/AArch64/regress-f128csel-flags.ll +++ b/test/CodeGen/AArch64/regress-f128csel-flags.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s ; We used to not mark NZCV as being used in the continuation basic-block ; when lowering a 128-bit "select" to branches. 
This meant a subsequent use diff --git a/test/CodeGen/AArch64/regress-fp128-livein.ll b/test/CodeGen/AArch64/regress-fp128-livein.ll index 5c2142aeeeb4..5e6ab0a9675b 100644 --- a/test/CodeGen/AArch64/regress-fp128-livein.ll +++ b/test/CodeGen/AArch64/regress-fp128-livein.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s ; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB, ; causing a crash during live range calc. diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll index 4a6ad55b6796..03c3f33d9477 100644 --- a/test/CodeGen/AArch64/regress-tail-livereg.ll +++ b/test/CodeGen/AArch64/regress-tail-livereg.ll @@ -1,4 +1,3 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s @var = global void()* zeroinitializer @@ -18,3 +17,17 @@ define void @foo() { ; CHECK: br {{x([0-79]|1[0-8])}} ret void } + +; No matter how tempting it is, LLVM should not use x30 since that'll be +; restored to its incoming value before the "br". +define void @test_x30_tail() { +; CHECK-LABEL: test_x30_tail: +; CHECK: mov [[DEST:x[0-9]+]], x30 +; CHECK: br [[DEST]] + %addr = call i8* @llvm.returnaddress(i32 0) + %faddr = bitcast i8* %addr to void()* + tail call void %faddr() + ret void +} + +declare i8* @llvm.returnaddress(i32) diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll index 1f8ad4503c41..477d99625eec 100644 --- a/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix CHECK-AARCH64 -; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefix CHECK-ARM64 +; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s ; When generating DAG selection tables, TableGen used to only flag an ; instruction as needing a chain on its own account if it had a built-in pattern @@ -13,8 +12,7 @@ declare void @bar(i8*) define i64 @test_chains() { -; CHECK-AARCH64-LABEL: test_chains: -; CHECK-ARM64-LABEL: test_chains: +; CHECK-LABEL: test_chains: %locvar = alloca i8 @@ -26,19 +24,14 @@ define i64 @test_chains() { %inc.3 = add i64 %inc.2, 1 %inc.4 = trunc i64 %inc.3 to i8 store i8 %inc.4, i8* %locvar -; CHECK-AARCH64: ldrb {{w[0-9]+}}, [sp, [[LOCADDR:#[0-9]+]]] -; CHECK-AARCH64: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK-AARCH64: strb {{w[0-9]+}}, [sp, [[LOCADDR]]] -; CHECK-AARCH64: ldrb {{w[0-9]+}}, [sp, [[LOCADDR]]] -; CHECK-ARM64: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] -; CHECK-ARM64: add {{w[0-9]+}}, {{w[0-9]+}}, #1 -; CHECK-ARM64: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]] -; CHECK-ARM64: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]] +; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 +; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]] +; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]] %ret.1 = load i8* %locvar %ret.2 = zext i8 %ret.1 to i64 ret i64 %ret.2 -; CHECK-AARCH64: ret -; CHECK-ARM64: ret +; CHECK: ret } diff --git a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll index 
cfd94e1503b1..c3167e4f4bdd 100644 --- a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll +++ b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s -; RUN: llc -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s @var = global i32 0 declare void @bar() diff --git a/test/CodeGen/AArch64/regress-wzr-allocatable.ll b/test/CodeGen/AArch64/regress-wzr-allocatable.ll deleted file mode 100644 index 8620ce14e9b4..000000000000 --- a/test/CodeGen/AArch64/regress-wzr-allocatable.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 - -; Skipping for arm64, there's no evidence it would ever have hit the same -; problem. - -; When WZR wasn't marked as reserved, this function tried to allocate -; it at O0 and then generated an internal fault (mostly incidentally) -; when it discovered that it was already in use for a multiplication. - -; I'm not really convinced this is a good test since it could easily -; stop testing what it does now with no-one any the wiser. However, I -; can't think of a better way to force the allocator to use WZR -; specifically. - -define void @test() nounwind { -entry: - br label %for.cond - -for.cond: ; preds = %for.body, %entry - br i1 undef, label %for.body, label %for.end - -for.body: ; preds = %for.cond - br label %for.cond - -for.end: ; preds = %for.cond - br label %for.cond6 - -for.cond6: ; preds = %for.body9, %for.end - br i1 undef, label %for.body9, label %while.cond30 - -for.body9: ; preds = %for.cond6 - store i16 0, i16* undef, align 2 - %0 = load i32* undef, align 4 - %1 = load i32* undef, align 4 - %mul15 = mul i32 %0, %1 - %add16 = add i32 %mul15, 32768 - %div = udiv i32 %add16, 65535 - %add17 = add i32 %div, 1 - store i32 %add17, i32* undef, align 4 - br label %for.cond6 - -while.cond30: ; preds = %for.cond6 - ret void -} diff --git a/test/CodeGen/AArch64/returnaddr.ll b/test/CodeGen/AArch64/returnaddr.ll index 3f7edcbaa89c..b136f044cad8 100644 --- a/test/CodeGen/AArch64/returnaddr.ll +++ b/test/CodeGen/AArch64/returnaddr.ll @@ -1,4 +1,3 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s define i8* @rt0(i32 %x) nounwind readnone { diff --git a/test/CodeGen/AArch64/setcc-takes-i32.ll b/test/CodeGen/AArch64/setcc-takes-i32.ll index 21c2688ca70f..ec8615910cf0 100644 --- a/test/CodeGen/AArch64/setcc-takes-i32.ll +++ b/test/CodeGen/AArch64/setcc-takes-i32.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -o - %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s ; Most important point here is that the promotion of the i1 works ; correctly. Previously LLVM thought that i64 was the appropriate SetCC output, diff --git a/test/CodeGen/AArch64/sext_inreg.ll b/test/CodeGen/AArch64/sext_inreg.ll deleted file mode 100644 index 7873c6462d78..000000000000 --- a/test/CodeGen/AArch64/sext_inreg.ll +++ /dev/null @@ -1,202 +0,0 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s - -; arm64: This test contains much that is unique and valuable. Unfortunately the -; bits that are unique aren't valuable and the bits that are valuable aren't -; unique. (weird ABI types vs bog-standard shifting & extensions). 
- -; For formal arguments, we have the following vector type promotion, -; v2i8 is promoted to v2i32(f64) -; v2i16 is promoted to v2i32(f64) -; v4i8 is promoted to v4i16(f64) -; v8i1 is promoted to v8i16(f128) - -define <2 x i8> @test_sext_inreg_v2i8i16(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i8i16 -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h - %1 = sext <2 x i8> %v1 to <2 x i16> - %2 = sext <2 x i8> %v2 to <2 x i16> - %3 = shufflevector <2 x i16> %1, <2 x i16> %2, <2 x i32> - %4 = trunc <2 x i16> %3 to <2 x i8> - ret <2 x i8> %4 -} - -define <2 x i8> @test_sext_inreg_v2i8i16_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i8i16_2 -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h - %a1 = shl <2 x i32> %v1, - %a2 = ashr <2 x i32> %a1, - %b1 = shl <2 x i32> %v2, - %b2 = ashr <2 x i32> %b1, - %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> - %d = trunc <2 x i32> %c to <2 x i8> - ret <2 x i8> %d -} - -define <2 x i8> @test_sext_inreg_v2i8i32(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i8i32 -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h - %1 = sext <2 x i8> %v1 to <2 x i32> - %2 = sext <2 x i8> %v2 to <2 x i32> - %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> - %4 = trunc <2 x i32> %3 to <2 x i8> - ret <2 x i8> %4 -} - -define <2 x i8> @test_sext_inreg_v2i8i64(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i8i64 -; CHECK: ushll v1.2d, v1.2s, #0 -; CHECK: ushll v0.2d, v0.2s, #0 -; CHECK: shl v0.2d, v0.2d, #56 -; CHECK: sshr v0.2d, v0.2d, #56 -; CHECK: shl v1.2d, v1.2d, #56 -; CHECK: sshr v1.2d, v1.2d, #56 - %1 = sext <2 x i8> %v1 to <2 x i64> - %2 = sext <2 x i8> %v2 to <2 x i64> - %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> - %4 = trunc <2 x i64> %3 to <2 x i8> - ret <2 x i8> %4 -} - -define <4 x i8> @test_sext_inreg_v4i8i16(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v4i8i16 -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h - %1 = sext <4 x i8> %v1 to <4 x i16> - %2 = sext <4 x i8> %v2 to <4 x i16> - %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <4 x i32> - %4 = trunc <4 x i16> %3 to <4 x i8> - ret <4 x i8> %4 -} - -define <4 x i8> @test_sext_inreg_v4i8i16_2(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v4i8i16_2 -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h - %a1 = shl <4 x i16> %v1, - %a2 = ashr <4 x i16> %a1, - %b1 = shl <4 x i16> %v2, - %b2 = ashr <4 x i16> %b1, - %c = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> - %d = trunc <4 x i16> %c to <4 x i8> - ret <4 x i8> %d -} - -define <4 x i8> @test_sext_inreg_v4i8i32(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v4i8i32 -; CHECK: ushll v1.4s, v1.4h, #0 -; CHECK: ushll v0.4s, v0.4h, #0 -; CHECK: shl v0.4s, v0.4s, #24 -; CHECK: sshr v0.4s, v0.4s, #24 -; CHECK: shl v1.4s, v1.4s, #24 -; CHECK: sshr v1.4s, v1.4s, #24 - %1 = sext <4 x i8> %v1 
to <4 x i32> - %2 = sext <4 x i8> %v2 to <4 x i32> - %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> - %4 = trunc <4 x i32> %3 to <4 x i8> - ret <4 x i8> %4 -} - -define <8 x i8> @test_sext_inreg_v8i8i16(<8 x i8> %v1, <8 x i8> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v8i8i16 -; CHECK: sshll v0.8h, v0.8b, #0 -; CHECK: sshll v1.8h, v1.8b, #0 - %1 = sext <8 x i8> %v1 to <8 x i16> - %2 = sext <8 x i8> %v2 to <8 x i16> - %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> - %4 = trunc <8 x i16> %3 to <8 x i8> - ret <8 x i8> %4 -} - -define <8 x i1> @test_sext_inreg_v8i1i16(<8 x i1> %v1, <8 x i1> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v8i1i16 -; CHECK: ushll v1.8h, v1.8b, #0 -; CHECK: ushll v0.8h, v0.8b, #0 -; CHECK: shl v0.8h, v0.8h, #15 -; CHECK: sshr v0.8h, v0.8h, #15 -; CHECK: shl v1.8h, v1.8h, #15 -; CHECK: sshr v1.8h, v1.8h, #15 - %1 = sext <8 x i1> %v1 to <8 x i16> - %2 = sext <8 x i1> %v2 to <8 x i16> - %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> - %4 = trunc <8 x i16> %3 to <8 x i1> - ret <8 x i1> %4 -} - -define <2 x i16> @test_sext_inreg_v2i16i32(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i16i32 -; CHECK: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s - %1 = sext <2 x i16> %v1 to <2 x i32> - %2 = sext <2 x i16> %v2 to <2 x i32> - %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> - %4 = trunc <2 x i32> %3 to <2 x i16> - ret <2 x i16> %4 -} - -define <2 x i16> @test_sext_inreg_v2i16i32_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i16i32_2 -; CHECK: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s - %a1 = shl <2 x i32> %v1, - %a2 = ashr <2 x i32> %a1, - %b1 = shl <2 x i32> %v2, - %b2 = ashr <2 x i32> %b1, - %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> - %d = trunc <2 x i32> %c to <2 x i16> - ret <2 x i16> %d -} - -define <2 x i16> @test_sext_inreg_v2i16i64(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i16i64 -; CHECK: ushll v1.2d, v1.2s, #0 -; CHECK: ushll v0.2d, v0.2s, #0 -; CHECK: shl v0.2d, v0.2d, #48 -; CHECK: sshr v0.2d, v0.2d, #48 -; CHECK: shl v1.2d, v1.2d, #48 -; CHECK: sshr v1.2d, v1.2d, #48 - %1 = sext <2 x i16> %v1 to <2 x i64> - %2 = sext <2 x i16> %v2 to <2 x i64> - %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> - %4 = trunc <2 x i64> %3 to <2 x i16> - ret <2 x i16> %4 -} - -define <4 x i16> @test_sext_inreg_v4i16i32(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v4i16i32 -; CHECK: sshll v0.4s, v0.4h, #0 -; CHECK: sshll v1.4s, v1.4h, #0 - %1 = sext <4 x i16> %v1 to <4 x i32> - %2 = sext <4 x i16> %v2 to <4 x i32> - %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> - %4 = trunc <4 x i32> %3 to <4 x i16> - ret <4 x i16> %4 -} - -define <2 x i32> @test_sext_inreg_v2i32i64(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { -; CHECK-LABEL: test_sext_inreg_v2i32i64 -; CHECK: sshll v0.2d, v0.2s, #0 -; CHECK: sshll v1.2d, v1.2s, #0 - %1 = sext <2 x i32> %v1 to <2 x i64> - %2 = sext <2 x i32> %v2 to <2 x i64> - %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> - %4 = trunc <2 x i64> %3 to <2 x i32> - ret <2 x i32> %4 -} - diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll index 20f1062a44dc..34e3bb410e8c 
100644 --- a/test/CodeGen/AArch64/sibling-call.ll +++ b/test/CodeGen/AArch64/sibling-call.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -aarch64-load-store-opt=0 | FileCheck %s declare void @callee_stack0() declare void @callee_stack8([8 x i32], i64) @@ -73,10 +73,10 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { tail call void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) ret void -; CHECK: ldr x0, -; CHECK: ldr x1, -; CHECK: str x1, -; CHECK: str x0, +; CHECK: ldr [[VAL0:x[0-9]+]], +; CHECK: ldr [[VAL1:x[0-9]+]], +; CHECK: str [[VAL1]], +; CHECK: str [[VAL0]], ; CHECK-NOT: add sp, sp, ; CHECK: b callee_stack16 @@ -91,7 +91,7 @@ define void @indirect_tail() { %fptr = load void(i32)** @func tail call void %fptr(i32 42) ret void -; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, #:lo12:func] -; CHECK: movz w0, #42 +; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:func] +; CHECK: movz w0, #{{42|0x2a}} ; CHECK: br [[FPTR]] } diff --git a/test/CodeGen/AArch64/sincos-expansion.ll b/test/CodeGen/AArch64/sincos-expansion.ll index 1498eb53625a..c3a172dfb427 100644 --- a/test/CodeGen/AArch64/sincos-expansion.ll +++ b/test/CodeGen/AArch64/sincos-expansion.ll @@ -1,5 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s define float @test_sincos_f32(float %f) { %sin = call float @sinf(float %f) readnone diff --git a/test/CodeGen/AArch64/sincospow-vector-expansion.ll b/test/CodeGen/AArch64/sincospow-vector-expansion.ll index baa73a3c7163..22f33a83394b 100644 --- a/test/CodeGen/AArch64/sincospow-vector-expansion.ll +++ b/test/CodeGen/AArch64/sincospow-vector-expansion.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s -; RUN: llc -o - %s -verify-machineinstrs -mtriple=arm64-linux-gnu -mattr=+neon | FileCheck %s +; RUN: llc -o - %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s define <2 x float> @test_cos_v2f64(<2 x double> %v1) { diff --git a/test/CodeGen/AArch64/stackpointer.ll b/test/CodeGen/AArch64/stackpointer.ll deleted file mode 100644 index 1f20692c8c95..000000000000 --- a/test/CodeGen/AArch64/stackpointer.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnueabi | FileCheck %s -; arm64 has a separate copy of this test - -define i64 @get_stack() nounwind { -entry: -; CHECK-LABEL: get_stack: -; CHECK: mov x0, sp - %sp = call i64 @llvm.read_register.i64(metadata !0) - ret i64 %sp -} - -define void @set_stack(i64 %val) nounwind { -entry: -; CHECK-LABEL: set_stack: -; CHECK: mov sp, x0 - call void @llvm.write_register.i64(metadata !0, i64 %val) - ret void -} - -declare i64 @llvm.read_register.i64(metadata) nounwind -declare void @llvm.write_register.i64(metadata, i64) nounwind - -; register unsigned long current_stack_pointer asm("sp"); -; CHECK-NOT: .asciz "sp" -!0 = metadata !{metadata !"sp\00"} diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll index da05848dcc59..8aab84215260 100644 --- a/test/CodeGen/AArch64/tail-call.ll +++ b/test/CodeGen/AArch64/tail-call.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -; RUN: llc 
-verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s declare fastcc void @callee_stack0() declare fastcc void @callee_stack8([8 x i32], i64) @@ -9,91 +8,59 @@ define fastcc void @caller_to0_from0() nounwind { ; CHECK-LABEL: caller_to0_from0: ; CHECK-NEXT: // BB -; CHECK-ARM64-LABEL: caller_to0_from0: -; CHECK-ARM64-NEXT: // BB - tail call fastcc void @callee_stack0() ret void ; CHECK-NEXT: b callee_stack0 - -; CHECK-ARM64-NEXT: b callee_stack0 } define fastcc void @caller_to0_from8([8 x i32], i64) { ; CHECK-LABEL: caller_to0_from8: -; CHECK-ARM64-LABEL: caller_to0_from8: - tail call fastcc void @callee_stack0() ret void ; CHECK: add sp, sp, #16 ; CHECK-NEXT: b callee_stack0 - -; CHECK-ARM64: add sp, sp, #16 -; CHECK-ARM64-NEXT: b callee_stack0 } define fastcc void @caller_to8_from0() { ; CHECK-LABEL: caller_to8_from0: ; CHECK: sub sp, sp, #32 -; CHECK-ARM64-LABEL: caller_to8_from0: -; CHECK-ARM64: sub sp, sp, #32 - ; Key point is that the "42" should go #16 below incoming stack ; pointer (we didn't have arg space to reuse). tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) ret void -; CHECK: str {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: str {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack8 - -; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]! -; CHECK-ARM64-NEXT: b callee_stack8 } define fastcc void @caller_to8_from8([8 x i32], i64 %a) { ; CHECK-LABEL: caller_to8_from8: ; CHECK: sub sp, sp, #16 -; CHECK-ARM64-LABEL: caller_to8_from8: -; CHECK-ARM64: sub sp, sp, #16 - ; Key point is that the "%a" should go where at SP on entry. tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) ret void -; CHECK: str {{x[0-9]+}}, [sp, #16] -; CHECK-NEXT: add sp, sp, #16 +; CHECK: str {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack8 - -; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]! -; CHECK-ARM64-NEXT: b callee_stack8 } define fastcc void @caller_to16_from8([8 x i32], i64 %a) { ; CHECK-LABEL: caller_to16_from8: ; CHECK: sub sp, sp, #16 -; CHECK-ARM64-LABEL: caller_to16_from8: -; CHECK-ARM64: sub sp, sp, #16 - ; Important point is that the call reuses the "dead" argument space ; above %a on the stack. If it tries to go below incoming-SP then the ; callee will not deallocate the space, even in fastcc. tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2) -; CHECK: str {{x[0-9]+}}, [sp, #24] -; CHECK: str {{x[0-9]+}}, [sp, #16] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: b callee_stack16 - -; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-ARM64-NEXT: add sp, sp, #16 -; CHECK-ARM64-NEXT: b callee_stack16 ret void } @@ -102,19 +69,12 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: caller_to8_from24: ; CHECK: sub sp, sp, #16 -; CHECK-ARM64-LABEL: caller_to8_from24: -; CHECK-ARM64: sub sp, sp, #16 - ; Key point is that the "%a" should go where at #16 above SP on entry. tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) ret void -; CHECK: str {{x[0-9]+}}, [sp, #32] -; CHECK-NEXT: add sp, sp, #32 +; CHECK: str {{x[0-9]+}}, [sp, #32]! ; CHECK-NEXT: b callee_stack8 - -; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]! 
-; CHECK-ARM64-NEXT: b callee_stack8 } @@ -122,24 +82,13 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { ; CHECK-LABEL: caller_to16_from16: ; CHECK: sub sp, sp, #16 -; CHECK-ARM64-LABEL: caller_to16_from16: -; CHECK-ARM64: sub sp, sp, #16 - ; Here we want to make sure that both loads happen before the stores: ; otherwise either %a or %b will be wrongly clobbered. tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) ret void -; CHECK: ldr x0, -; CHECK: ldr x1, -; CHECK: str x1, -; CHECK: str x0, - +; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: b callee_stack16 - -; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK-ARM64-NEXT: add sp, sp, #16 -; CHECK-ARM64-NEXT: b callee_stack16 } diff --git a/test/CodeGen/AArch64/tls-dynamic-together.ll b/test/CodeGen/AArch64/tls-dynamic-together.ll deleted file mode 100644 index 80ed2181c4c1..000000000000 --- a/test/CodeGen/AArch64/tls-dynamic-together.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -O0 -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -; arm64 has its own copy of this file, copied during implementation. - -; If the .tlsdesccall and blr parts are emitted completely separately (even with -; glue) then LLVM will separate them quite happily (with a spill at O0, hence -; the option). This is definitely wrong, so we make sure they are emitted -; together. - -@general_dynamic_var = external thread_local global i32 - -define i32 @test_generaldynamic() { -; CHECK-LABEL: test_generaldynamic: - - %val = load i32* @general_dynamic_var - ret i32 %val - -; CHECK: .tlsdesccall general_dynamic_var -; CHECK-NEXT: blr {{x[0-9]+}} -} diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/tls-dynamics.ll deleted file mode 100644 index 0fb84c823bc9..000000000000 --- a/test/CodeGen/AArch64/tls-dynamics.ll +++ /dev/null @@ -1,121 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s -; arm64 has its own tls-dynamics.ll, copied from this one during implementation. 
-@general_dynamic_var = external thread_local global i32 - -define i32 @test_generaldynamic() { -; CHECK-LABEL: test_generaldynamic: - - %val = load i32* @general_dynamic_var - ret i32 %val - -; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var -; CHECK-DAG: add x0, x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:general_dynamic_var -; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:general_dynamic_var] -; CHECK: .tlsdesccall general_dynamic_var -; CHECK-NEXT: blr [[CALLEE]] - -; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 -; CHECK: ldr w0, [x[[TP]], x0] - -; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_CALL - -} - -define i32* @test_generaldynamic_addr() { -; CHECK-LABEL: test_generaldynamic_addr: - - ret i32* @general_dynamic_var - -; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var -; CHECK-DAG: add x0, x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:general_dynamic_var -; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:general_dynamic_var] -; CHECK: .tlsdesccall general_dynamic_var -; CHECK-NEXT: blr [[CALLEE]] - -; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0 -; CHECK: add x0, [[TP]], x0 - -; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_CALL - -} - -@local_dynamic_var = external thread_local(localdynamic) global i32 - -define i32 @test_localdynamic() { -; CHECK-LABEL: test_localdynamic: - - %val = load i32* @local_dynamic_var - ret i32 %val - -; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ -; CHECK-DAG: add x0, x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:_TLS_MODULE_BASE_ -; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:_TLS_MODULE_BASE_] -; CHECK: .tlsdesccall _TLS_MODULE_BASE_ -; CHECK-NEXT: blr [[CALLEE]] - -; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var -; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var - -; CHECK: ldr w0, [x0, [[DTP_OFFSET]]] - -; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_CALL - -} - -define i32* @test_localdynamic_addr() { -; CHECK-LABEL: test_localdynamic_addr: - - ret i32* @local_dynamic_var - -; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ -; CHECK-DAG: add x0, x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:_TLS_MODULE_BASE_ -; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:_TLS_MODULE_BASE_] -; CHECK: .tlsdesccall _TLS_MODULE_BASE_ -; CHECK-NEXT: blr [[CALLEE]] - -; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var -; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var - -; CHECK: add x0, x0, [[DTP_OFFSET]] - -; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_CALL - -} - -; The entire point of the local-dynamic access model is to have a single call to -; the expensive resolver. Make sure we achieve that goal. 
- -@local_dynamic_var2 = external thread_local(localdynamic) global i32 - -define i32 @test_localdynamic_deduplicate() { -; CHECK-LABEL: test_localdynamic_deduplicate: - - %val = load i32* @local_dynamic_var - %val2 = load i32* @local_dynamic_var2 - - %sum = add i32 %val, %val2 - ret i32 %sum - -; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ -; CHECK-DAG: add x0, x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:_TLS_MODULE_BASE_ -; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], {{#?}}:tlsdesc_lo12:_TLS_MODULE_BASE_] -; CHECK: .tlsdesccall _TLS_MODULE_BASE_ -; CHECK-NEXT: blr [[CALLEE]] - -; CHECK-NOT: _TLS_MODULE_BASE_ - -; CHECK: ret -} diff --git a/test/CodeGen/AArch64/tls-execs.ll b/test/CodeGen/AArch64/tls-execs.ll deleted file mode 100644 index 61600380c244..000000000000 --- a/test/CodeGen/AArch64/tls-execs.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s -; arm64 has its own copy of tls-execs.ll, copied from this one during implementation. - -@initial_exec_var = external thread_local(initialexec) global i32 - -define i32 @test_initial_exec() { -; CHECK-LABEL: test_initial_exec: - %val = load i32* @initial_exec_var - -; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var -; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var] -; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 -; CHECK: ldr w0, [x[[TP]], x[[TP_OFFSET]]] - -; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 -; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC - - ret i32 %val -} - -define i32* @test_initial_exec_addr() { -; CHECK-LABEL: test_initial_exec_addr: - ret i32* @initial_exec_var - -; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var -; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var] -; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0 -; CHECK: add x0, [[TP]], [[TP_OFFSET]] - -; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 -; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC - -} - -@local_exec_var = thread_local(initialexec) global i32 0 - -define i32 @test_local_exec() { -; CHECK-LABEL: test_local_exec: - %val = load i32* @local_exec_var - -; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [A,A,0xa0'A',0x92'A'] -; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var -; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 -; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]] - -; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1 -; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC - - ret i32 %val -} - -define i32* @test_local_exec_addr() { -; CHECK-LABEL: test_local_exec_addr: - ret i32* @local_exec_var - -; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var -; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var -; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0 -; CHECK: add x0, [[TP]], [[TP_OFFSET]] - -; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1 -; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC -} diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll index b6e2b19fb845..5dc7b5df475a 100644 --- a/test/CodeGen/AArch64/tst-br.ll +++ b/test/CodeGen/AArch64/tst-br.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s 
-mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s ; We've got the usual issues with LLVM reordering blocks here. The ; tests are correct for the current order, but who knows when that diff --git a/test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll b/test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll deleted file mode 100644 index 60cc6e40b359..000000000000 --- a/test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll +++ /dev/null @@ -1,172 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s --check-prefix=BE-STRICT-ALIGN - -;; Check element-aligned 128-bit vector load/store - integer -define <16 x i8> @qwordint (<16 x i8>* %head.v16i8, <8 x i16>* %head.v8i16, <4 x i32>* %head.v4i32, <2 x i64>* %head.v2i64, - <16 x i8>* %tail.v16i8, <8 x i16>* %tail.v8i16, <4 x i32>* %tail.v4i32, <2 x i64>* %tail.v2i64) { -; CHECK-LABEL: qwordint -; CHECK: ld1 { v0.16b }, [x0] -; CHECK: ld1 { v1.8h }, [x1] -; CHECK: ld1 { v2.4s }, [x2] -; CHECK: ld1 { v3.2d }, [x3] -; CHECK: st1 { v0.16b }, [x4] -; CHECK: st1 { v1.8h }, [x5] -; CHECK: st1 { v2.4s }, [x6] -; CHECK: st1 { v3.2d }, [x7] -; BE-STRICT-ALIGN-LABEL: qwordint -; BE-STRICT-ALIGN: ldrb -; BE-STRICT-ALIGN: ldrh -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: strb -; BE-STRICT-ALIGN: strh -; BE-STRICT-ALIGN: str -; BE-STRICT-ALIGN: str -entry: - %val.v16i8 = load <16 x i8>* %head.v16i8, align 1 - %val.v8i16 = load <8 x i16>* %head.v8i16, align 2 - %val.v4i32 = load <4 x i32>* %head.v4i32, align 4 - %val.v2i64 = load <2 x i64>* %head.v2i64, align 8 - store <16 x i8> %val.v16i8, <16 x i8>* %tail.v16i8, align 1 - store <8 x i16> %val.v8i16, <8 x i16>* %tail.v8i16, align 2 - store <4 x i32> %val.v4i32, <4 x i32>* %tail.v4i32, align 4 - store <2 x i64> %val.v2i64, <2 x i64>* %tail.v2i64, align 8 - ret <16 x i8> %val.v16i8 -} - -;; Check element-aligned 128-bit vector load/store - floating point -define <4 x float> @qwordfloat (<4 x float>* %head.v4f32, <2 x double>* %head.v2f64, - <4 x float>* %tail.v4f32, <2 x double>* %tail.v2f64) { -; CHECK-LABEL: qwordfloat -; CHECK: ld1 { v0.4s }, [x0] -; CHECK: ld1 { v1.2d }, [x1] -; CHECK: st1 { v0.4s }, [x2] -; CHECK: st1 { v1.2d }, [x3] -; BE-STRICT-ALIGN-LABEL: qwordfloat -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: str -; BE-STRICT-ALIGN: str -entry: - %val.v4f32 = load <4 x float>* %head.v4f32, align 4 - %val.v2f64 = load <2 x double>* %head.v2f64, align 8 - store <4 x float> %val.v4f32, <4 x float>* %tail.v4f32, align 4 - store <2 x double> %val.v2f64, <2 x double>* %tail.v2f64, align 8 - ret <4 x float> %val.v4f32 -} - -;; Check element-aligned 64-bit vector load/store - integer -define <8 x i8> @dwordint (<8 x i8>* %head.v8i8, <4 x i16>* %head.v4i16, <2 x i32>* %head.v2i32, <1 x i64>* %head.v1i64, - <8 x i8>* %tail.v8i8, <4 x i16>* %tail.v4i16, <2 x i32>* %tail.v2i32, <1 x i64>* %tail.v1i64) { -; CHECK-LABEL: dwordint -; CHECK: ld1 { v0.8b }, [x0] -; CHECK: ld1 { v1.4h }, [x1] -; CHECK: ld1 { v2.2s }, [x2] -; CHECK: 
ld1 { v3.1d }, [x3] -; CHECK: st1 { v0.8b }, [x4] -; CHECK: st1 { v1.4h }, [x5] -; CHECK: st1 { v2.2s }, [x6] -; CHECK: st1 { v3.1d }, [x7] -; BE-STRICT-ALIGN-LABEL: dwordint -; BE-STRICT-ALIGN: ldrb -; BE-STRICT-ALIGN: ldrh -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: ld1 { v1.1d }, [x3] -; BE-STRICT-ALIGN: strb -; BE-STRICT-ALIGN: strh -; BE-STRICT-ALIGN: str -; BE-STRICT-ALIGN: st1 { v1.1d }, [x7] -entry: - %val.v8i8 = load <8 x i8>* %head.v8i8, align 1 - %val.v4i16 = load <4 x i16>* %head.v4i16, align 2 - %val.v2i32 = load <2 x i32>* %head.v2i32, align 4 - %val.v1i64 = load <1 x i64>* %head.v1i64, align 8 - store <8 x i8> %val.v8i8, <8 x i8>* %tail.v8i8 , align 1 - store <4 x i16> %val.v4i16, <4 x i16>* %tail.v4i16, align 2 - store <2 x i32> %val.v2i32, <2 x i32>* %tail.v2i32, align 4 - store <1 x i64> %val.v1i64, <1 x i64>* %tail.v1i64, align 8 - ret <8 x i8> %val.v8i8 -} - -;; Check element-aligned 64-bit vector load/store - floating point -define <2 x float> @dwordfloat (<2 x float>* %head.v2f32, <1 x double>* %head.v1f64, - <2 x float>* %tail.v2f32, <1 x double>* %tail.v1f64) { -; CHECK-LABEL: dwordfloat -; CHECK: ld1 { v0.2s }, [x0] -; CHECK: ld1 { v1.1d }, [x1] -; CHECK: st1 { v0.2s }, [x2] -; CHECK: st1 { v1.1d }, [x3] -; BE-STRICT-ALIGN-LABEL: dwordfloat -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: ld1 { v1.1d }, [x1] -; BE-STRICT-ALIGN: str -; BE-STRICT-ALIGN: st1 { v1.1d }, [x3] -entry: - %val.v2f32 = load <2 x float>* %head.v2f32, align 4 - %val.v1f64 = load <1 x double>* %head.v1f64, align 8 - store <2 x float> %val.v2f32, <2 x float>* %tail.v2f32, align 4 - store <1 x double> %val.v1f64, <1 x double>* %tail.v1f64, align 8 - ret <2 x float> %val.v2f32 -} - -;; Check load/store of 128-bit vectors with less-than 16-byte alignment -define <2 x i64> @align2vi64 (<2 x i64>* %head.byte, <2 x i64>* %head.half, <2 x i64>* %head.word, <2 x i64>* %head.dword, - <2 x i64>* %tail.byte, <2 x i64>* %tail.half, <2 x i64>* %tail.word, <2 x i64>* %tail.dword) { -; CHECK-LABEL: align2vi64 -; CHECK: ld1 { v0.2d }, [x0] -; CHECK: ld1 { v1.2d }, [x1] -; CHECK: ld1 { v2.2d }, [x2] -; CHECK: ld1 { v3.2d }, [x3] -; CHECK: st1 { v0.2d }, [x4] -; CHECK: st1 { v1.2d }, [x5] -; CHECK: st1 { v2.2d }, [x6] -; CHECK: st1 { v3.2d }, [x7] -; BE-STRICT-ALIGN-LABEL: align2vi64 -; BE-STRICT-ALIGN: ldrb -; BE-STRICT-ALIGN: ldrh -; BE-STRICT-ALIGN: ldr -; BE-STRICT-ALIGN: strb -; BE-STRICT-ALIGN: strh -; BE-STRICT-ALIGN: str -entry: - %val.byte = load <2 x i64>* %head.byte, align 1 - %val.half = load <2 x i64>* %head.half, align 2 - %val.word = load <2 x i64>* %head.word, align 4 - %val.dword = load <2 x i64>* %head.dword, align 8 - store <2 x i64> %val.byte, <2 x i64>* %tail.byte, align 1 - store <2 x i64> %val.half, <2 x i64>* %tail.half, align 2 - store <2 x i64> %val.word, <2 x i64>* %tail.word, align 4 - store <2 x i64> %val.dword, <2 x i64>* %tail.dword, align 8 - ret <2 x i64> %val.byte - } - -;; Check load/store of 64-bit vectors with less-than 8-byte alignment -define <2 x float> @align2vf32 (<2 x float>* %head.byte, <2 x float>* %head.half, <2 x float>* %head.word, <2 x float>* %head.dword, - <2 x float>* %tail.byte, <2 x float>* %tail.half, <2 x float>* %tail.word, <2 x float>* %tail.dword) { -; CHECK-LABEL: align2vf32 -; CHECK: ld1 { v0.2s }, [x0] -; CHECK: ld1 { v1.2s }, [x1] -; CHECK: ld1 { v2.2s }, [x2] -; CHECK: st1 { v0.2s }, [x4] -; CHECK: st1 { v1.2s }, [x5] -; CHECK: st1 { v2.2s }, [x6] -; BE-STRICT-ALIGN-LABEL: align2vf32 -; BE-STRICT-ALIGN: ldrb -; BE-STRICT-ALIGN: ldrh -; BE-STRICT-ALIGN: 
ldr -; BE-STRICT-ALIGN: strb -; BE-STRICT-ALIGN: strh -; BE-STRICT-ALIGN: str -entry: - %val.byte = load <2 x float>* %head.byte, align 1 - %val.half = load <2 x float>* %head.half, align 2 - %val.word = load <2 x float>* %head.word, align 4 - store <2 x float> %val.byte, <2 x float>* %tail.byte, align 1 - store <2 x float> %val.half, <2 x float>* %tail.half, align 2 - store <2 x float> %val.word, <2 x float>* %tail.word, align 4 - ret <2 x float> %val.byte -} diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll deleted file mode 100644 index 7b85227cbd32..000000000000 --- a/test/CodeGen/AArch64/variadic.ll +++ /dev/null @@ -1,241 +0,0 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s -; arm64 has its own copy of this file, ported during implementation (variadic-aapcs.ll) - -%va_list = type {i8*, i8*, i8*, i32, i32} - -@var = global %va_list zeroinitializer - -declare void @llvm.va_start(i8*) - -define void @test_simple(i32 %n, ...) { -; CHECK-LABEL: test_simple: -; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK: mov x[[FPRBASE:[0-9]+]], sp -; CHECK: str q7, [x[[FPRBASE]], #112] -; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] -; CHECK: str x7, [x[[GPRBASE]], #48] - -; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] -; CHECK-NOFP: str x7, [x[[GPRBASE]], #48] -; CHECK-NOFP-NOT: str q7, -; CHECK-NOFP: str x1, [sp, #[[GPRFROMSP]]] - -; Omit the middle ones - -; CHECK: str q0, [sp] -; CHECK: str x1, [sp, #[[GPRFROMSP]]] -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var - -; CHECK-NOFP-NOT: str q0, [sp] -; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var - - %addr = bitcast %va_list* @var to i8* - call void @llvm.va_start(i8* %addr) -; CHECK: movn [[VR_OFFS:w[0-9]+]], #127 -; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] -; CHECK: movn [[GR_OFFS:w[0-9]+]], #55 -; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24] -; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128 -; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] -; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56 -; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8] -; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] -; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - -; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] -; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #55 -; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24] -; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56 -; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8] -; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] -; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - - ret void -} - -define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) 
{ -; CHECK-LABEL: test_fewargs: -; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK: mov x[[FPRBASE:[0-9]+]], sp -; CHECK: str q7, [x[[FPRBASE]], #96] -; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] -; CHECK: str x7, [x[[GPRBASE]], #32] - -; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]] -; CHECK-NOFP-NOT: str q7, -; CHECK-NOFP: mov x[[GPRBASE:[0-9]+]], sp -; CHECK-NOFP: str x7, [x[[GPRBASE]], #24] - -; Omit the middle ones - -; CHECK: str q1, [sp] -; CHECK: str x3, [sp, #[[GPRFROMSP]]] - -; CHECK-NOFP-NOT: str q1, [sp] -; CHECK-NOFP: str x4, [sp] - - %addr = bitcast %va_list* @var to i8* - call void @llvm.va_start(i8* %addr) -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK: movn [[VR_OFFS:w[0-9]+]], #111 -; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] -; CHECK: movn [[GR_OFFS:w[0-9]+]], #39 -; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24] -; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #112 -; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] -; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #40 -; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8] -; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] -; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - -; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] -; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #31 -; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24] -; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #32 -; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8] -; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] -; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - - ret void -} - -define void @test_nospare([8 x i64], [8 x float], ...) { -; CHECK-LABEL: test_nospare: - - %addr = bitcast %va_list* @var to i8* - call void @llvm.va_start(i8* %addr) -; CHECK-NOT: sub sp, sp -; CHECK: mov [[STACK:x[0-9]+]], sp -; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - -; CHECK-NOFP-NOT: sub sp, sp -; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #64 -; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - ret void -} - -; If there are non-variadic arguments on the stack (here two i64s) then the -; __stack field should point just past them. -define void @test_offsetstack([10 x i64], [3 x float], ...) 
{ -; CHECK-LABEL: test_offsetstack: -; CHECK: sub sp, sp, #80 -; CHECK: mov x[[FPRBASE:[0-9]+]], sp -; CHECK: str q7, [x[[FPRBASE]], #64] - -; CHECK-NOT: str x{{[0-9]+}}, - -; CHECK-NOFP-NOT: str q7, -; CHECK-NOT: str x7, - -; Omit the middle ones - -; CHECK: str q3, [sp] - - %addr = bitcast %va_list* @var to i8* - call void @llvm.va_start(i8* %addr) -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK: movn [[VR_OFFS:w[0-9]+]], #79 -; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] -; CHECK: str wzr, [x[[VA_LIST]], #24] -; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #80 -; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] -; CHECK: add [[STACK:x[0-9]+]], sp, #96 -; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - -; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #40 -; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] -; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] -; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24] - ret void -} - -declare void @llvm.va_end(i8*) - -define void @test_va_end() nounwind { -; CHECK-LABEL: test_va_end: -; CHECK-NEXT: BB#0 -; CHECK-NOFP: BB#0 - - %addr = bitcast %va_list* @var to i8* - call void @llvm.va_end(i8* %addr) - - ret void -; CHECK-NEXT: ret -; CHECK-NOFP-NEXT: ret -} - -declare void @llvm.va_copy(i8* %dest, i8* %src) - -@second_list = global %va_list zeroinitializer - -define void @test_va_copy() { -; CHECK-LABEL: test_va_copy: - %srcaddr = bitcast %va_list* @var to i8* - %dstaddr = bitcast %va_list* @second_list to i8* - call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr) - -; Check beginning and end again: - -; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list -; CHECK: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] -; CHECK: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24] -; CHECK: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list] -; CHECK: str [[BLOCK2]], [x[[DEST_LIST]], #24] - -; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list -; CHECK-NOFP: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] -; CHECK-NOFP: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24] -; CHECK-NOFP: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list] -; CHECK-NOFP: str [[BLOCK2]], [x[[DEST_LIST]], #24] - - ret void -; CHECK: ret -; CHECK-NOFP: ret -} - -%struct.s_3i = type { i32, i32, i32 } - -; This checks that, if the last named argument is not a multiple of 8 bytes, -; and is allocated on the stack, that __va_list.__stack is initialised to the -; first 8-byte aligned location above it. -define void @test_va_odd_struct_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, [1 x i64], %struct.s_3i* byval nocapture readnone align 4 %h, ...) 
{ -; CHECK-LABEL: test_va_odd_struct_on_stack: - -; CHECK: sub sp, sp, #128 -; CHECK: mov x[[FPRBASE:[0-9]+]], sp -; CHECK: str q7, [x[[FPRBASE]], #112] - -; CHECK-NOT: str x{{[0-9]+}}, - -; CHECK-NOFP-NOT: str q7, -; CHECK-NOT: str x7, - -; Omit the middle ones - -; CHECK: str q0, [sp] - - %addr = bitcast %va_list* @var to i8* - call void @llvm.va_start(i8* %addr) -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK: movn [[VR_OFFS:w[0-9]+]], #127 -; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] -; CHECK: str wzr, [x[[VA_LIST]], #24] -; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128 -; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] -; This constant would be #140 if it was not 8-byte aligned -; CHECK: add [[STACK:x[0-9]+]], sp, #144 -; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] - -; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; This constant would be #12 if it was not 8-byte aligned -; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #16 -; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] -; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] -; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24] - ret void -} diff --git a/test/CodeGen/AArch64/zero-reg.ll b/test/CodeGen/AArch64/zero-reg.ll index c4073cba08db..bc112ab8db98 100644 --- a/test/CodeGen/AArch64/zero-reg.ll +++ b/test/CodeGen/AArch64/zero-reg.ll @@ -1,5 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s @var32 = global i32 0 @var64 = global i64 0 diff --git a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll index ca5ae8b62e8b..2597b413ec7c 100644 --- a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll +++ b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=cortex-a8 < %s | FileCheck %s +; RUN: llc -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" target triple = "armv7-eabi" diff --git a/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll b/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll index 4fb2be02ce9a..38eb0ea2c891 100644 --- a/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll +++ b/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=cortex-a8 -mattr=-neonfp < %s | FileCheck %s +; RUN: llc -mcpu=cortex-a8 -mattr=-neonfp -arm-atomic-cfg-tidy=0 < %s | FileCheck %s ; PR5423 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" diff --git a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll index 35995b77c5bc..b040b2d91cd6 100644 --- a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll +++ b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll @@ -4,22 +4,26 @@ %struct.foo = type { i64, i64 } -define zeroext i8 @t(%struct.foo* %this) noreturn optsize { +define zeroext i8 @t(%struct.foo* %this, i1 %tst) noreturn optsize { entry: ; ARM-LABEL: t: -; ARM: str r2, [r1], r0 +; ARM-DAG: mov r[[ADDR:[0-9]+]], #8 +; ARM-DAG: mov [[VAL:r[0-9]+]], #0 +; ARM: str [[VAL]], [r[[ADDR]]], r0 ; THUMB-LABEL: t: -; THUMB-NOT: str r0, [r1], r0 -; THUMB: str r1, [r0] +; THUMB-DAG: movs r[[ADDR:[0-9]+]], #8 +; THUMB-DAG: movs [[VAL:r[0-9]+]], #0 +; THUMB-NOT: str {{[a-z0-9]+}}, [{{[a-z0-9]+}}], {{[a-z0-9]+}} +; 
THUMB: str [[VAL]], [r[[ADDR]]] %0 = getelementptr inbounds %struct.foo* %this, i32 0, i32 1 ; [#uses=1] store i32 0, i32* inttoptr (i32 8 to i32*), align 8 - br i1 undef, label %bb.nph96, label %bb3 + br i1 %tst, label %bb.nph96, label %bb3 bb3: ; preds = %entry %1 = load i64* %0, align 4 ; [#uses=0] - unreachable + ret i8 42 bb.nph96: ; preds = %entry - unreachable + ret i8 3 } diff --git a/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll b/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll index 32d350e9c8b1..e7e0580179c4 100644 --- a/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll +++ b/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=arm1136jf-s | FileCheck %s +; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=arm1136jf-s -arm-atomic-cfg-tidy=0 | FileCheck %s ; Radar 8589805: Counting the number of microcoded operations, such as for an ; LDM instruction, was causing an assertion failure because the microop count ; was being treated as an instruction count. @@ -11,7 +11,7 @@ define i32 @test(i32 %x) { entry: %0 = tail call signext i16 undef(i32* undef) - switch i32 undef, label %bb3 [ + switch i32 %x, label %bb3 [ i32 0, label %bb4 i32 1, label %bb1 i32 2, label %bb2 diff --git a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll index 85a113755bf4..3950c9e081f7 100644 --- a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll +++ b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -O3 -mtriple=armv6-apple-darwin -relocation-model=pic -mcpu=arm1136jf-s | FileCheck %s +; RUN: llc < %s -asm-verbose=false -O3 -mtriple=armv6-apple-darwin -relocation-model=pic -mcpu=arm1136jf-s -arm-atomic-cfg-tidy=0 | FileCheck %s ; rdar://8959122 illegal register operands for UMULL instruction ; in cfrac nightly test. ; Armv6 generates a umull that must write to two distinct destination regs. 
@@ -7,7 +7,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:64-n32" target triple = "armv6-apple-darwin10" -define void @ptoa() nounwind { +define void @ptoa(i1 %tst, i8* %p8, i8 %val8) nounwind { entry: br i1 false, label %bb3, label %bb @@ -16,7 +16,7 @@ bb: ; preds = %entry bb3: ; preds = %bb, %entry %0 = call noalias i8* @malloc() nounwind - br i1 undef, label %bb46, label %bb8 + br i1 %tst, label %bb46, label %bb8 bb8: ; preds = %bb3 %1 = getelementptr inbounds i8* %0, i32 0 @@ -35,7 +35,7 @@ bb8: ; preds = %bb3 %7 = or i8 %6, 48 %8 = add i8 %6, 87 %iftmp.5.0.1 = select i1 %5, i8 %7, i8 %8 - store i8 %iftmp.5.0.1, i8* undef, align 1 + store i8 %iftmp.5.0.1, i8* %p8, align 1 ; CHECK: umull [[REGISTER:lr|r[0-9]+]], ; CHECK-NOT: [[REGISTER]], ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} @@ -49,7 +49,7 @@ bb8: ; preds = %bb3 %13 = or i8 %12, 48 %14 = add i8 %12, 87 %iftmp.5.0.2 = select i1 %11, i8 %13, i8 %14 - store i8 %iftmp.5.0.2, i8* undef, align 1 + store i8 %iftmp.5.0.2, i8* %p8, align 1 ; CHECK: umull [[REGISTER:lr|r[0-9]+]], ; CHECK-NOT: [[REGISTER]], ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} @@ -73,8 +73,8 @@ bb8: ; preds = %bb3 %21 = udiv i32 %2, 100000 %22 = urem i32 %21, 10 %23 = icmp ult i32 %22, 10 - %iftmp.5.0.5 = select i1 %23, i8 0, i8 undef - store i8 %iftmp.5.0.5, i8* undef, align 1 + %iftmp.5.0.5 = select i1 %23, i8 0, i8 %val8 + store i8 %iftmp.5.0.5, i8* %p8, align 1 ; CHECK: umull [[REGISTER:lr|r[0-9]+]], ; CHECK-NOT: [[REGISTER]], ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} @@ -88,7 +88,7 @@ bb8: ; preds = %bb3 %28 = or i8 %27, 48 %29 = add i8 %27, 87 %iftmp.5.0.6 = select i1 %26, i8 %28, i8 %29 - store i8 %iftmp.5.0.6, i8* undef, align 1 + store i8 %iftmp.5.0.6, i8* %p8, align 1 ; CHECK: umull [[REGISTER:lr|r[0-9]+]], ; CHECK-NOT: [[REGISTER]], ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} @@ -102,7 +102,7 @@ bb8: ; preds = %bb3 %34 = or i8 %33, 48 %35 = add i8 %33, 87 %iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35 - store i8 %iftmp.5.0.7, i8* undef, align 1 + store i8 %iftmp.5.0.7, i8* %p8, align 1 ; CHECK: umull [[REGISTER:lr|r[0-9]+]], ; CHECK-NOT: [[REGISTER]], ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} @@ -117,7 +117,7 @@ bb8: ; preds = %bb3 %41 = add i8 %39, 87 %iftmp.5.0.8 = select i1 %38, i8 %40, i8 %41 store i8 %iftmp.5.0.8, i8* null, align 1 - unreachable + br label %bb46 bb46: ; preds = %bb3 ret void diff --git a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll index bc72e126b407..837feb6e85c2 100644 --- a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll +++ b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll @@ -8,7 +8,7 @@ @oStruct = external global %struct.Outer, align 4 -define void @main() nounwind { +define void @main(i8 %val8) nounwind { ; CHECK-LABEL: main: ; CHECK-NOT: ldrd ; CHECK: mul @@ -28,7 +28,7 @@ for.body: ; preds = %_Z14printIsNotZeroi br i1 %tobool.i14, label %_Z14printIsNotZeroi.exit17, label %if.then.i16 if.then.i16: ; preds = %_Z14printIsNotZeroi.exit - unreachable + ret void _Z14printIsNotZeroi.exit17: ; preds = %_Z14printIsNotZeroi.exit br label %_Z14printIsNotZeroi.exit17.for.body_crit_edge @@ -36,7 +36,7 @@ _Z14printIsNotZeroi.exit17: ; preds = %_Z14printIsNotZeroi _Z14printIsNotZeroi.exit17.for.body_crit_edge: ; preds = %_Z14printIsNotZeroi.exit17 %b.phi.trans.insert = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %inc, i32 3 %tmp3.pre = load i8* %b.phi.trans.insert, align 1 - %phitmp27 = icmp eq i8 undef, 0 + 
%phitmp27 = icmp eq i8 %val8, 0 br label %for.body for.end: ; preds = %_Z14printIsNotZeroi.exit17 diff --git a/test/CodeGen/ARM/2012-11-14-subs_carry.ll b/test/CodeGen/ARM/2012-11-14-subs_carry.ll index 8df295a2f658..33083303a3d4 100644 --- a/test/CodeGen/ARM/2012-11-14-subs_carry.ll +++ b/test/CodeGen/ARM/2012-11-14-subs_carry.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s ;CHECK-LABEL: foo: ;CHECK: adds diff --git a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll index 480d08714b31..162f86306ff4 100644 --- a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll +++ b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll @@ -42,34 +42,34 @@ UnifiedReturnBlock: ret i32 %tmp13 } -define hidden fastcc void @t3(i8** %retaddr) { +define hidden fastcc void @t3(i8** %retaddr, i1 %tst, i8* %p8) { ; CHECK-LABEL: t3: ; CHECK: Block address taken ; CHECK-NOT: Address of block that was removed by CodeGen bb: store i8* blockaddress(@t3, %KBBlockZero_return_1), i8** %retaddr - br i1 undef, label %bb77, label %bb7.i + br i1 %tst, label %bb77, label %bb7.i bb7.i: ; preds = %bb35 br label %bb2.i KBBlockZero_return_1: ; preds = %KBBlockZero.exit - unreachable + ret void KBBlockZero_return_0: ; preds = %KBBlockZero.exit - unreachable + ret void bb77: ; preds = %bb26, %bb12, %bb ret void bb2.i: ; preds = %bb6.i350, %bb7.i - br i1 undef, label %bb6.i350, label %KBBlockZero.exit + br i1 %tst, label %bb6.i350, label %KBBlockZero.exit bb6.i350: ; preds = %bb2.i br label %bb2.i KBBlockZero.exit: ; preds = %bb2.i - indirectbr i8* undef, [label %KBBlockZero_return_1, label %KBBlockZero_return_0] + indirectbr i8* %p8, [label %KBBlockZero_return_1, label %KBBlockZero_return_0] } @foo = global i32 ()* null diff --git a/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll b/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll index a438c1f4556a..05a4ef05e958 100644 --- a/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll +++ b/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=cortex-a8 | FileCheck %s +; RUN: llc < %s -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s ; ModuleID = 'bugpoint-reduced-simplified.bc' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" target triple = "armv7--linux-gnueabi" diff --git a/test/CodeGen/ARM/Windows/chkstk.ll b/test/CodeGen/ARM/Windows/chkstk.ll index 9c58fa08d378..cb787e14b5ba 100644 --- a/test/CodeGen/ARM/Windows/chkstk.ll +++ b/test/CodeGen/ARM/Windows/chkstk.ll @@ -16,9 +16,9 @@ entry: ; CHECK-DEFAULT-CODE-MODEL: sub.w sp, sp, r4 ; CHECK-LARGE-CODE-MODEL: check_watermark: -; CHECK-LARGE-CODE-MODEL: movw r4, #1024 ; CHECK-LARGE-CODE-MODEL: movw r12, :lower16:__chkstk ; CHECK-LARGE-CODE-MODEL: movt r12, :upper16:__chkstk +; CHECK-LARGE-CODE-MODEL: movw r4, #1024 ; CHECK-LARGE-CODE-MODEL: blx r12 ; CHECK-LARGE-CODE-MODEL: sub.w sp, sp, r4 diff --git a/test/CodeGen/ARM/Windows/global-minsize.ll b/test/CodeGen/ARM/Windows/global-minsize.ll new file mode 100644 index 000000000000..c0be36caa6c4 --- /dev/null +++ b/test/CodeGen/ARM/Windows/global-minsize.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=thumbv7-windows -filetype asm -o - %s | FileCheck %s + +@i = internal global i32 0, align 4 + +; Function Attrs: minsize +define arm_aapcs_vfpcc i32* @function() #0 { +entry: + ret i32* @i +} + +attributes #0 = { minsize } + +; CHECK: function: +; 
CHECK: movw r0, :lower16:i +; CHECK: movt r0, :upper16:i +; CHECK: bx lr diff --git a/test/CodeGen/ARM/Windows/long-calls.ll b/test/CodeGen/ARM/Windows/long-calls.ll new file mode 100644 index 000000000000..e35f414579af --- /dev/null +++ b/test/CodeGen/ARM/Windows/long-calls.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -arm-long-calls -o - %s \ +; RUN: | FileCheck %s + +declare arm_aapcs_vfpcc void @callee() + +define arm_aapcs_vfpcc void @caller() nounwind { +entry: + tail call void @callee() + ret void +} + +; CHECK-LABEL: caller +; CHECK: ldr [[REG:r[0-9]+]], [[CPI:.LCPI[_0-9]+]] +; CHECK: bx [[REG]] +; CHECK: .align 2 +; CHECK: [[CPI]]: +; CHECK: .long callee + diff --git a/test/CodeGen/ARM/Windows/memset.ll b/test/CodeGen/ARM/Windows/memset.ll index bcf744c909df..500e25e259c6 100644 --- a/test/CodeGen/ARM/Windows/memset.ll +++ b/test/CodeGen/ARM/Windows/memset.ll @@ -10,9 +10,9 @@ entry: unreachable } -; CHECK: movs r1, #0 -; CHECK: mov.w r2, #512 ; CHECK: movw r0, :lower16:source ; CHECK: movt r0, :upper16:source +; CHECK: movs r1, #0 +; CHECK: mov.w r2, #512 ; CHECK: memset diff --git a/test/CodeGen/ARM/Windows/mov32t-bundling.ll b/test/CodeGen/ARM/Windows/mov32t-bundling.ll new file mode 100644 index 000000000000..5f838378fa87 --- /dev/null +++ b/test/CodeGen/ARM/Windows/mov32t-bundling.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s + +@_begin = external global i8 +@_end = external global i8 + +declare arm_aapcs_vfpcc void @force_emission() + +define arm_aapcs_vfpcc void @bundle() { +entry: + br i1 icmp uge (i32 sub (i32 ptrtoint (i8* @_end to i32), i32 ptrtoint (i8* @_begin to i32)), i32 4), label %if.then, label %if.end + +if.then: + tail call arm_aapcs_vfpcc void @force_emission() + br label %if.end + +if.end: + ret void +} + +; CHECK-LABEL: bundle +; CHECK-NOT: subs r0, r1, r0 +; CHECK: movw r0, :lower16:_begin +; CHECK-NEXT: movt r0, :upper16:_begin +; CHECK-NEXT: movw r1, :lower16:_end +; CHECK-NEXT: movt r1, :upper16:_end +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: cmp r0, #4 + diff --git a/test/CodeGen/ARM/Windows/structors.ll b/test/CodeGen/ARM/Windows/structors.ll new file mode 100644 index 000000000000..a1a90265c03a --- /dev/null +++ b/test/CodeGen/ARM/Windows/structors.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple thumbv7-windows-itanium -o - %s | FileCheck %s + +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @function, i8* null }] + +define arm_aapcs_vfpcc void @function() { +entry: + ret void +} + +; CHECK: .section .CRT$XCU,"rd" +; CHECK: .long function + diff --git a/test/CodeGen/ARM/Windows/vla.ll b/test/CodeGen/ARM/Windows/vla.ll new file mode 100644 index 000000000000..56901dee0dfa --- /dev/null +++ b/test/CodeGen/ARM/Windows/vla.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s \ +; RUN: | FileCheck %s -check-prefix CHECK-SMALL-CODE +; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -code-model=large -o - %s \ +; RUN: | FileCheck %s -check-prefix CHECK-LARGE-CODE +; RUN: llc -mtriple=thumbv7-windows-msvc -mcpu=cortex-a9 -o - %s \ +; RUN: | FileCheck %s -check-prefix CHECK-MSVC + +define arm_aapcs_vfpcc i8 @function(i32 %sz, i32 %idx) { +entry: + %vla = alloca i8, i32 %sz, align 1 + %arrayidx = getelementptr inbounds i8* %vla, i32 %idx + %0 = load volatile i8* %arrayidx, align 1 + ret i8 %0 +} + +; CHECK-SMALL-CODE: adds [[R4:r[0-9]+]], #7 +; CHECK-SMALL-CODE: bic [[R4]], 
[[R4]], #7 +; CHECK-SMALL-CODE: lsrs r4, [[R4]], #2 +; CHECK-SMALL-CODE: bl __chkstk +; CHECK-SMALL-CODE: sub.w sp, sp, r4 + +; CHECK-LARGE-CODE: adds [[R4:r[0-9]+]], #7 +; CHECK-LARGE-CODE: bic [[R4]], [[R4]], #7 +; CHECK-LARGE-CODE: lsrs r4, [[R4]], #2 +; CHECK-LARGE-CODE: movw [[IP:r[0-9]+]], :lower16:__chkstk +; CHECK-LARGE-CODE: movt [[IP]], :upper16:__chkstk +; CHECK-LARGE-CODE: blx [[IP]] +; CHECK-LARGE-CODE: sub.w sp, sp, r4 + +; CHECK-MSVC-NOT: __chkstk + diff --git a/test/CodeGen/ARM/aapcs-hfa-code.ll b/test/CodeGen/ARM/aapcs-hfa-code.ll new file mode 100644 index 000000000000..396e83816ccf --- /dev/null +++ b/test/CodeGen/ARM/aapcs-hfa-code.ll @@ -0,0 +1,111 @@ +; RUN: llc < %s -mtriple=armv7-linux-gnueabihf -o - | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7em-none-eabi -mcpu=cortex-m4 | FileCheck %s --check-prefix=CHECK-M4F + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64" + +define arm_aapcs_vfpcc void @test_1float({ float } %a) { + call arm_aapcs_vfpcc void @test_1float({ float } { float 1.0 }) + ret void + +; CHECK-LABEL: test_1float: +; CHECK-DAG: vmov.f32 s0, #1.{{0+}}e+00 +; CHECK: bl test_1float + +; CHECK-M4F-LABEL: test_1float: +; CHECK-M4F-DAG: vmov.f32 s0, #1.{{0+}}e+00 +; CHECK-M4F: bl test_1float +} + +define arm_aapcs_vfpcc void @test_2float({ float, float } %a) { + call arm_aapcs_vfpcc void @test_2float({ float, float } { float 1.0, float 2.0 }) + ret void + +; CHECK-LABEL: test_2float: +; CHECK-DAG: vmov.f32 s0, #1.{{0+}}e+00 +; CHECK-DAG: vmov.f32 s1, #2.{{0+}}e+00 +; CHECK: bl test_2float + +; CHECK-M4F-LABEL: test_2float: +; CHECK-M4F-DAG: vmov.f32 s0, #1.{{0+}}e+00 +; CHECK-M4F-DAG: vmov.f32 s1, #2.{{0+}}e+00 +; CHECK-M4F: bl test_2float +} + +define arm_aapcs_vfpcc void @test_3float({ float, float, float } %a) { + call arm_aapcs_vfpcc void @test_3float({ float, float, float } { float 1.0, float 2.0, float 3.0 }) + ret void + +; CHECK-LABEL: test_3float: +; CHECK-DAG: vmov.f32 s0, #1.{{0+}}e+00 +; CHECK-DAG: vmov.f32 s1, #2.{{0+}}e+00 +; CHECK-DAG: vmov.f32 s2, #3.{{0+}}e+00 +; CHECK: bl test_3float + +; CHECK-M4F-LABEL: test_3float: +; CHECK-M4F-DAG: vmov.f32 s0, #1.{{0+}}e+00 +; CHECK-M4F-DAG: vmov.f32 s1, #2.{{0+}}e+00 +; CHECK-M4F-DAG: vmov.f32 s2, #3.{{0+}}e+00 +; CHECK-M4F: bl test_3float +} + +define arm_aapcs_vfpcc void @test_1double({ double } %a) { +; CHECK-LABEL: test_1double: +; CHECK-DAG: vmov.f64 d0, #1.{{0+}}e+00 +; CHECK: bl test_1double + +; CHECK-M4F-LABEL: test_1double: +; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0 +; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 +; CHECK-M4F: movt [[ONEHI]], #16368 +; CHECK-M4F-DAG: vmov s0, [[ONELO]] +; CHECK-M4F-DAG: vmov s1, [[ONEHI]] +; CHECK-M4F: bl test_1double + + call arm_aapcs_vfpcc void @test_1double({ double } { double 1.0 }) + ret void +} + +; Final double argument might be put in s15 & [sp] if we're careless. It should +; go all on the stack. 
+define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3 x float], double %a) { +; CHECK-LABEL: test_1double_nosplit: +; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0 +; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0 +; CHECK-DAG: movt [[ONEHI]], #16368 +; CHECK: strd [[ONELO]], [[ONEHI]], [sp] +; CHECK: bl test_1double_nosplit + +; CHECK-M4F-LABEL: test_1double_nosplit: +; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 +; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0 +; CHECK-M4F: movt [[ONEHI]], #16368 +; CHECK-M4F-DAG: str [[ONELO]], [sp] +; CHECK-M4F-DAG: str [[ONEHI]], [sp, #4] +; CHECK-M4F: bl test_1double_nosplit + call arm_aapcs_vfpcc void @test_1double_nosplit([4 x float] undef, [4 x double] undef, [3 x float] undef, double 1.0) + ret void +} + +; Final double argument might go at [sp, #4] if we're careless. Should go at +; [sp, #8] to preserve alignment. +define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double], float, double) { + call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0) + +; CHECK-LABEL: test_1double_misaligned: +; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0 +; CHECK-DAG: mov r[[BASE:[0-9]+]], sp +; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0 +; CHECK-DAG: movt [[ONEHI]], #16368 +; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]! +; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4] + +; CHECK-M4F-LABEL: test_1double_misaligned: +; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 +; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0 +; CHECK-M4F: movt [[ONEHI]], #16368 +; CHECK-M4F-DAG: str [[ONELO]], [sp, #8] +; CHECK-M4F-DAG: str [[ONEHI]], [sp, #12] +; CHECK-M4F: bl test_1double_misaligned + + ret void +} diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll index 4de305b93bdb..f55ae10b247d 100644 --- a/test/CodeGen/ARM/aliases.ll +++ b/test/CodeGen/ARM/aliases.ll @@ -29,7 +29,7 @@ define i32 @foo_f() { @bar_i = alias internal i32* @bar -@A = alias i64, i32* @bar +@A = alias bitcast (i32* @bar to i64*) define i32 @test() { entry: diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll index bf827d6b66e1..14eef832e693 100644 --- a/test/CodeGen/ARM/arm-and-tst-peephole.ll +++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll @@ -1,8 +1,8 @@ -; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck -check-prefix=ARM %s -; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck -check-prefix=THUMB %s -; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \ +; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=ARM %s +; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=THUMB %s +; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \ ; RUN: | FileCheck -check-prefix=T2 %s -; RUN: llc -mtriple=thumbv8-eabi %s -o - | FileCheck -check-prefix=V8 %s +; RUN: llc -mtriple=thumbv8-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=V8 %s ; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified. 
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll index 9913f30712b6..462c1859dc91 100644 --- a/test/CodeGen/ARM/atomic-64bit.ll +++ b/test/CodeGen/ARM/atomic-64bit.ll @@ -171,9 +171,10 @@ define i64 @test6(i64* %ptr, i64 %val) { define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) { ; CHECK-LABEL: test7: -; CHECK: dmb {{ish$}} +; CHECK-DAG: mov [[VAL1LO:r[0-9]+]], r1 +; CHECK-DAG: dmb {{ish$}} ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] -; CHECK-LE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], r1 +; CHECK-LE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], [[VAL1LO]] ; CHECK-LE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG2]], r2 ; CHECK-BE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG2]], r2 ; CHECK-BE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG1]], r1 @@ -189,16 +190,17 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) { ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] ; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2 ; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3 -; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]] -; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]] -; CHECK-THUMB: orrs [[MISMATCH_HI]], [[MISMATCH_LO]] +; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]], r2 +; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]], r3 +; CHECK-THUMB-LE: orrs [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB: bne ; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} ; CHECK-THUMB: cmp ; CHECK-THUMB: bne ; CHECK-THUMB: dmb {{ish$}} - %r = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst + %pair = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst + %r = extractvalue { i64, i1 } %pair, 0 ret i64 %r } diff --git a/test/CodeGen/ARM/atomic-cmp.ll b/test/CodeGen/ARM/atomic-cmp.ll index a4738077b1a1..629b16d86ab5 100644 --- a/test/CodeGen/ARM/atomic-cmp.ll +++ b/test/CodeGen/ARM/atomic-cmp.ll @@ -11,5 +11,6 @@ define i8 @t(i8* %a, i8 %b, i8 %c) nounwind { ; T2: ldrexb ; T2: strexb %tmp0 = cmpxchg i8* %a, i8 %b, i8 %c monotonic monotonic - ret i8 %tmp0 + %tmp1 = extractvalue { i8, i1 } %tmp0, 0 + ret i8 %tmp1 } diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll index 45a263d323b3..49342d2d1bfe 100644 --- a/test/CodeGen/ARM/atomic-load-store.ll +++ b/test/CodeGen/ARM/atomic-load-store.ll @@ -5,13 +5,13 @@ ; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4 define void @test1(i32* %ptr, i32 %val1) { -; ARM: test1 +; ARM-LABEL: test1 ; ARM: dmb {{ish$}} ; ARM-NEXT: str ; ARM-NEXT: dmb {{ish$}} -; THUMBONE: test1 +; THUMBONE-LABEL: test1 ; THUMBONE: __sync_lock_test_and_set_4 -; THUMBTWO: test1 +; THUMBTWO-LABEL: test1 ; THUMBTWO: dmb {{ish$}} ; THUMBTWO-NEXT: str ; THUMBTWO-NEXT: dmb {{ish$}} @@ -20,12 +20,12 @@ define void @test1(i32* %ptr, i32 %val1) { } define i32 @test2(i32* %ptr) { -; ARM: test2 +; ARM-LABEL: test2 ; ARM: ldr ; ARM-NEXT: dmb {{ish$}} -; THUMBONE: test2 +; THUMBONE-LABEL: test2 ; THUMBONE: __sync_val_compare_and_swap_4 -; THUMBTWO: test2 +; THUMBTWO-LABEL: test2 ; THUMBTWO: ldr ; THUMBTWO-NEXT: dmb {{ish$}} %val = load atomic i32* %ptr seq_cst, align 4 @@ -33,22 +33,35 @@ define i32 @test2(i32* %ptr) { } define void @test3(i8* %ptr1, i8* %ptr2) { -; ARM: test3 +; ARM-LABEL: test3 +; ARM-NOT: dmb ; ARM: ldrb +; ARM-NOT: dmb ; ARM: strb -; THUMBTWO: test3 +; ARM-NOT: dmb +; ARM: bx lr + +; THUMBTWO-LABEL: test3 +; THUMBTWO-NOT: dmb ; THUMBTWO: ldrb +; THUMBTWO-NOT: 
dmb ; THUMBTWO: strb -; THUMBONE: test3 +; THUMBTWO-NOT: dmb +; THUMBTWO: bx lr + +; THUMBONE-LABEL: test3 +; THUMBONE-NOT: dmb ; THUMBONE: ldrb +; THUMBONE-NOT: dmb ; THUMBONE: strb +; THUMBONE-NOT: dmb %val = load atomic i8* %ptr1 unordered, align 1 store atomic i8 %val, i8* %ptr2 unordered, align 1 ret void } define void @test4(i8* %ptr1, i8* %ptr2) { -; THUMBONE: test4 +; THUMBONE-LABEL: test4 ; THUMBONE: ___sync_val_compare_and_swap_1 ; THUMBONE: ___sync_lock_test_and_set_1 %val = load atomic i8* %ptr1 seq_cst, align 1 @@ -57,14 +70,14 @@ define void @test4(i8* %ptr1, i8* %ptr2) { } define i64 @test_old_load_64bit(i64* %p) { -; ARMV4: test_old_load_64bit +; ARMV4-LABEL: test_old_load_64bit ; ARMV4: ___sync_val_compare_and_swap_8 %1 = load atomic i64* %p seq_cst, align 8 ret i64 %1 } define void @test_old_store_64bit(i64* %p, i64 %v) { -; ARMV4: test_old_store_64bit +; ARMV4-LABEL: test_old_store_64bit ; ARMV4: ___sync_lock_test_and_set_8 store atomic i64 %v, i64* %p seq_cst, align 8 ret void diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll index ac8e949cf18c..b988242ae57e 100644 --- a/test/CodeGen/ARM/atomic-op.ll +++ b/test/CodeGen/ARM/atomic-op.ll @@ -198,7 +198,8 @@ entry: define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-LABEL: test_cmpxchg_fail_order: - %oldval = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic + %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic + %oldval = extractvalue { i32, i1 } %pair, 0 ; CHECK: dmb ish ; CHECK: [[LOOP_BB:\.?LBB[0-9]+_1]]: ; CHECK: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]] @@ -216,7 +217,8 @@ define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) { define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) { ; CHECK-LABEL: test_cmpxchg_fail_order1: - %oldval = cmpxchg i32* %addr, i32 %desired, i32 %new acquire acquire + %pair = cmpxchg i32* %addr, i32 %desired, i32 %new acquire acquire + %oldval = extractvalue { i32, i1 } %pair, 0 ; CHECK-NOT: dmb ish ; CHECK: [[LOOP_BB:\.?LBB[0-9]+_1]]: ; CHECK: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]] diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll index a39565e515c7..7072aaaf733d 100644 --- a/test/CodeGen/ARM/atomic-ops-v8.ll +++ b/test/CodeGen/ARM/atomic-ops-v8.ll @@ -1051,7 +1051,8 @@ define void @test_atomic_load_umax_i64(i64 %offset) nounwind { define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i8: - %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire + %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire + %old = extractvalue { i8, i1 } %pair, 0 ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8 @@ -1077,7 +1078,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i16: - %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst + %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst + %old = extractvalue { i16, i1 } %pair, 0 ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16 @@ -1103,7 +1105,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounw define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i32: - %old = cmpxchg i32* @var32, i32 %wanted, 
i32 %new release monotonic + %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic + %old = extractvalue { i32, i1 } %pair, 0 store i32 %old, i32* @var32 ; CHECK-NOT: dmb ; CHECK-NOT: mcr @@ -1130,7 +1133,8 @@ define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; CHECK-LABEL: test_atomic_cmpxchg_i64: - %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic + %pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic + %old = extractvalue { i64, i1 } %pair, 0 ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64 diff --git a/test/CodeGen/ARM/big-endian-neon-trunc-store.ll b/test/CodeGen/ARM/big-endian-neon-trunc-store.ll new file mode 100644 index 000000000000..65147ad5d3f7 --- /dev/null +++ b/test/CodeGen/ARM/big-endian-neon-trunc-store.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple armeb-eabi -mattr v7,neon -o - | FileCheck %s + +define void @vector_trunc_store_2i64_to_2i16( <2 x i64>* %loadaddr, <2 x i16>* %storeaddr ) { +; CHECK-LABEL: vector_trunc_store_2i64_to_2i16: +; CHECK: vmovn.i64 [[REG:d[0-9]+]] +; CHECK: vrev32.16 [[REG]], [[REG]] +; CHECK: vuzp.16 [[REG]], [[REG2:d[0-9]+]] +; CHECK: vrev32.16 [[REG]], [[REG2]] + %1 = load <2 x i64>* %loadaddr + %2 = trunc <2 x i64> %1 to <2 x i16> + store <2 x i16> %2, <2 x i16>* %storeaddr + ret void +} + +define void @vector_trunc_store_4i32_to_4i8( <4 x i32>* %loadaddr, <4 x i8>* %storeaddr ) { +; CHECK-LABEL: vector_trunc_store_4i32_to_4i8: +; CHECK: vmovn.i32 [[REG:d[0-9]+]] +; CHECK: vrev16.8 [[REG]], [[REG]] +; CHECK: vuzp.8 [[REG]], [[REG2:d[0-9]+]] +; CHECK: vrev32.8 [[REG]], [[REG2]] + %1 = load <4 x i32>* %loadaddr + %2 = trunc <4 x i32> %1 to <4 x i8> + store <4 x i8> %2, <4 x i8>* %storeaddr + ret void +} + diff --git a/test/CodeGen/ARM/big-endian-ret-f64.ll b/test/CodeGen/ARM/big-endian-ret-f64.ll new file mode 100644 index 000000000000..614bfc0a5b3a --- /dev/null +++ b/test/CodeGen/ARM/big-endian-ret-f64.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple=armebv7a-eabi %s -O0 -o - | FileCheck %s +; RUN: llc -mtriple=armebv8a-eabi %s -O0 -o - | FileCheck %s + +define double @fn() { +; CHECK-LABEL: fn +; CHECK: ldr r0, [sp] +; CHECK: ldr r1, [sp, #4] + %r = alloca double, align 8 + %1 = load double* %r, align 8 + ret double %1 +} + diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index 3e825e8d7d6f..d75d55d0fa68 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -33,6 +33,11 @@ ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2,-vfp3,-vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-NOFPU ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,+d16,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4 +; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC +; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER +; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=default | FileCheck %s --check-prefix=RELOC-OTHER +; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=dynamic-no-pic | FileCheck %s --check-prefix=RELOC-OTHER +; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s 
--check-prefix=RELOC-OTHER ; XSCALE: .eabi_attribute 6, 5 ; XSCALE: .eabi_attribute 8, 1 @@ -453,6 +458,11 @@ ; CORTEX-A57-NOT: .eabi_attribute 44 ; CORTEX-A57: .eabi_attribute 68, 3 +; RELOC-PIC: .eabi_attribute 15, 1 +; RELOC-PIC: .eabi_attribute 16, 1 +; RELOC-PIC: .eabi_attribute 17, 2 +; RELOC-OTHER: .eabi_attribute 17, 1 + define i32 @f(i64 %z) { ret i32 0 } diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll index 40694bfca5cd..a35fd7476465 100644 --- a/test/CodeGen/ARM/call-tc.ll +++ b/test/CodeGen/ARM/call-tc.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 | FileCheck %s -check-prefix=CHECKV6 -; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D -; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 \ +; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=CHECKV6 +; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=CHECKT2D +; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 -arm-atomic-cfg-tidy=0 \ ; RUN: | FileCheck %s -check-prefix=CHECKELF ; Enable tailcall optimization for iOS 5.0 diff --git a/test/CodeGen/ARM/cmpxchg-idioms.ll b/test/CodeGen/ARM/cmpxchg-idioms.ll new file mode 100644 index 000000000000..fb88575cab3b --- /dev/null +++ b/test/CodeGen/ARM/cmpxchg-idioms.ll @@ -0,0 +1,107 @@ +; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s | FileCheck %s + +define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) { +; CHECK-LABEL: test_return: + +; CHECK: dmb ishst + +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldrex [[LOADED:r[0-9]+]], [r0] +; CHECK: cmp [[LOADED]], r1 +; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]] + +; CHECK: strex [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0] +; CHECK: cmp [[STATUS]], #0 +; CHECK: bne [[LOOP]] + +; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}} +; CHECK: movs r0, #1 +; CHECK: dmb ish +; CHECK: bx lr + +; CHECK: [[FAILED]]: +; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}} +; CHECK: movs r0, #0 +; CHECK: dmb ish +; CHECK: bx lr + + %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + %conv = zext i1 %success to i32 + ret i32 %conv +} + +define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) { +; CHECK-LABEL: test_return_bool: + +; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1 +; CHECK: dmb ishst + +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0] +; CHECK: cmp [[LOADED]], [[OLDBYTE]] +; CHECK: bne [[FAIL:LBB[0-9]+_[0-9]+]] + +; CHECK: strexb [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0] +; CHECK: cmp [[STATUS]], #0 +; CHECK: bne [[LOOP]] + + ; FIXME: this eor is redundant. Need to teach DAG combine that. 
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}} +; CHECK: movs [[TMP:r[0-9]+]], #1 +; CHECK: eor r0, [[TMP]], #1 +; CHECK: bx lr + +; CHECK: [[FAIL]]: +; CHECK: movs [[TMP:r[0-9]+]], #0 +; CHECK: eor r0, [[TMP]], #1 +; CHECK: bx lr + + + %pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic + %success = extractvalue { i8, i1 } %pair, 1 + %failure = xor i1 %success, 1 + ret i1 %failure +} + +define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) { +; CHECK-LABEL: test_conditional: + +; CHECK: dmb ishst + +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldrex [[LOADED:r[0-9]+]], [r0] +; CHECK: cmp [[LOADED]], r1 +; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]] + +; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: cmp [[STATUS]], #0 +; CHECK: bne [[LOOP]] + +; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}} +; CHECK: dmb ish +; CHECK: b.w _bar + +; CHECK: [[FAILED]]: +; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}} +; CHECK: dmb ish +; CHECK: b.w _baz + + %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + br i1 %success, label %true, label %false + +true: + tail call void @bar() #2 + br label %end + +false: + tail call void @baz() #2 + br label %end + +end: + ret void +} + +declare void @bar() +declare void @baz() diff --git a/test/CodeGen/ARM/cmpxchg-weak.ll b/test/CodeGen/ARM/cmpxchg-weak.ll new file mode 100644 index 000000000000..126e33062623 --- /dev/null +++ b/test/CodeGen/ARM/cmpxchg-weak.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s + +define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) { +; CHECK-LABEL: test_cmpxchg_weak: + + %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic + %oldval = extractvalue { i32, i1 } %pair, 0 +; CHECK: dmb ish +; CHECK: ldrex [[LOADED:r[0-9]+]], [r0] +; CHECK: cmp [[LOADED]], r1 +; CHECK: strexeq [[SUCCESS:r[0-9]+]], r2, [r0] +; CHECK: cmpeq [[SUCCESS]], #0 +; CHECK: bne [[DONE:LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: [[DONE]]: +; CHECK: str r3, [r0] +; CHECK: bx lr + + store i32 %oldval, i32* %addr + ret void +} + + +define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) { +; CHECK-LABEL: test_cmpxchg_weak_to_bool: + + %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic + %success = extractvalue { i32, i1 } %pair, 1 + +; CHECK: dmb ish +; CHECK: mov r0, #0 +; CHECK: ldrex [[LOADED:r[0-9]+]], [r1] +; CHECK: cmp [[LOADED]], r2 +; CHECK: strexeq [[STATUS:r[0-9]+]], r3, [r1] +; CHECK: cmpeq [[STATUS]], #0 +; CHECK: bne [[DONE:LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: mov r0, #1 +; CHECK: [[DONE]]: +; CHECK: bx lr + + ret i1 %success +} diff --git a/test/CodeGen/ARM/data-in-code-annotations.ll b/test/CodeGen/ARM/data-in-code-annotations.ll index da70178225eb..5eb81b24de0f 100644 --- a/test/CodeGen/ARM/data-in-code-annotations.ll +++ b/test/CodeGen/ARM/data-in-code-annotations.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=armv7-apple-darwin -arm-atomic-cfg-tidy=0 | FileCheck %s define double @f1() nounwind { ; CHECK-LABEL: f1: diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll index 695a20b5976d..eb0120f7c1bb 100644 --- a/test/CodeGen/ARM/fold-stack-adjust.ll +++ b/test/CodeGen/ARM/fold-stack-adjust.ll @@ -12,11 +12,11 @@ declare void @bar(i8*) define void @check_simple() minsize { ; CHECK-LABEL: check_simple: -; CHECK: push {r3, r4, r5, r6, r7, lr} +; 
CHECK: push.w {r7, r8, r9, r10, r11, lr} ; CHECK-NOT: sub sp, sp, ; ... ; CHECK-NOT: add sp, sp, -; CHECK: pop {r0, r1, r2, r3, r7, pc} +; CHECK: pop.w {r0, r1, r2, r3, r11, pc} ; CHECK-T1-LABEL: check_simple: ; CHECK-T1: push {r3, r4, r5, r6, r7, lr} @@ -44,11 +44,11 @@ define void @check_simple() minsize { define void @check_simple_too_big() minsize { ; CHECK-LABEL: check_simple_too_big: -; CHECK: push {r7, lr} +; CHECK: push.w {r11, lr} ; CHECK: sub sp, ; ... ; CHECK: add sp, -; CHECK: pop {r7, pc} +; CHECK: pop.w {r11, pc} %var = alloca i8, i32 64 call void @bar(i8* %var) ret void @@ -93,11 +93,11 @@ define void @check_vfp_fold() minsize { ; folded in except that doing so would clobber the value being returned. define i64 @check_no_return_clobber() minsize { ; CHECK-LABEL: check_no_return_clobber: -; CHECK: push {r1, r2, r3, r4, r5, r6, r7, lr} +; CHECK: push.w {r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NOT: sub sp, ; ... ; CHECK: add sp, #24 -; CHECK: pop {r7, pc} +; CHECK: pop.w {r11, pc} ; Just to keep iOS FileCheck within previous function: ; CHECK-IOS-LABEL: check_no_return_clobber: @@ -176,9 +176,9 @@ define void @test_varsize(...) minsize { ; CHECK-LABEL: test_varsize: ; CHECK: sub sp, #16 -; CHECK: push {r5, r6, r7, lr} +; CHECK: push.w {r9, r10, r11, lr} ; ... -; CHECK: pop.w {r2, r3, r7, lr} +; CHECK: pop.w {r2, r3, r11, lr} ; CHECK: add sp, #16 ; CHECK: bx lr diff --git a/test/CodeGen/ARM/fptoint.ll b/test/CodeGen/ARM/fptoint.ll index c7217560f48b..f50d0b96fe99 100644 --- a/test/CodeGen/ARM/fptoint.ll +++ b/test/CodeGen/ARM/fptoint.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s +; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 -mattr=+v6,+vfp2 %s -o - | FileCheck %s @i = weak global i32 0 ; [#uses=2] @u = weak global i32 0 ; [#uses=2] diff --git a/test/Transforms/GlobalMerge/ARM/arm.ll b/test/CodeGen/ARM/global-merge-1.ll similarity index 93% rename from test/Transforms/GlobalMerge/ARM/arm.ll rename to test/CodeGen/ARM/global-merge-1.ll index 8c77de62eced..341597e6188c 100644 --- a/test/Transforms/GlobalMerge/ARM/arm.ll +++ b/test/CodeGen/ARM/global-merge-1.ll @@ -1,9 +1,9 @@ ; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s +; RUN: llc %s -O0 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s +; RUN: llc %s -O0 -o - -enable-global-merge=true | FileCheck -check-prefix=NO-MERGE %s ; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s -; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s -; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s +; RUN: llc %s -O1 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s +; RUN: llc %s -O1 -o - -enable-global-merge=true | FileCheck -check-prefix=MERGE %s ; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2 ; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2 diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll index cd8a56138621..a994d3d01ae8 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -o /dev/null 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -arm-atomic-cfg-tidy=0 -o /dev/null 2>&1 | FileCheck %s %struct.S = type { i8* 
(i8*)*, [1 x i8] } define internal zeroext i8 @bar(%struct.S* %x, %struct.S* nocapture %y) nounwind readonly { diff --git a/test/CodeGen/ARM/ifcvt10.ll b/test/CodeGen/ARM/ifcvt10.ll index 26c72723b287..509c182fc973 100644 --- a/test/CodeGen/ARM/ifcvt10.ll +++ b/test/CodeGen/ARM/ifcvt10.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a9 | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios -arm-atomic-cfg-tidy=0 -mcpu=cortex-a9 | FileCheck %s ; rdar://8402126 ; Make sure if-converter is not predicating vldmia and ldmia. These are ; micro-coded and would have long issue latency even if predicated on diff --git a/test/CodeGen/ARM/indirectbr-3.ll b/test/CodeGen/ARM/indirectbr-3.ll index 5a9c45902edc..291fedb81104 100644 --- a/test/CodeGen/ARM/indirectbr-3.ll +++ b/test/CodeGen/ARM/indirectbr-3.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s ; If ARMBaseInstrInfo::AnalyzeBlocks returns the wrong value, which was possible ; for blocks with indirect branches, the IfConverter could end up deleting diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll index c5be6674da1b..cb67dd929f41 100644 --- a/test/CodeGen/ARM/interrupt-attr.ll +++ b/test/CodeGen/ARM/interrupt-attr.ll @@ -35,15 +35,15 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" { ; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to ; appropriate sentinel so no special return needed). ; CHECK-M-LABEL: irq_fn: -; CHECK-M: push {r4, r6, r7, lr} -; CHECK-M: add r7, sp, #8 +; CHECK-M: push.w {r4, r10, r11, lr} +; CHECK-M: add.w r11, sp, #8 ; CHECK-M: mov r4, sp ; CHECK-M: bic r4, r4, #7 ; CHECK-M: mov sp, r4 ; CHECK-M: blx _bar -; CHECK-M: sub.w r4, r7, #8 +; CHECK-M: sub.w r4, r11, #8 ; CHECK-M: mov sp, r4 -; CHECK-M: pop {r4, r6, r7, pc} +; CHECK-M: pop.w {r4, r10, r11, pc} call arm_aapcscc void @bar() ret void diff --git a/test/CodeGen/ARM/jump_tables.ll b/test/CodeGen/ARM/jump_tables.ll new file mode 100644 index 000000000000..907a86c25387 --- /dev/null +++ b/test/CodeGen/ARM/jump_tables.ll @@ -0,0 +1,32 @@ +; RUN: llc <%s -mtriple=arm-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=ARM %s +; RUN: llc <%s -mtriple=thumb-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=THUMB %s + +define void @indirect_fun() unnamed_addr jumptable { + ret void +} +define void ()* @get_fun() { + ret void ()* @indirect_fun + +; ARM: ldr r0, [[LABEL:.*]] +; ARM: mov pc, lr +; ARM: [[LABEL]]: +; ARM: .long __llvm_jump_instr_table_0_1 + +; THUMB: ldr r0, [[LABEL:.*]] +; THUMB: bx lr +; THUMB: [[LABEL]]: +; THUMB: .long __llvm_jump_instr_table_0_1 +} + +; ARM: .globl __llvm_jump_instr_table_0_1 +; ARM: .align 3 +; ARM: .type __llvm_jump_instr_table_0_1,%function +; ARM: __llvm_jump_instr_table_0_1: +; ARM: b indirect_fun(PLT) + +; THUMB: .globl __llvm_jump_instr_table_0_1 +; THUMB: .align 3 +; THUMB: .thumb_func +; THUMB: .type __llvm_jump_instr_table_0_1,%function +; THUMB: __llvm_jump_instr_table_0_1: +; THUMB: b indirect_fun(PLT) diff --git a/test/CodeGen/ARM/ldstrex-m.ll b/test/CodeGen/ARM/ldstrex-m.ll new file mode 100644 index 000000000000..b50699f4cde6 --- /dev/null +++ b/test/CodeGen/ARM/ldstrex-m.ll @@ -0,0 +1,59 @@ +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 | FileCheck %s + +; CHECK-LABEL: f0: +; CHECK-NOT: ldrexd +define i64 @f0(i64* %p) nounwind readonly { +entry: + %0 = load atomic 
i64* %p seq_cst, align 8 + ret i64 %0 +} + +; CHECK-LABEL: f1: +; CHECK-NOT: strexd +define void @f1(i64* %p) nounwind readonly { +entry: + store atomic i64 0, i64* %p seq_cst, align 8 + ret void +} + +; CHECK-LABEL: f2: +; CHECK-NOT: ldrexd +; CHECK-NOT: strexd +define i64 @f2(i64* %p) nounwind readonly { +entry: + %0 = atomicrmw add i64* %p, i64 1 seq_cst + ret i64 %0 +} + +; CHECK-LABEL: f3: +; CHECK: ldr +define i32 @f3(i32* %p) nounwind readonly { +entry: + %0 = load atomic i32* %p seq_cst, align 4 + ret i32 %0 +} + +; CHECK-LABEL: f4: +; CHECK: ldrb +define i8 @f4(i8* %p) nounwind readonly { +entry: + %0 = load atomic i8* %p seq_cst, align 4 + ret i8 %0 +} + +; CHECK-LABEL: f5: +; CHECK: str +define void @f5(i32* %p) nounwind readonly { +entry: + store atomic i32 0, i32* %p seq_cst, align 4 + ret void +} + +; CHECK-LABEL: f6: +; CHECK: ldrex +; CHECK: strex +define i32 @f6(i32* %p) nounwind readonly { +entry: + %0 = atomicrmw add i32* %p, i32 1 seq_cst + ret i32 %0 +} diff --git a/test/CodeGen/ARM/lit.local.cfg b/test/CodeGen/ARM/lit.local.cfg index 8a3ba96497e7..98c6700c209d 100644 --- a/test/CodeGen/ARM/lit.local.cfg +++ b/test/CodeGen/ARM/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'ARM' in targets: +if not 'ARM' in config.root.targets: config.unsupported = True diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll index 1dafa0067b5a..3ad60d47b53b 100644 --- a/test/CodeGen/ARM/lsr-unfolded-offset.ll +++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll @@ -1,4 +1,4 @@ -; RUN: llc -regalloc=greedy < %s | FileCheck %s +; RUN: llc -regalloc=greedy -arm-atomic-cfg-tidy=0 < %s | FileCheck %s ; LSR shouldn't introduce more induction variables than needed, increasing ; register pressure and therefore spilling. There is more room for improvement diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll index 26adf0c2ad04..bb2d42ca9ede 100644 --- a/test/CodeGen/ARM/misched-copy-arm.ll +++ b/test/CodeGen/ARM/misched-copy-arm.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched %s -o - 2>&1 | FileCheck %s +; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s ; ; Loop counter copies should be eliminated. ; There is also a MUL here, but we don't care where it is scheduled. diff --git a/test/CodeGen/ARM/none-macho.ll b/test/CodeGen/ARM/none-macho.ll index 2795b8cd2d15..60c21716dc35 100644 --- a/test/CodeGen/ARM/none-macho.ll +++ b/test/CodeGen/ARM/none-macho.ll @@ -48,8 +48,8 @@ define i32 @test_frame_ptr() { ; CHECK-LABEL: test_frame_ptr: call void @test_trap() - ; Frame pointer is r7 as for Darwin -; CHECK: mov r7, sp + ; Frame pointer is r11. +; CHECK: mov r11, sp ret i32 42 } @@ -63,11 +63,9 @@ define void @test_two_areas(%big_arr* %addr) { ; This goes with the choice of r7 as FP (largely). FP and LR have to be stored ; consecutively on the stack for the frame record to be valid, which means we ; need the 2 register-save areas employed by iOS. -; CHECK-NON-FAST: push {r4, r5, r6, r7, lr} -; CHECK-NON-FAST: push.w {r8, r9, r10, r11} +; CHECK-NON-FAST: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ... 
-; CHECK-NON-FAST: pop.w {r8, r9, r10, r11} -; CHECK-NON-FAST: pop {r4, r5, r6, r7, pc} +; CHECK-NON-FAST: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ret void } diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll index b245674c3c91..feed5ad2830a 100644 --- a/test/CodeGen/ARM/reg_sequence.ll +++ b/test/CodeGen/ARM/reg_sequence.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -regalloc=basic | FileCheck %s ; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's. %struct.int16x8_t = type { <8 x i16> } diff --git a/test/CodeGen/ARM/segmented-stacks.ll b/test/CodeGen/ARM/segmented-stacks.ll index a7804b900a50..9873bf332948 100644 --- a/test/CodeGen/ARM/segmented-stacks.ll +++ b/test/CodeGen/ARM/segmented-stacks.ll @@ -57,6 +57,8 @@ define void @test_basic() #0 { define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { %addend = load i32 * %closure %result = add i32 %other, %addend + %mem = alloca i32, i32 10 + call void @dummy_use (i32* %mem, i32 10) ret i32 %result ; ARM-linux: test_nested: @@ -68,7 +70,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { ; ARM-linux-NEXT: cmp r4, r5 ; ARM-linux-NEXT: blo .LBB1_2 -; ARM-linux: mov r4, #0 +; ARM-linux: mov r4, #56 ; ARM-linux-NEXT: mov r5, #0 ; ARM-linux-NEXT: stmdb sp!, {lr} ; ARM-linux-NEXT: bl __morestack @@ -87,7 +89,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { ; ARM-android-NEXT: cmp r4, r5 ; ARM-android-NEXT: blo .LBB1_2 -; ARM-android: mov r4, #0 +; ARM-android: mov r4, #56 ; ARM-android-NEXT: mov r5, #0 ; ARM-android-NEXT: stmdb sp!, {lr} ; ARM-android-NEXT: bl __morestack @@ -234,4 +236,14 @@ define fastcc void @test_fastcc_large() #0 { } +define void @test_nostack() #0 { + ret void + +; ARM-linux-LABEL: test_nostack: +; ARM-linux-NOT: bl __morestack + +; ARM-android-LABEL: test_nostack: +; ARM-android-NOT: bl __morestack +} + attributes #0 = { "split-stack" } diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll index b9246635e408..4fa97ea5b689 100644 --- a/test/CodeGen/ARM/spill-q.ll +++ b/test/CodeGen/ARM/spill-q.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon -arm-atomic-cfg-tidy=0 | FileCheck %s ; PR4789 %bar = type { float, float, float } diff --git a/test/CodeGen/ARM/struct-byval-frame-index.ll b/test/CodeGen/ARM/struct-byval-frame-index.ll index 465ee1218fda..0fd55ec6c943 100644 --- a/test/CodeGen/ARM/struct-byval-frame-index.ll +++ b/test/CodeGen/ARM/struct-byval-frame-index.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=cortex-a15 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mcpu=cortex-a15 -verify-machineinstrs -arm-atomic-cfg-tidy=0 | FileCheck %s ; Check a spill right after a function call with large struct byval is correctly ; generated. diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll index 8da875fef272..01df3b42d107 100644 --- a/test/CodeGen/ARM/twoaddrinstr.ll +++ b/test/CodeGen/ARM/twoaddrinstr.ll @@ -1,5 +1,5 @@ ; Tests for the two-address instruction pass. 
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 -arm-atomic-cfg-tidy=0 %s -o - | FileCheck %s define void @PR13378() nounwind { ; This was orriginally a crasher trying to schedule the instructions. diff --git a/test/CodeGen/ARM/undefined.ll b/test/CodeGen/ARM/undefined.ll new file mode 100644 index 000000000000..86422fb54412 --- /dev/null +++ b/test/CodeGen/ARM/undefined.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple armv7-eabi -o - %s | FileCheck %s +; RUN: llc -mtriple thumbv6m-eabi -o - %s | FileCheck %s +; RUN: llc -mtriple thumbv7-eabi -o - %s | FileCheck %s + +declare void @llvm.arm.undefined(i32) nounwind + +define void @undefined_trap() { +entry: + tail call void @llvm.arm.undefined(i32 254) + ret void +} + +; CHECK-LABEL: undefined_trap +; CHECK: udf #254 diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll index f18b49822847..d901a7461fc8 100644 --- a/test/CodeGen/ARM/va_arg.ll +++ b/test/CodeGen/ARM/va_arg.ll @@ -24,13 +24,13 @@ entry: ; CHECK-NOT: bfc ; CHECK: bx lr -define double @test2(i32 %a, i32 %b, ...) nounwind optsize { +define double @test2(i32 %a, i32* %b, ...) nounwind optsize { entry: %ap = alloca i8*, align 4 ; [#uses=3] %ap1 = bitcast i8** %ap to i8* ; [#uses=2] call void @llvm.va_start(i8* %ap1) %0 = va_arg i8** %ap, i32 ; [#uses=0] - store i32 %0, i32* undef + store i32 %0, i32* %b %1 = va_arg i8** %ap, double ; [#uses=1] call void @llvm.va_end(i8* %ap1) ret double %1 diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll index d0a9ac6d2b56..f2e5eb9b7e03 100644 --- a/test/CodeGen/ARM/vldm-sched-a9.ll +++ b/test/CodeGen/ARM/vldm-sched-a9.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s +; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64" diff --git a/test/CodeGen/ARM/widen-vmovs.ll b/test/CodeGen/ARM/widen-vmovs.ll index 1efbc73650d8..316cfabab48c 100644 --- a/test/CodeGen/ARM/widen-vmovs.ll +++ b/test/CodeGen/ARM/widen-vmovs.ll @@ -17,7 +17,7 @@ target triple = "thumbv7-apple-ios" ; - Register liveness is verified. ; - The execution domain switch to vorr works across basic blocks. -define void @Mm() nounwind { +define void @Mm(i32 %in, float* %addr) nounwind { entry: br label %for.body4 @@ -27,10 +27,10 @@ for.body4: for.body.i: %tmp3.i = phi float [ 1.000000e+10, %for.body4 ], [ %add.i, %for.body.i ] %add.i = fadd float %tmp3.i, 1.000000e+10 - %exitcond.i = icmp eq i32 undef, 41 + %exitcond.i = icmp eq i32 %in, 41 br i1 %exitcond.i, label %rInnerproduct.exit, label %for.body.i rInnerproduct.exit: - store float %add.i, float* undef, align 4 + store float %add.i, float* %addr, align 4 br label %for.body4 } diff --git a/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S b/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S deleted file mode 100644 index 250732d6e842..000000000000 --- a/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llvm-mc -triple arm64-apple-darwin -filetype=obj -o /dev/null %s - - .text - .globl _foo - .cfi_startproc -_foo: - stp x29, x30, [sp, #-16]! 
- .cfi_adjust_cfa_offset 16 - - ldp x29, x30, [sp], #16 - .cfi_adjust_cfa_offset -16 - .cfi_restore x29 - .cfi_restore x30 - - ret - - .cfi_endproc diff --git a/test/CodeGen/ARM64/convert-v2f64-v2i32.ll b/test/CodeGen/ARM64/convert-v2f64-v2i32.ll deleted file mode 100644 index 1a07c986557d..000000000000 --- a/test/CodeGen/ARM64/convert-v2f64-v2i32.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -; CHECK: fptosi_1 -; CHECK: fcvtzs.2d -; CHECK: xtn.2s -; CHECK: ret -define void @fptosi_1() nounwind noinline ssp { -entry: - %0 = fptosi <2 x double> undef to <2 x i32> - store <2 x i32> %0, <2 x i32>* undef, align 8 - ret void -} - -; CHECK: fptoui_1 -; CHECK: fcvtzu.2d -; CHECK: xtn.2s -; CHECK: ret -define void @fptoui_1() nounwind noinline ssp { -entry: - %0 = fptoui <2 x double> undef to <2 x i32> - store <2 x i32> %0, <2 x i32>* undef, align 8 - ret void -} - diff --git a/test/CodeGen/ARM64/convert-v2i32-v2f64.ll b/test/CodeGen/ARM64/convert-v2i32-v2f64.ll deleted file mode 100644 index 63129a4b830f..000000000000 --- a/test/CodeGen/ARM64/convert-v2i32-v2f64.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -define <2 x double> @f1(<2 x i32> %v) nounwind readnone { -; CHECK-LABEL: f1: -; CHECK: sshll.2d v0, v0, #0 -; CHECK-NEXT: scvtf.2d v0, v0 -; CHECK-NEXT: ret - %conv = sitofp <2 x i32> %v to <2 x double> - ret <2 x double> %conv -} -define <2 x double> @f2(<2 x i32> %v) nounwind readnone { -; CHECK-LABEL: f2: -; CHECK: ushll.2d v0, v0, #0 -; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: ret - %conv = uitofp <2 x i32> %v to <2 x double> - ret <2 x double> %conv -} - -; CHECK: autogen_SD19655 -; CHECK: scvtf -; CHECK: ret -define void @autogen_SD19655() { - %T = load <2 x i64>* undef - %F = sitofp <2 x i64> undef to <2 x float> - store <2 x float> %F, <2 x float>* undef - ret void -} - diff --git a/test/CodeGen/ARM64/fminv.ll b/test/CodeGen/ARM64/fminv.ll deleted file mode 100644 index ca706d897ca6..000000000000 --- a/test/CodeGen/ARM64/fminv.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s - -define float @test_fminv_v2f32(<2 x float> %in) { -; CHECK: test_fminv_v2f32: -; CHECK: fminp s0, v0.2s - %min = call float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float> %in) - ret float %min -} - -define float @test_fminv_v4f32(<4 x float> %in) { -; CHECK: test_fminv_v4f32: -; CHECK: fminv s0, v0.4s - %min = call float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float> %in) - ret float %min -} - -define double @test_fminv_v2f64(<2 x double> %in) { -; CHECK: test_fminv_v2f64: -; CHECK: fminp d0, v0.2d - %min = call double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double> %in) - ret double %min -} - -declare float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float>) -declare float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float>) -declare double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double>) - -define float @test_fmaxv_v2f32(<2 x float> %in) { -; CHECK: test_fmaxv_v2f32: -; CHECK: fmaxp s0, v0.2s - %max = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %in) - ret float %max -} - -define float @test_fmaxv_v4f32(<4 x float> %in) { -; CHECK: test_fmaxv_v4f32: -; CHECK: fmaxv s0, v0.4s - %max = call float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float> %in) - ret float %max -} - -define double @test_fmaxv_v2f64(<2 x double> %in) { -; CHECK: test_fmaxv_v2f64: -; CHECK: fmaxp d0, v0.2d - %max = call double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double> %in) - ret double 
%max -} - -declare float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float>) -declare float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float>) -declare double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double>) - -define float @test_fminnmv_v2f32(<2 x float> %in) { -; CHECK: test_fminnmv_v2f32: -; CHECK: fminnmp s0, v0.2s - %minnm = call float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float> %in) - ret float %minnm -} - -define float @test_fminnmv_v4f32(<4 x float> %in) { -; CHECK: test_fminnmv_v4f32: -; CHECK: fminnmv s0, v0.4s - %minnm = call float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float> %in) - ret float %minnm -} - -define double @test_fminnmv_v2f64(<2 x double> %in) { -; CHECK: test_fminnmv_v2f64: -; CHECK: fminnmp d0, v0.2d - %minnm = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in) - ret double %minnm -} - -declare float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float>) -declare float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float>) -declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>) - -define float @test_fmaxnmv_v2f32(<2 x float> %in) { -; CHECK: test_fmaxnmv_v2f32: -; CHECK: fmaxnmp s0, v0.2s - %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float> %in) - ret float %maxnm -} - -define float @test_fmaxnmv_v4f32(<4 x float> %in) { -; CHECK: test_fmaxnmv_v4f32: -; CHECK: fmaxnmv s0, v0.4s - %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float> %in) - ret float %maxnm -} - -define double @test_fmaxnmv_v2f64(<2 x double> %in) { -; CHECK: test_fmaxnmv_v2f64: -; CHECK: fmaxnmp d0, v0.2d - %maxnm = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in) - ret double %maxnm -} - -declare float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float>) -declare float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float>) -declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>) diff --git a/test/CodeGen/ARM64/lit.local.cfg b/test/CodeGen/ARM64/lit.local.cfg deleted file mode 100644 index 3468e27f07f5..000000000000 --- a/test/CodeGen/ARM64/lit.local.cfg +++ /dev/null @@ -1,11 +0,0 @@ -import re - -config.suffixes = ['.ll'] - -targets = set(config.root.targets_to_build.split()) -if not 'ARM64' in targets: - config.unsupported = True - -# For now we don't test arm64-win32. 
-if re.search(r'cygwin|mingw32|win32', config.target_triple): - config.unsupported = True diff --git a/test/CodeGen/ARM64/register-offset-addressing.ll b/test/CodeGen/ARM64/register-offset-addressing.ll deleted file mode 100644 index c27360257cdc..000000000000 --- a/test/CodeGen/ARM64/register-offset-addressing.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s - -define i8 @t1(i16* %a, i64 %b) { -; CHECK: t1 -; CHECK: lsl [[REG:x[0-9]+]], x1, #1 -; CHECK: ldrb w0, [x0, [[REG]]] -; CHECK: ret - %tmp1 = getelementptr inbounds i16* %a, i64 %b - %tmp2 = load i16* %tmp1 - %tmp3 = trunc i16 %tmp2 to i8 - ret i8 %tmp3 -} diff --git a/test/CodeGen/ARM64/simd-scalar-to-vector.ll b/test/CodeGen/ARM64/simd-scalar-to-vector.ll deleted file mode 100644 index 10e8d6c0ea41..000000000000 --- a/test/CodeGen/ARM64/simd-scalar-to-vector.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -mcpu=cyclone | FileCheck %s -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST - -define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp { -; CHECK: uaddlv.16b h0, v0 -; CHECK: rshrn.8b v0, v0, #4 -; CHECK: dup.16b v0, v0[0] -; CHECK: ret - -; CHECK-FAST: uaddlv.16b -; CHECK-FAST: rshrn.8b -; CHECK-FAST: dup.16b - %tmp = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind - %tmp1 = trunc i32 %tmp to i16 - %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0 - %tmp3 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4) - %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %tmp4 -} - -declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone -declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM64/tbl.ll b/test/CodeGen/ARM64/tbl.ll deleted file mode 100644 index e1edd21d8a24..000000000000 --- a/test/CodeGen/ARM64/tbl.ll +++ /dev/null @@ -1,132 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind { -; CHECK: tbl1_8b -; CHECK: tbl.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind { -; CHECK: tbl1_16b -; CHECK: tbl.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B) - ret <16 x i8> %tmp3 -} - -define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) { -; CHECK: tbl2_8b -; CHECK: tbl.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -; CHECK: tbl2_16b -; CHECK: tbl.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) - ret <16 x i8> %tmp3 -} - -define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { -; CHECK: tbl3_8b -; CHECK: tbl.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { -; CHECK: tbl3_16b -; CHECK: tbl.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) - ret <16 x i8> %tmp3 -} - -define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x 
i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { -; CHECK: tbl4_8b -; CHECK: tbl.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { -; CHECK: tbl4_16b -; CHECK: tbl.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) - ret <16 x i8> %tmp3 -} - -declare <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone - -define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind { -; CHECK: tbx1_8b -; CHECK: tbx.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind { -; CHECK: tbx1_16b -; CHECK: tbx.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) - ret <16 x i8> %tmp3 -} - -define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { -; CHECK: tbx2_8b -; CHECK: tbx.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { -; CHECK: tbx2_16b -; CHECK: tbx.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) - ret <16 x i8> %tmp3 -} - -define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { -; CHECK: tbx3_8b -; CHECK: tbx.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { -; CHECK: tbx3_16b -; CHECK: tbx.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) - ret <16 x i8> %tmp3 -} - -define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) { -; CHECK: tbx4_8b -; CHECK: tbx.8b - %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) { -; CHECK: tbx4_16b -; CHECK: tbx.16b - %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) - ret <16 x i8> %tmp3 -} - -declare <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8>, <16 
x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone - diff --git a/test/CodeGen/ARM64/vcnt.ll b/test/CodeGen/ARM64/vcnt.ll deleted file mode 100644 index e00658a4bda2..000000000000 --- a/test/CodeGen/ARM64/vcnt.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind { -;CHECK-LABEL: cls_8b: -;CHECK: cls.8b - %tmp1 = load <8 x i8>* %A - %tmp3 = call <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8> %tmp1) - ret <8 x i8> %tmp3 -} - -define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind { -;CHECK-LABEL: cls_16b: -;CHECK: cls.16b - %tmp1 = load <16 x i8>* %A - %tmp3 = call <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8> %tmp1) - ret <16 x i8> %tmp3 -} - -define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind { -;CHECK-LABEL: cls_4h: -;CHECK: cls.4h - %tmp1 = load <4 x i16>* %A - %tmp3 = call <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16> %tmp1) - ret <4 x i16> %tmp3 -} - -define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind { -;CHECK-LABEL: cls_8h: -;CHECK: cls.8h - %tmp1 = load <8 x i16>* %A - %tmp3 = call <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16> %tmp1) - ret <8 x i16> %tmp3 -} - -define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind { -;CHECK-LABEL: cls_2s: -;CHECK: cls.2s - %tmp1 = load <2 x i32>* %A - %tmp3 = call <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp3 -} - -define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind { -;CHECK-LABEL: cls_4s: -;CHECK: cls.4s - %tmp1 = load <4 x i32>* %A - %tmp3 = call <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp3 -} - -declare <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM64/vcvt_n.ll b/test/CodeGen/ARM64/vcvt_n.ll deleted file mode 100644 index 46de557b0709..000000000000 --- a/test/CodeGen/ARM64/vcvt_n.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp { -; CHECK-LABEL: cvtf32fxpu: -; CHECK: ucvtf.2s v0, v0, #9 -; CHECK: ret - %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9) - ret <2 x float> %vcvt_n1 -} - -define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp { -; CHECK-LABEL: cvtf32fxps: -; CHECK: scvtf.2s v0, v0, #12 -; CHECK: ret - %vcvt_n1 = tail call <2 x 
float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12) - ret <2 x float> %vcvt_n1 -} - -define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp { -; CHECK-LABEL: cvtqf32fxpu: -; CHECK: ucvtf.4s v0, v0, #18 -; CHECK: ret - %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18) - ret <4 x float> %vcvt_n1 -} - -define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp { -; CHECK-LABEL: cvtqf32fxps: -; CHECK: scvtf.4s v0, v0, #30 -; CHECK: ret - %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30) - ret <4 x float> %vcvt_n1 -} -define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp { - %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12) - ret <2 x double> %vcvt_n1 -} - -define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp { - %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9) - ret <2 x double> %vcvt_n1 -} - -declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone -declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone -declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone -declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone -declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone -declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/ARM64/vminmaxnm.ll b/test/CodeGen/ARM64/vminmaxnm.ll deleted file mode 100644 index 628640759a51..000000000000 --- a/test/CodeGen/ARM64/vminmaxnm.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp { -; CHECK: fmaxnm.2s v0, v0, v1 -; CHECK: ret - %vmaxnm2.i = tail call <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind - ret <2 x float> %vmaxnm2.i -} - -define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp { -; CHECK: fmaxnm.4s v0, v0, v1 -; CHECK: ret - %vmaxnm2.i = tail call <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind - ret <4 x float> %vmaxnm2.i -} - -define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp { -; CHECK: fmaxnm.2d v0, v0, v1 -; CHECK: ret - %vmaxnm2.i = tail call <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind - ret <2 x double> %vmaxnm2.i -} - -define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp { -; CHECK: fminnm.2s v0, v0, v1 -; CHECK: ret - %vminnm2.i = tail call <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind - ret <2 x float> %vminnm2.i -} - -define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp { -; CHECK: fminnm.4s v0, v0, v1 -; CHECK: ret - %vminnm2.i = tail call <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind - ret <4 x float> %vminnm2.i -} - -define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp { -; CHECK: fminnm.2d v0, v0, v1 -; CHECK: ret - %vminnm2.i = tail call <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind - ret <2 x double> %vminnm2.i -} - -declare <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 
x double>, <2 x double>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone -declare <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone -declare <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone - - -define double @test_fmaxnmv(<2 x double> %in) { -; CHECK-LABEL: test_fmaxnmv: -; CHECK: fmaxnmp.2d d0, v0 - %max = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in) - ret double %max -} - -define double @test_fminnmv(<2 x double> %in) { -; CHECK-LABEL: test_fminnmv: -; CHECK: fminnmp.2d d0, v0 - %min = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in) - ret double %min -} - -declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>) -declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>) diff --git a/test/CodeGen/ARM64/vqsub.ll b/test/CodeGen/ARM64/vqsub.ll deleted file mode 100644 index 0afeb68348b6..000000000000 --- a/test/CodeGen/ARM64/vqsub.ll +++ /dev/null @@ -1,147 +0,0 @@ -; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s - -define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: sqsub8b: -;CHECK: sqsub.8b - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqsub4h: -;CHECK: sqsub.4h - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqsub2s: -;CHECK: sqsub.2s - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: uqsub8b: -;CHECK: uqsub.8b - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: uqsub4h: -;CHECK: uqsub.4h - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: uqsub2s: -;CHECK: uqsub.2s - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: sqsub16b: -;CHECK: sqsub.16b - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqsub8h: -;CHECK: sqsub.8h - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - 
ret <8 x i16> %tmp3 -} - -define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqsub4s: -;CHECK: sqsub.4s - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: sqsub2d: -;CHECK: sqsub.2d - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i64> %tmp3 -} - -define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: uqsub16b: -;CHECK: uqsub.16b - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: uqsub8h: -;CHECK: uqsub.8h - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: uqsub4s: -;CHECK: uqsub.4s - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: uqsub2d: -;CHECK: uqsub.2d - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone - -declare <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone - -declare <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/CPP/atomic.ll b/test/CodeGen/CPP/atomic.ll new file mode 100644 index 000000000000..e79c45d166a5 --- /dev/null +++ b/test/CodeGen/CPP/atomic.ll @@ -0,0 +1,89 @@ +; RUN: llc -march=cpp -o - %s | FileCheck %s + +define void @test_atomicrmw(i32* %addr, i32 %inc) { + %inst0 = atomicrmw xchg i32* %addr, i32 %inc seq_cst + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new 
AtomicRMWInst(AtomicRMWInst::Xchg, {{.*}}, SequentiallyConsistent, CrossThread + ; CHECK: [[INST]]->setName("inst0"); + ; CHECK: [[INST]]->setVolatile(false); + + %inst1 = atomicrmw add i32* %addr, i32 %inc seq_cst + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Add, {{.*}}, SequentiallyConsistent, CrossThread + ; CHECK: [[INST]]->setName("inst1"); + ; CHECK: [[INST]]->setVolatile(false); + + %inst2 = atomicrmw volatile sub i32* %addr, i32 %inc singlethread monotonic + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Sub, {{.*}}, Monotonic, SingleThread + ; CHECK: [[INST]]->setName("inst2"); + ; CHECK: [[INST]]->setVolatile(true); + + %inst3 = atomicrmw and i32* %addr, i32 %inc acq_rel + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::And, {{.*}}, AcquireRelease, CrossThread + ; CHECK: [[INST]]->setName("inst3"); + ; CHECK: [[INST]]->setVolatile(false); + + %inst4 = atomicrmw nand i32* %addr, i32 %inc release + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Nand, {{.*}}, Release, CrossThread + ; CHECK: [[INST]]->setName("inst4"); + ; CHECK: [[INST]]->setVolatile(false); + + %inst5 = atomicrmw volatile or i32* %addr, i32 %inc singlethread seq_cst + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Or, {{.*}}, SequentiallyConsistent, SingleThread + ; CHECK: [[INST]]->setName("inst5"); + ; CHECK: [[INST]]->setVolatile(true); + + %inst6 = atomicrmw xor i32* %addr, i32 %inc release + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Xor, {{.*}}, Release, CrossThread + ; CHECK: [[INST]]->setName("inst6"); + ; CHECK: [[INST]]->setVolatile(false); + + %inst7 = atomicrmw volatile max i32* %addr, i32 %inc singlethread monotonic + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Max, {{.*}}, Monotonic, SingleThread + ; CHECK: [[INST]]->setName("inst7"); + ; CHECK: [[INST]]->setVolatile(true); + + %inst8 = atomicrmw min i32* %addr, i32 %inc acquire + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Min, {{.*}}, Acquire, CrossThread + ; CHECK: [[INST]]->setName("inst8"); + ; CHECK: [[INST]]->setVolatile(false); + + %inst9 = atomicrmw volatile umax i32* %addr, i32 %inc monotonic + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::UMax, {{.*}}, Monotonic, CrossThread + ; CHECK: [[INST]]->setName("inst9"); + ; CHECK: [[INST]]->setVolatile(true); + + %inst10 = atomicrmw umin i32* %addr, i32 %inc singlethread release + ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::UMin, {{.*}}, Release, SingleThread + ; CHECK: [[INST]]->setName("inst10"); + ; CHECK: [[INST]]->setVolatile(false); + + + ret void +} + +define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) { + %inst0 = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic + ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, SequentiallyConsistent, Monotonic, CrossThread + ; CHECK: [[INST]]->setName("inst0"); + ; CHECK: [[INST]]->setVolatile(false); + ; CHECK: [[INST]]->setWeak(false); + + %inst1 = cmpxchg volatile i32* %addr, i32 %desired, i32 %new singlethread acq_rel acquire + ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, AcquireRelease, Acquire, SingleThread + ; CHECK: [[INST]]->setName("inst1"); + ; CHECK: [[INST]]->setVolatile(true); + ; CHECK: 
[[INST]]->setWeak(false); + + %inst2 = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic + ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, SequentiallyConsistent, Monotonic, CrossThread + ; CHECK: [[INST]]->setName("inst2"); + ; CHECK: [[INST]]->setVolatile(false); + ; CHECK: [[INST]]->setWeak(true); + + %inst3 = cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread acq_rel acquire + ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, AcquireRelease, Acquire, SingleThread + ; CHECK: [[INST]]->setName("inst3"); + ; CHECK: [[INST]]->setVolatile(true); + ; CHECK: [[INST]]->setWeak(true); + + ret void +} diff --git a/test/CodeGen/CPP/lit.local.cfg b/test/CodeGen/CPP/lit.local.cfg index 4063dd1b8612..3ff5c6b69737 100644 --- a/test/CodeGen/CPP/lit.local.cfg +++ b/test/CodeGen/CPP/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'CppBackend' in targets: +if not 'CppBackend' in config.root.targets: config.unsupported = True diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll index 802ee2cb0558..0e98280694c4 100644 --- a/test/CodeGen/Generic/MachineBranchProb.ll +++ b/test/CodeGen/Generic/MachineBranchProb.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s +; ARM & AArch64 run an extra SimplifyCFG which disrupts this test. +; XFAIL: arm,aarch64 + ; Make sure we have the correct weight attached to each successor. define i32 @test2(i32 %x) nounwind uwtable readnone ssp { ; CHECK: Machine code for function test2: diff --git a/test/CodeGen/Generic/select.ll b/test/CodeGen/Generic/select.ll index 77636eb6e615..c4841b79acb6 100644 --- a/test/CodeGen/Generic/select.ll +++ b/test/CodeGen/Generic/select.ll @@ -192,4 +192,3 @@ define <1 x i32> @checkScalariseVSELECT(<1 x i32> %a, <1 x i32> %b) { %s = select <1 x i1> %cond, <1 x i32> %a, <1 x i32> %b ret <1 x i32> %s } - diff --git a/test/CodeGen/Generic/stop-after.ll b/test/CodeGen/Generic/stop-after.ll index 557e097840af..5e0e350bc7fe 100644 --- a/test/CodeGen/Generic/stop-after.ll +++ b/test/CodeGen/Generic/stop-after.ll @@ -5,6 +5,6 @@ ; STOP: Loop Strength Reduction ; STOP-NEXT: Machine Function Analysis -; START: -machine-branch-prob -gc-lowering +; START: -machine-branch-prob -jump-instr-tables -gc-lowering ; START: FunctionPass Manager ; START-NEXT: Lower Garbage Collection Instructions diff --git a/test/CodeGen/Hexagon/lit.local.cfg b/test/CodeGen/Hexagon/lit.local.cfg index e96bab818a3c..ba72ff632d4e 100644 --- a/test/CodeGen/Hexagon/lit.local.cfg +++ b/test/CodeGen/Hexagon/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'Hexagon' in targets: +if not 'Hexagon' in config.root.targets: config.unsupported = True diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll index b6ba22e47cc5..2559e23ae1f5 100644 --- a/test/CodeGen/MSP430/fp.ll +++ b/test/CodeGen/MSP430/fp.ll @@ -21,7 +21,7 @@ entry: ; does not happen anymore. Note that the only reason an ISR is used here is that ; the register allocator selects r4 first instead of fifth in a normal function. 
define msp430_intrcc void @fpb_alloced() #0 { -; CHECK_LABEL: fpb_alloced: +; CHECK-LABEL: fpb_alloced: ; CHECK-NOT: mov.b #0, r4 ; CHECK: nop call void asm sideeffect "nop", "r"(i8 0) diff --git a/test/CodeGen/MSP430/lit.local.cfg b/test/CodeGen/MSP430/lit.local.cfg index a18fe6f927d8..b1cf1fbd21d7 100644 --- a/test/CodeGen/MSP430/lit.local.cfg +++ b/test/CodeGen/MSP430/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'MSP430' in targets: +if not 'MSP430' in config.root.targets: config.unsupported = True diff --git a/test/CodeGen/Mips/2008-08-01-AsmInline.ll b/test/CodeGen/Mips/2008-08-01-AsmInline.ll index e274bc0e14f0..3c1bb39b4340 100644 --- a/test/CodeGen/Mips/2008-08-01-AsmInline.ll +++ b/test/CodeGen/Mips/2008-08-01-AsmInline.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mips < %s | FileCheck %s +; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s ; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s %struct.DWstruct = type { i32, i32 } diff --git a/test/CodeGen/Mips/2013-11-18-fp64-const0.ll b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll index f8390d9a1ca7..6a210a0c76ce 100644 --- a/test/CodeGen/Mips/2013-11-18-fp64-const0.ll +++ b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=mips -mattr=-fp64 < %s | FileCheck -check-prefix=CHECK-FP32 %s -; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck -check-prefix=CHECK-FP64 %s +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck -check-prefix=CHECK-FP64 %s ; This test case is a simplified version of an llvm-stress generated test with ; seed=3718491962. diff --git a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll new file mode 100644 index 000000000000..f113a0eb1d54 --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll @@ -0,0 +1,83 @@ +; ModuleID = 'loadstore2.c' +target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64" +target triple = "mips--linux-gnu" + +@c2 = common global i8 0, align 1 +@c1 = common global i8 0, align 1 +; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \ +; RUN: < %s | FileCheck %s + +@s2 = common global i16 0, align 2 +@s1 = common global i16 0, align 2 +@i2 = common global i32 0, align 4 +@i1 = common global i32 0, align 4 +@f2 = common global float 0.000000e+00, align 4 +@f1 = common global float 0.000000e+00, align 4 +@d2 = common global double 0.000000e+00, align 8 +@d1 = common global double 0.000000e+00, align 8 + +; Function Attrs: nounwind +define void @cfoo() #0 { +entry: + %0 = load i8* @c2, align 1 + store i8 %0, i8* @c1, align 1 +; CHECK-LABEL: cfoo: +; CHECK: lbu $[[REGc:[0-9]+]], 0(${{[0-9]+}}) +; CHECK: sb $[[REGc]], 0(${{[0-9]+}}) + + + ret void +} + +; Function Attrs: nounwind +define void @sfoo() #0 { +entry: + %0 = load i16* @s2, align 2 + store i16 %0, i16* @s1, align 2 +; CHECK-LABEL: sfoo: +; CHECK: lhu $[[REGs:[0-9]+]], 0(${{[0-9]+}}) +; CHECK: sh $[[REGs]], 0(${{[0-9]+}}) + + ret void +} + +; Function Attrs: nounwind +define void @ifoo() #0 { +entry: + %0 = load i32* @i2, align 4 + store i32 %0, i32* @i1, align 4 +; CHECK-LABEL: ifoo: +; CHECK: lw $[[REGi:[0-9]+]], 0(${{[0-9]+}}) +; CHECK: sw $[[REGi]], 0(${{[0-9]+}}) + + ret void +} + +; Function Attrs: nounwind +define void @ffoo() #0 { +entry: + %0 = load float* @f2, align 4 + store float %0, float* @f1, align 4 +; CHECK-LABEL: ffoo: +; CHECK: lwc1 $f[[REGf:[0-9]+]], 0(${{[0-9]+}}) +; CHECK: swc1 $f[[REGf]], 0(${{[0-9]+}}) + + + ret 
void +} + +; Function Attrs: nounwind +define void @dfoo() #0 { +entry: + %0 = load double* @d2, align 8 + store double %0, double* @d1, align 8 +; CHECK-LABEL: dfoo: +; CHECK: ldc1 $f[[REGd:[0-9]+]], 0(${{[0-9]+}}) +; CHECK: sdc1 $f[[REGd]], 0(${{[0-9]+}}) +; CHECK: .end dfoo + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + + diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll new file mode 100644 index 000000000000..6759c01c774b --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \ +; RUN: < %s | FileCheck %s + +@f = common global float 0.000000e+00, align 4 +@de = common global double 0.000000e+00, align 8 + +; Function Attrs: nounwind +define void @f1() #0 { +entry: + store float 0x3FFA76C8C0000000, float* @f, align 4 + ret void +; CHECK: .ent f1 +; CHECK: lui $[[REG1:[0-9]+]], 16339 +; CHECK: ori $[[REG2:[0-9]+]], $[[REG1]], 46662 +; CHECK: mtc1 $[[REG2]], $f[[REG3:[0-9]+]] +; CHECK: lw $[[REG4:[0-9]+]], %got(f)(${{[0-9]+}}) +; CHECK: swc1 $f[[REG3]], 0($[[REG4]]) +; CHECK: .end f1 + +} + +; Function Attrs: nounwind +define void @d1() #0 { +entry: + store double 1.234567e+00, double* @de, align 8 +; CHECK: .ent d1 +; CHECK: lui $[[REG1a:[0-9]+]], 16371 +; CHECK: ori $[[REG2a:[0-9]+]], $[[REG1a]], 49353 +; CHECK: lui $[[REG1b:[0-9]+]], 21403 +; CHECK: ori $[[REG2b:[0-9]+]], $[[REG1b]], 34951 +; CHECK: mtc1 $[[REG2b]], $f[[REG3:[0-9]+]] +; CHECK: mthc1 $[[REG2a]], $f[[REG3]] +; CHECK: sdc1 $f[[REG3]], 0(${{[0-9]+}}) +; CHECK: .end d1 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll index 8ec5d9313994..d9ad0f8ad86a 100644 --- a/test/CodeGen/Mips/analyzebranch.ll +++ b/test/CodeGen/Mips/analyzebranch.ll @@ -1,9 +1,25 @@ -; RUN: llc -march=mips < %s | FileCheck %s +; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC +; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC +; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=32-GPR +; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC +; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC +; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC +; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=64-GPR define double @foo(double %a, double %b) nounwind readnone { entry: -; CHECK: bc1f $BB -; CHECK: nop +; ALL-LABEL: foo: + +; FCC: bc1f $BB +; FCC: nop + +; 32-GPR: mtc1 $zero, $[[Z:f[0-9]]] +; 32-GPR: mthc1 $zero, $[[Z:f[0-9]]] +; 64-GPR: dmtc1 $zero, $[[Z:f[0-9]]] +; GPR: cmp.olt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]] +; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] +; GPR: bnez $[[GPRCC]], $BB 
%cmp = fcmp ogt double %a, 0.000000e+00 br i1 %cmp, label %if.end6, label %if.else @@ -25,8 +41,17 @@ return: ; preds = %if.else, %if.end6 define void @f1(float %f) nounwind { entry: -; CHECK: bc1f $BB -; CHECK: nop +; ALL-LABEL: f1: + +; FCC: bc1f $BB +; FCC: nop + +; GPR: mtc1 $zero, $[[Z:f[0-9]]] +; GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $[[Z]] +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]] +; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] +; GPR: beqz $[[GPRCC]], $BB + %cmp = fcmp une float %f, 0.000000e+00 br i1 %cmp, label %if.then, label %if.end diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll index 77d7bf31545f..066d42cc302d 100644 --- a/test/CodeGen/Mips/atomic.ll +++ b/test/CodeGen/Mips/atomic.ll @@ -1,5 +1,14 @@ -; RUN: llc -march=mipsel --disable-machine-licm < %s | FileCheck %s -check-prefix=CHECK-EL -; RUN: llc -march=mips --disable-machine-licm < %s | FileCheck %s -check-prefix=CHECK-EB +; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL +; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL +; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL +; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL +; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL +; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL +; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL + +; Keep one big-endian check so that we don't reduce testing, but don't add more +; since endianness doesn't affect the body of the atomic operations. 
+; RUN: llc -march=mips --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=CHECK-EB @x = common global i32 0, align 4 @@ -8,21 +17,16 @@ entry: %0 = atomicrmw add i32* @x, i32 %incr monotonic ret i32 %0 -; CHECK-EL-LABEL: AtomicLoadAdd32: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R1:[0-9]+]], 0($[[R0]]) -; CHECK-EL: addu $[[R2:[0-9]+]], $[[R1]], $4 -; CHECK-EL: sc $[[R2]], 0($[[R0]]) -; CHECK-EL: beqz $[[R2]], $[[BB0]] - -; CHECK-EB-LABEL: AtomicLoadAdd32: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R1:[0-9]+]], 0($[[R0]]) -; CHECK-EB: addu $[[R2:[0-9]+]], $[[R1]], $4 -; CHECK-EB: sc $[[R2]], 0($[[R0]]) -; CHECK-EB: beqz $[[R2]], $[[BB0]] +; ALL-LABEL: AtomicLoadAdd32: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R1:[0-9]+]], 0($[[R0]]) +; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4 +; ALL: sc $[[R2]], 0($[[R0]]) +; ALL: beqz $[[R2]], $[[BB0]] } define i32 @AtomicLoadNand32(i32 %incr) nounwind { @@ -30,23 +34,17 @@ entry: %0 = atomicrmw nand i32* @x, i32 %incr monotonic ret i32 %0 -; CHECK-EL-LABEL: AtomicLoadNand32: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R1:[0-9]+]], 0($[[R0]]) -; CHECK-EL: and $[[R3:[0-9]+]], $[[R1]], $4 -; CHECK-EL: nor $[[R2:[0-9]+]], $zero, $[[R3]] -; CHECK-EL: sc $[[R2]], 0($[[R0]]) -; CHECK-EL: beqz $[[R2]], $[[BB0]] - -; CHECK-EB-LABEL: AtomicLoadNand32: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R1:[0-9]+]], 0($[[R0]]) -; CHECK-EB: and $[[R3:[0-9]+]], $[[R1]], $4 -; CHECK-EB: nor $[[R2:[0-9]+]], $zero, $[[R3]] -; CHECK-EB: sc $[[R2]], 0($[[R0]]) -; CHECK-EB: beqz $[[R2]], $[[BB0]] +; ALL-LABEL: AtomicLoadNand32: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R1:[0-9]+]], 0($[[R0]]) +; ALL: and $[[R3:[0-9]+]], $[[R1]], $4 +; ALL: nor $[[R2:[0-9]+]], $zero, $[[R3]] +; ALL: sc $[[R2]], 0($[[R0]]) +; ALL: beqz $[[R2]], $[[BB0]] } define i32 @AtomicSwap32(i32 %newval) nounwind { @@ -57,19 +55,15 @@ entry: %0 = atomicrmw xchg i32* @x, i32 %tmp monotonic ret i32 %0 -; CHECK-EL-LABEL: AtomicSwap32: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll ${{[0-9]+}}, 0($[[R0]]) -; CHECK-EL: sc $[[R2:[0-9]+]], 0($[[R0]]) -; CHECK-EL: beqz $[[R2]], $[[BB0]] - -; CHECK-EB-LABEL: AtomicSwap32: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll ${{[0-9]+}}, 0($[[R0]]) -; CHECK-EB: sc $[[R2:[0-9]+]], 0($[[R0]]) -; CHECK-EB: beqz $[[R2]], $[[BB0]] +; ALL-LABEL: AtomicSwap32: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x) + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll ${{[0-9]+}}, 0($[[R0]]) +; ALL: sc $[[R2:[0-9]+]], 0($[[R0]]) +; ALL: beqz $[[R2]], $[[BB0]] } define i32 @AtomicCmpSwap32(i32 %oldval, i32 %newval) nounwind { @@ -78,25 +72,20 @@ entry: store i32 %newval, i32* %newval.addr, align 4 %tmp = load i32* %newval.addr, align 4 %0 = cmpxchg i32* @x, i32 %oldval, i32 %tmp monotonic monotonic - ret i32 %0 + %1 = extractvalue { i32, i1 } %0, 0 + ret i32 %1 + +; ALL-LABEL: AtomicCmpSwap32: -; CHECK-EL-LABEL: AtomicCmpSwap32: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $2, 0($[[R0]]) -; 
CHECK-EL: bne $2, $4, $[[BB1:[A-Z_0-9]+]] -; CHECK-EL: sc $[[R2:[0-9]+]], 0($[[R0]]) -; CHECK-EL: beqz $[[R2]], $[[BB0]] -; CHECK-EL: $[[BB1]]: - -; CHECK-EB-LABEL: AtomicCmpSwap32: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(x) -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $2, 0($[[R0]]) -; CHECK-EB: bne $2, $4, $[[BB1:[A-Z_0-9]+]] -; CHECK-EB: sc $[[R2:[0-9]+]], 0($[[R0]]) -; CHECK-EB: beqz $[[R2]], $[[BB0]] -; CHECK-EB: $[[BB1]]: +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $2, 0($[[R0]]) +; ALL: bne $2, $4, $[[BB1:[A-Z_0-9]+]] +; ALL: sc $[[R2:[0-9]+]], 0($[[R0]]) +; ALL: beqz $[[R2]], $[[BB0]] +; ALL: $[[BB1]]: } @@ -108,56 +97,38 @@ entry: %0 = atomicrmw add i8* @y, i8 %incr monotonic ret i8 %0 -; CHECK-EL-LABEL: AtomicLoadAdd8: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255 -; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]] -; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]] -; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]] - -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]] -; CHECK-EL: and $[[R12:[0-9]+]], $[[R11]], $[[R6]] -; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] -; CHECK-EL: sc $[[R14]], 0($[[R2]]) -; CHECK-EL: beqz $[[R14]], $[[BB0]] - -; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]] -; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]] -; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EL: sra $2, $[[R17]], 24 - -; CHECK-EB-LABEL: AtomicLoadAdd8: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 -; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255 -; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] -; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]] -; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]] - -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EB: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]] -; CHECK-EB: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] -; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] -; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] -; CHECK-EB: sc $[[R14]], 0($[[R2]]) -; CHECK-EB: beqz $[[R14]], $[[BB0]] - -; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] -; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EB: sra $2, $[[R17]], 24 +; ALL-LABEL: AtomicLoadAdd8: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)( + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3 +; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 +; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 +; ALL: ori $[[R6:[0-9]+]], $zero, 255 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) +; ALL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]] +; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] 
+; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] +; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] +; ALL: sc $[[R14]], 0($[[R2]]) +; ALL: beqz $[[R14]], $[[BB0]] + +; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] +; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] + +; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24 +; NO-SEB-SEH: sra $2, $[[R17]], 24 + +; HAS-SEB-SEH: seb $2, $[[R16]] } define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind { @@ -165,56 +136,38 @@ entry: %0 = atomicrmw sub i8* @y, i8 %incr monotonic ret i8 %0 -; CHECK-EL-LABEL: AtomicLoadSub8: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255 -; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]] -; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]] -; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]] - -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EL: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]] -; CHECK-EL: and $[[R12:[0-9]+]], $[[R11]], $[[R6]] -; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] -; CHECK-EL: sc $[[R14]], 0($[[R2]]) -; CHECK-EL: beqz $[[R14]], $[[BB0]] - -; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]] -; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]] -; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EL: sra $2, $[[R17]], 24 - -; CHECK-EB-LABEL: AtomicLoadSub8: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3 +; ALL-LABEL: AtomicLoadSub8: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)( + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 ; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 ; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 -; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255 -; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] -; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]] -; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]] - -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EB: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]] -; CHECK-EB: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] -; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] -; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] -; CHECK-EB: sc $[[R14]], 0($[[R2]]) -; CHECK-EB: beqz $[[R14]], $[[BB0]] - -; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] -; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EB: sra $2, $[[R17]], 24 +; ALL: ori $[[R6:[0-9]+]], $zero, 255 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) +; ALL: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]] +; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] +; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] +; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] +; ALL: sc $[[R14]], 0($[[R2]]) +; ALL: beqz $[[R14]], $[[BB0]] + +; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] +; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] + +; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24 +; NO-SEB-SEH: sra $2, $[[R17]], 24 + +; 
HAS-SEB-SEH:seb $2, $[[R16]] } define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind { @@ -222,58 +175,39 @@ entry: %0 = atomicrmw nand i8* @y, i8 %incr monotonic ret i8 %0 -; CHECK-EL-LABEL: AtomicLoadNand8: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255 -; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]] -; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]] -; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]] - -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EL: and $[[R18:[0-9]+]], $[[R10]], $[[R9]] -; CHECK-EL: nor $[[R11:[0-9]+]], $zero, $[[R18]] -; CHECK-EL: and $[[R12:[0-9]+]], $[[R11]], $[[R6]] -; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] -; CHECK-EL: sc $[[R14]], 0($[[R2]]) -; CHECK-EL: beqz $[[R14]], $[[BB0]] - -; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]] -; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]] -; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EL: sra $2, $[[R17]], 24 - -; CHECK-EB-LABEL: AtomicLoadNand8: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 -; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255 -; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] -; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]] -; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]] - -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EB: and $[[R18:[0-9]+]], $[[R10]], $[[R9]] -; CHECK-EB: nor $[[R11:[0-9]+]], $zero, $[[R18]] -; CHECK-EB: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] -; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] -; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] -; CHECK-EB: sc $[[R14]], 0($[[R2]]) -; CHECK-EB: beqz $[[R14]], $[[BB0]] - -; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] -; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EB: sra $2, $[[R17]], 24 +; ALL-LABEL: AtomicLoadNand8: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)( + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 +; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 +; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 +; ALL: ori $[[R6:[0-9]+]], $zero, 255 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) +; ALL: and $[[R18:[0-9]+]], $[[R10]], $[[R9]] +; ALL: nor $[[R11:[0-9]+]], $zero, $[[R18]] +; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] +; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] +; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] +; ALL: sc $[[R14]], 0($[[R2]]) +; ALL: beqz $[[R14]], $[[BB0]] + +; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] +; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] + +; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24 +; NO-SEB-SEH: sra $2, $[[R17]], 24 + +; HAS-SEB-SEH: seb $2, $[[R16]] } define signext i8 @AtomicSwap8(i8 signext %newval) nounwind { @@ -281,121 +215,126 @@ 
entry: %0 = atomicrmw xchg i8* @y, i8 %newval monotonic ret i8 %0 -; CHECK-EL-LABEL: AtomicSwap8: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255 -; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]] -; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]] -; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]] - -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EL: and $[[R18:[0-9]+]], $[[R9]], $[[R6]] -; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R18]] -; CHECK-EL: sc $[[R14]], 0($[[R2]]) -; CHECK-EL: beqz $[[R14]], $[[BB0]] - -; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]] -; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]] -; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EL: sra $2, $[[R17]], 24 - -; CHECK-EB-LABEL: AtomicSwap8: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 -; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255 -; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] -; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]] -; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]] - -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]]) -; CHECK-EB: and $[[R18:[0-9]+]], $[[R9]], $[[R7]] -; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] -; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R18]] -; CHECK-EB: sc $[[R14]], 0($[[R2]]) -; CHECK-EB: beqz $[[R14]], $[[BB0]] - -; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] -; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] -; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EB: sra $2, $[[R17]], 24 +; ALL-LABEL: AtomicSwap8: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)( + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 +; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 +; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 +; ALL: ori $[[R6:[0-9]+]], $zero, 255 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) +; ALL: and $[[R18:[0-9]+]], $[[R9]], $[[R7]] +; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] +; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R18]] +; ALL: sc $[[R14]], 0($[[R2]]) +; ALL: beqz $[[R14]], $[[BB0]] + +; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] +; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] + +; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24 +; NO-SEB-SEH: sra $2, $[[R17]], 24 + +; HAS-SEB-SEH: seb $2, $[[R16]] } define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind { entry: - %0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic monotonic + %pair0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic monotonic + %0 = extractvalue { i8, i1 } %pair0, 0 ret i8 %0 -; CHECK-EL-LABEL: AtomicCmpSwap8: -; CHECK-EL: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EL: sll 
$[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255 -; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]] -; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]] -; CHECK-EL: andi $[[R8:[0-9]+]], $4, 255 -; CHECK-EL: sllv $[[R9:[0-9]+]], $[[R8]], $[[R4]] -; CHECK-EL: andi $[[R10:[0-9]+]], $5, 255 -; CHECK-EL: sllv $[[R11:[0-9]+]], $[[R10]], $[[R4]] - -; CHECK-EL: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EL: ll $[[R12:[0-9]+]], 0($[[R2]]) -; CHECK-EL: and $[[R13:[0-9]+]], $[[R12]], $[[R6]] -; CHECK-EL: bne $[[R13]], $[[R9]], $[[BB1:[A-Z_0-9]+]] - -; CHECK-EL: and $[[R14:[0-9]+]], $[[R12]], $[[R7]] -; CHECK-EL: or $[[R15:[0-9]+]], $[[R14]], $[[R11]] -; CHECK-EL: sc $[[R15]], 0($[[R2]]) -; CHECK-EL: beqz $[[R15]], $[[BB0]] - -; CHECK-EL: $[[BB1]]: -; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R13]], $[[R4]] -; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24 -; CHECK-EL: sra $2, $[[R17]], 24 - -; CHECK-EB-LABEL: AtomicCmpSwap8: -; CHECK-EB: lw $[[R0:[0-9]+]], %got(y) -; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4 -; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] -; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3 -; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 -; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 -; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255 -; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] -; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]] -; CHECK-EB: andi $[[R9:[0-9]+]], $4, 255 -; CHECK-EB: sllv $[[R10:[0-9]+]], $[[R9]], $[[R5]] -; CHECK-EB: andi $[[R11:[0-9]+]], $5, 255 -; CHECK-EB: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] - -; CHECK-EB: $[[BB0:[A-Z_0-9]+]]: -; CHECK-EB: ll $[[R13:[0-9]+]], 0($[[R2]]) -; CHECK-EB: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] -; CHECK-EB: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] - -; CHECK-EB: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] -; CHECK-EB: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] -; CHECK-EB: sc $[[R16]], 0($[[R2]]) -; CHECK-EB: beqz $[[R16]], $[[BB0]] - -; CHECK-EB: $[[BB1]]: -; CHECK-EB: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] -; CHECK-EB: sll $[[R18:[0-9]+]], $[[R17]], 24 -; CHECK-EB: sra $2, $[[R18]], 24 +; ALL-LABEL: AtomicCmpSwap8: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)( + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 +; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3 +; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 +; ALL: ori $[[R6:[0-9]+]], $zero, 255 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: andi $[[R9:[0-9]+]], $4, 255 +; ALL: sllv $[[R10:[0-9]+]], $[[R9]], $[[R5]] +; ALL: andi $[[R11:[0-9]+]], $5, 255 +; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R13:[0-9]+]], 0($[[R2]]) +; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]] +; ALL: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]] + +; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]] +; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]] +; ALL: sc $[[R16]], 0($[[R2]]) +; ALL: beqz $[[R16]], $[[BB0]] + +; ALL: $[[BB1]]: +; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]] + +; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24 +; NO-SEB-SEH: sra $2, $[[R18]], 24 + +; HAS-SEB-SEH: seb $2, $[[R17]] +} + +; Check one i16 so that we cover the seh sign extend +@z = common global i16 0, align 1 + +define signext i16 @AtomicLoadAdd16(i16 signext %incr) nounwind { +entry: + %0 = atomicrmw add i16* @z, i16 %incr monotonic + ret i16 %0 + +; ALL-LABEL: AtomicLoadAdd16: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], 
%got(z) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(z)( + +; ALL: addiu $[[R1:[0-9]+]], $zero, -4 +; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]] +; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3 +; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 2 +; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3 +; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3 +; ALL: ori $[[R6:[0-9]+]], $zero, 65535 +; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]] +; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]] +; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]] + +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R10:[0-9]+]], 0($[[R2]]) +; ALL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]] +; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]] +; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]] +; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]] +; ALL: sc $[[R14]], 0($[[R2]]) +; ALL: beqz $[[R14]], $[[BB0]] + +; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]] +; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]] + +; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 16 +; NO-SEB-SEH: sra $2, $[[R17]], 16 + +; MIPS32R2: seh $2, $[[R16]] } + @countsint = common global i32 0, align 4 define i32 @CheckSync(i32 %v) nounwind noinline { @@ -403,19 +342,13 @@ entry: %0 = atomicrmw add i32* @countsint, i32 %v seq_cst ret i32 %0 -; CHECK-EL-LABEL: CheckSync: -; CHECK-EL: sync 0 -; CHECK-EL: ll -; CHECK-EL: sc -; CHECK-EL: beq -; CHECK-EL: sync 0 - -; CHECK-EB-LABEL: CheckSync: -; CHECK-EB: sync 0 -; CHECK-EB: ll -; CHECK-EB: sc -; CHECK-EB: beq -; CHECK-EB: sync 0 +; ALL-LABEL: CheckSync: + +; ALL: sync +; ALL: ll +; ALL: sc +; ALL: beq +; ALL: sync } ; make sure that this assertion in @@ -429,8 +362,29 @@ entry: define i32 @zeroreg() nounwind { entry: - %0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst seq_cst + %pair0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst seq_cst + %0 = extractvalue { i32, i1 } %pair0, 0 %1 = icmp eq i32 %0, 1 %conv = zext i1 %1 to i32 ret i32 %conv } + +; Check that MIPS32R6 has the correct offset range. +; FIXME: At the moment, we don't seem to do addr+offset for any atomic load/store. 
+define i32 @AtomicLoadAdd32_OffGt9Bit(i32 %incr) nounwind { +entry: + %0 = atomicrmw add i32* getelementptr(i32* @x, i32 256), i32 %incr monotonic + ret i32 %0 + +; ALL-LABEL: AtomicLoadAdd32_OffGt9Bit: + +; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x) +; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)( + +; ALL: addiu $[[PTR:[0-9]+]], $[[R0]], 1024 +; ALL: $[[BB0:[A-Z_0-9]+]]: +; ALL: ll $[[R1:[0-9]+]], 0($[[PTR]]) +; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4 +; ALL: sc $[[R2]], 0($[[PTR]]) +; ALL: beqz $[[R2]], $[[BB0]] +} diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll index dc07c637418c..c26415233d0b 100644 --- a/test/CodeGen/Mips/atomicops.ll +++ b/test/CodeGen/Mips/atomicops.ll @@ -20,7 +20,8 @@ entry: %add.i = add nsw i32 %0, 2 %1 = load volatile i32* %x, align 4 %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind - %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst seq_cst + %pair = cmpxchg i32* %x, i32 1, i32 2 seq_cst seq_cst + %2 = extractvalue { i32, i1 } %pair, 0 %3 = load volatile i32* %x, align 4 %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind %4 = atomicrmw xchg i32* %x, i32 1 seq_cst diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll index b9bf2b60a66c..88d1d07e29ad 100644 --- a/test/CodeGen/Mips/buildpairextractelementf64.ll +++ b/test/CodeGen/Mips/buildpairextractelementf64.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK ; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK -; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK -; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK +; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK @a = external global i32 diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll index b9732eb66350..61f398314de1 100644 --- a/test/CodeGen/Mips/cmov.ll +++ b/test/CodeGen/Mips/cmov.ll @@ -1,17 +1,43 @@ -; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=O32 -; RUN: llc -march=mips -regalloc=basic < %s | FileCheck %s -check-prefix=O32 -; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=N64 -; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=N64 +; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV +; RUN: llc -march=mips -mcpu=mips32 -regalloc=basic < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV +; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV +; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMP +; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV +; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV +; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMP @i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4 @i3 = common global i32* null, align 4 -; O32-DAG: lw $[[R0:[0-9]+]], %got(i3) -; O32-DAG: addiu $[[R1:[0-9]+]], 
${{[0-9]+}}, %got(i1) -; O32: movn $[[R0]], $[[R1]], ${{[0-9]+}} -; N64-DAG: ldr $[[R0:[0-9]+]] -; N64-DAG: ld $[[R1:[0-9]+]], %got_disp(i1) -; N64: movn $[[R0]], $[[R1]], ${{[0-9]+}} +; ALL-LABEL: cmov1: + +; 32-CMOV-DAG: lw $[[R0:[0-9]+]], %got(i3) +; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1) +; 32-CMOV-DAG: movn $[[R0]], $[[R1]], $4 +; 32-CMOV-DAG: lw $2, 0($[[R0]]) + +; 32-CMP-DAG: lw $[[R0:[0-9]+]], %got(i3) +; 32-CMP-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1) +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R1]], $4 +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R0]], $4 +; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32-CMP-DAG: lw $2, 0($[[T2]]) + +; 64-CMOV-DAG: ldr $[[R0:[0-9]+]] +; 64-CMOV-DAG: ld $[[R1:[0-9]+]], %got_disp(i1) +; 64-CMOV-DAG: movn $[[R0]], $[[R1]], $4 + +; 64-CMP-DAG: ld $[[R0:[0-9]+]], %got_disp(i3)( +; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(i1) +; FIXME: This sll works around an implementation detail in the code generator +; (setcc's result is i32 so bits 32-63 are undefined). It's not really +; needed. +; 64-CMP-DAG: sll $[[CC:[0-9]+]], $4, 0 +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R1]], $[[CC]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R0]], $[[CC]] +; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64-CMP-DAG: ld $2, 0($[[T2]]) + define i32* @cmov1(i32 %s) nounwind readonly { entry: %tobool = icmp ne i32 %s, 0 @@ -23,14 +49,35 @@ entry: @c = global i32 1, align 4 @d = global i32 0, align 4 -; O32-LABEL: cmov2: -; O32: addiu $[[R1:[0-9]+]], ${{[a-z0-9]+}}, %got(d) -; O32: addiu $[[R0:[0-9]+]], ${{[a-z0-9]+}}, %got(c) -; O32: movn $[[R1]], $[[R0]], ${{[0-9]+}} -; N64-LABEL: cmov2: -; N64: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d) -; N64: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c) -; N64: movn $[[R1]], $[[R0]], ${{[0-9]+}} +; ALL-LABEL: cmov2: + +; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(d) +; 32-CMOV-DAG: addiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got(c) +; 32-CMOV-DAG: movn $[[R1]], $[[R0]], $4 +; 32-CMOV-DAG: lw $2, 0($[[R0]]) + +; 32-CMP-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(d) +; 32-CMP-DAG: addiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got(c) +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R0]], $4 +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R1]], $4 +; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32-CMP-DAG: lw $2, 0($[[T2]]) + +; 64-CMOV: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d) +; 64-CMOV: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c) +; 64-CMOV: movn $[[R1]], $[[R0]], $4 + +; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d) +; 64-CMP-DAG: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c) +; FIXME: This sll works around an implementation detail in the code generator +; (setcc's result is i32 so bits 32-63 are undefined). It's not really +; needed. +; 64-CMP-DAG: sll $[[CC:[0-9]+]], $4, 0 +; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[R0]], $[[CC]] +; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[R1]], $[[CC]] +; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64-CMP-DAG: lw $2, 0($[[T2]]) + define i32 @cmov2(i32 %s) nounwind readonly { entry: %tobool = icmp ne i32 %s, 0 @@ -40,9 +87,28 @@ entry: ret i32 %cond } -; O32-LABEL: cmov3: -; O32: xori $[[R0:[0-9]+]], ${{[0-9]+}}, 234 -; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: cmov3: + +; We won't check the result register since we can't know if the move is first +; or last. We do know it will be either one of two registers so we can at least +; check that. 
+ +; 32-CMOV: xori $[[R0:[0-9]+]], $4, 234 +; 32-CMOV: movz ${{[26]}}, $5, $[[R0]] + +; 32-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[CC]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[CC]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234 +; 64-CMOV: movz ${{[26]}}, $5, $[[R0]] + +; 64-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[CC]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[CC]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] + define i32 @cmov3(i32 %a, i32 %b, i32 %c) nounwind readnone { entry: %cmp = icmp eq i32 %a, 234 @@ -50,9 +116,36 @@ entry: ret i32 %cond } -; N64-LABEL: cmov4: -; N64: xori $[[R0:[0-9]+]], ${{[0-9]+}}, 234 -; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: cmov4: + +; We won't check the result register since we can't know if the move is first +; or last. We do know it will be one of two registers so we can at least check +; that. + +; 32-CMOV-DAG: xori $[[R0:[0-9]+]], $4, 234 +; 32-CMOV-DAG: lw $[[R1:2]], 16($sp) +; 32-CMOV-DAG: lw $[[R2:3]], 20($sp) +; 32-CMOV-DAG: movz $[[R1]], $6, $[[R0]] +; 32-CMOV-DAG: movz $[[R2]], $7, $[[R0]] + +; 32-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234 +; 32-CMP-DAG: lw $[[R1:[0-9]+]], 16($sp) +; 32-CMP-DAG: lw $[[R2:[0-9]+]], 20($sp) +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $6, $[[R0]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $7, $[[R0]] +; 32-CMP-DAG: seleqz $[[T2:[0-9]+]], $[[R1]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T3:[0-9]+]], $[[R2]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T2]] +; 32-CMP-DAG: or $3, $[[T1]], $[[T3]] + +; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234 +; 64-CMOV: movz ${{[26]}}, $5, $[[R0]] + +; 64-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $5, $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $6, $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] + define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone { entry: %cmp = icmp eq i32 %a, 234 @@ -68,9 +161,33 @@ entry: ; (movz t, (setlt a, N + 1), f) ; if N + 1 fits in 16-bit. 
-; O32-LABEL: slti0: -; O32: slti $[[R0:[0-9]+]], ${{[0-9]+}}, 32767 -; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: slti0: + +; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: slti $[[R0:[0-9]+]], $4, 32767 +; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, 32767 +; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @slti0(i32 %a) { entry: @@ -79,19 +196,72 @@ entry: ret i32 %cond } -; O32-LABEL: slti1: -; O32: slt ${{[0-9]+}} +; ALL-LABEL: slti1: + +; 32-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767 +; 32-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; 32-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: addiu $[[I32767:[0-9]+]], $zero, 32767 +; 32-CMP-DAG: slt $[[R0:[0-9]+]], $[[I32767]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767 +; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; 64-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMP-DAG: addiu $[[R1:[0-9]+]], $zero, 32767 +; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @slti1(i32 %a) { entry: %cmp = icmp sgt i32 %a, 32767 - %cond = select i1 %cmp, i32 3, i32 5 + %cond = select i1 %cmp, i32 7, i32 5 ret i32 %cond } -; O32-LABEL: slti2: -; O32: slti $[[R0:[0-9]+]], ${{[0-9]+}}, -32768 -; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: slti2: + +; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: slti $[[R0:[0-9]+]], $4, -32768 +; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, 
$[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, -32768 +; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @slti2(i32 %a) { entry: @@ -100,8 +270,41 @@ entry: ret i32 %cond } -; O32-LABEL: slti3: -; O32: slt ${{[0-9]+}} +; ALL-LABEL: slti3: + +; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: lui $[[R1:[0-9]+]], 65535 +; 32-CMOV-DAG: ori $[[R1]], $[[R1]], 32766 +; 32-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; 32-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: lui $[[IMM:[0-9]+]], 65535 +; 32-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766 +; 32-CMP-DAG: slt $[[R0:[0-9]+]], $[[I32767]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: lui $[[R1:[0-9]+]], 65535 +; 64-CMOV-DAG: ori $[[R1]], $[[R1]], 32766 +; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; 64-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMP-DAG: lui $[[IMM:[0-9]+]], 65535 +; 64-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766 +; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[IMM]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @slti3(i32 %a) { entry: @@ -112,30 +315,117 @@ entry: ; 64-bit patterns. 
-; N64-LABEL: slti64_0: -; N64: slti $[[R0:[0-9]+]], ${{[0-9]+}}, 32767 -; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: slti64_0: + +; 32-CMOV-DAG: slt $[[CC:[0-9]+]], $zero, $4 +; 32-CMOV-DAG: addiu $[[I32766:[0-9]+]], $zero, 32766 +; 32-CMOV-DAG: sltu $[[R1:[0-9]+]], $[[I32766]], $5 +; 32-CMOV-DAG: movz $[[CC:[0-9]+]], $[[R1]], $4 +; 32-CMOV-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMOV-DAG: addiu $[[I4:3]], $zero, 4 +; 32-CMOV-DAG: movn $[[I4]], $[[I5]], $[[CC]] +; 32-CMOV-DAG: addiu $2, $zero, 0 + +; 32-CMP-DAG: slt $[[CC0:[0-9]+]], $zero, $4 +; 32-CMP-DAG: addiu $[[I32766:[0-9]+]], $zero, 32766 +; 32-CMP-DAG: sltu $[[CC1:[0-9]+]], $[[I32766]], $5 +; 32-CMP-DAG: seleqz $[[CC2:[0-9]+]], $[[CC0]], $4 +; 32-CMP-DAG: selnez $[[CC3:[0-9]+]], $[[CC1]], $4 +; 32-CMP: or $[[CC:[0-9]+]], $[[CC3]], $[[CC2]] +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4 +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I4]], $[[CC]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[CC]] +; 32-CMP-DAG: or $3, $[[T1]], $[[T0]] +; 32-CMP-DAG: addiu $2, $zero, 0 + +; 64-CMOV-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMOV-DAG: addiu $[[I4:2]], $zero, 4 +; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, 32767 +; 64-CMOV-DAG: movz $[[I4]], $[[I5]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4 +; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767 +; FIXME: We can do better than this by adding/subtracting the result of slti +; to/from one of the constants. +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i64 @slti64_0(i64 %a) { entry: %cmp = icmp sgt i64 %a, 32766 - %conv = select i1 %cmp, i64 3, i64 4 + %conv = select i1 %cmp, i64 5, i64 4 ret i64 %conv } -; N64-LABEL: slti64_1: -; N64: slt ${{[0-9]+}} +; ALL-LABEL: slti64_1: + +; 32-CMOV-DAG: slt $[[CC:[0-9]+]], $zero, $4 +; 32-CMOV-DAG: addiu $[[I32766:[0-9]+]], $zero, 32767 +; 32-CMOV-DAG: sltu $[[R1:[0-9]+]], $[[I32766]], $5 +; 32-CMOV-DAG: movz $[[CC:[0-9]+]], $[[R1]], $4 +; 32-CMOV-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMOV-DAG: addiu $[[I4:3]], $zero, 4 +; 32-CMOV-DAG: movn $[[I4]], $[[I5]], $[[CC]] +; 32-CMOV-DAG: addiu $2, $zero, 0 + +; 32-CMP-DAG: slt $[[CC0:[0-9]+]], $zero, $4 +; 32-CMP-DAG: addiu $[[I32766:[0-9]+]], $zero, 32767 +; 32-CMP-DAG: sltu $[[CC1:[0-9]+]], $[[I32766]], $5 +; 32-CMP-DAG: seleqz $[[CC2:[0-9]+]], $[[CC0]], $4 +; 32-CMP-DAG: selnez $[[CC3:[0-9]+]], $[[CC1]], $4 +; 32-CMP: or $[[CC:[0-9]+]], $[[CC3]], $[[CC2]] +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4 +; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I4]], $[[CC]] +; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[CC]] +; 32-CMP-DAG: or $3, $[[T1]], $[[T0]] +; 32-CMP-DAG: addiu $2, $zero, 0 + +; 64-CMOV-DAG: daddiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMOV-DAG: daddiu $[[I4:2]], $zero, 4 +; 64-CMOV-DAG: daddiu $[[R1:[0-9]+]], $zero, 32767 +; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; 64-CMOV-DAG: movn $[[I4]], $[[I5]], $[[R0]] + +; 64-CMP-DAG: daddiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: daddiu $[[I4:2]], $zero, 4 +; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], $zero, 32767 +; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i64 
@slti64_1(i64 %a) { entry: %cmp = icmp sgt i64 %a, 32767 - %conv = select i1 %cmp, i64 3, i64 4 + %conv = select i1 %cmp, i64 5, i64 4 ret i64 %conv } -; N64-LABEL: slti64_2: -; N64: slti $[[R0:[0-9]+]], ${{[0-9]+}}, -32768 -; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: slti64_2: + +; FIXME: The 32-bit versions of this test are too complicated to reasonably +; match at the moment. They do show some missing optimizations though +; such as: +; (movz $a, $b, (neg $c)) -> (movn $a, $b, $c) + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I4:2]], $zero, 4 +; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, -32768 +; 64-CMOV-DAG: movz $[[I4]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4 +; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768 +; FIXME: We can do better than this by adding/subtracting the result of slti +; to/from one of the constants. +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i64 @slti64_2(i64 %a) { entry: @@ -144,21 +434,64 @@ entry: ret i64 %conv } -; N64-LABEL: slti64_3: -; N64: slt ${{[0-9]+}} +; ALL-LABEL: slti64_3: + +; FIXME: The 32-bit versions of this test are too complicated to reasonably +; match at the moment. They do show some missing optimizations though +; such as: +; (movz $a, $b, (neg $c)) -> (movn $a, $b, $c) + +; 64-CMOV-DAG: daddiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMOV-DAG: daddiu $[[I4:2]], $zero, 4 +; 64-CMOV-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, 32766 +; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; 64-CMOV-DAG: movn $[[I4]], $[[I5]], $[[R0]] + +; 64-CMP-DAG: daddiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: daddiu $[[I4:2]], $zero, 4 +; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, 32766 +; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i64 @slti64_3(i64 %a) { entry: %cmp = icmp sgt i64 %a, -32770 - %conv = select i1 %cmp, i64 3, i64 4 + %conv = select i1 %cmp, i64 5, i64 4 ret i64 %conv } ; sltiu instructions. 
-; O32-LABEL: sltiu0: -; O32: sltiu $[[R0:[0-9]+]], ${{[0-9]+}}, 32767 -; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: sltiu0: + +; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, 32767 +; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, 32767 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, 32767 +; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, 32767 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @sltiu0(i32 %a) { entry: @@ -167,19 +500,72 @@ entry: ret i32 %cond } -; O32-LABEL: sltiu1: -; O32: sltu ${{[0-9]+}} +; ALL-LABEL: sltiu1: + +; 32-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767 +; 32-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4 +; 32-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: addiu $[[I32767:[0-9]+]], $zero, 32767 +; 32-CMP-DAG: sltu $[[R0:[0-9]+]], $[[I32767]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767 +; 64-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4 +; 64-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7 +; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMP-DAG: addiu $[[R1:[0-9]+]], $zero, 32767 +; 64-CMP-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @sltiu1(i32 %a) { entry: %cmp = icmp ugt i32 %a, 32767 - %cond = select i1 %cmp, i32 3, i32 5 + %cond = select i1 %cmp, i32 7, i32 5 ret i32 %cond } -; O32-LABEL: sltiu2: -; O32: sltiu $[[R0:[0-9]+]], ${{[0-9]+}}, -32768 -; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]] +; ALL-LABEL: sltiu2: + +; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, -32768 +; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, -32768 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 
32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, -32768 +; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 64-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, -32768 +; FIXME: We can do better than this by using selccz to choose between +0 and +2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @sltiu2(i32 %a) { entry: @@ -188,8 +574,41 @@ entry: ret i32 %cond } -; O32-LABEL: sltiu3: -; O32: sltu ${{[0-9]+}} +; ALL-LABEL: sltiu3: + +; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 32-CMOV-DAG: lui $[[R1:[0-9]+]], 65535 +; 32-CMOV-DAG: ori $[[R1]], $[[R1]], 32766 +; 32-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4 +; 32-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]] + +; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5 +; 32-CMP-DAG: lui $[[IMM:[0-9]+]], 65535 +; 32-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766 +; 32-CMP-DAG: sltu $[[R0:[0-9]+]], $[[I32767]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 32-CMP-DAG: or $2, $[[T0]], $[[T1]] + +; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMOV-DAG: lui $[[R1:[0-9]+]], 65535 +; 64-CMOV-DAG: ori $[[R1]], $[[R1]], 32766 +; 64-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4 +; 64-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]] + +; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3 +; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5 +; 64-CMP-DAG: lui $[[IMM:[0-9]+]], 65535 +; 64-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766 +; 64-CMP-DAG: sltu $[[R0:[0-9]+]], $[[IMM]], $4 +; FIXME: We can do better than this by using selccz to choose between -0 and -2 +; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]] +; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]] +; 64-CMP-DAG: or $2, $[[T0]], $[[T1]] define i32 @sltiu3(i32 %a) { entry: @@ -210,11 +629,25 @@ define i32 @slti4(i32 %a) nounwind readnone { ret i32 %2 } -; O32-LABEL: slti4: -; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 -; O32-NOT: movn -; O32:.size slti4 +; ALL-LABEL: slti4: + +; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 32-CMOV-DAG: addiu $2, [[R1]], 3 +; 32-CMOV-NOT: movn + +; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 32-CMP-DAG: addiu $2, [[R1]], 3 +; 32-CMP-NOT: seleqz +; 32-CMP-NOT: selnez + +; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 64-CMOV-DAG: addiu $2, [[R1]], 3 +; 64-CMOV-NOT: movn + +; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 64-CMP-DAG: addiu $2, [[R1]], 3 +; 64-CMP-NOT: seleqz +; 64-CMP-NOT: selnez define i32 @slti5(i32 %a) nounwind readnone { %1 = icmp slt i32 %a, 7 @@ -222,11 +655,25 @@ define i32 @slti5(i32 %a) nounwind readnone { ret i32 %2 } -; O32-LABEL: slti5: -; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; O32-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4 -; O32-NOT: movn -; O32:.size slti5 +; ALL-LABEL: slti5: + +; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 32-CMOV-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4 +; 32-CMOV-NOT: movn + +; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 32-CMP-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4 +; 32-CMP-NOT: seleqz +; 32-CMP-NOT: selnez + +; 
64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 64-CMOV-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4 +; 64-CMOV-NOT: movn + +; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 64-CMP-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4 +; 64-CMP-NOT: seleqz +; 64-CMP-NOT: selnez define i32 @slti6(i32 %a) nounwind readnone { %1 = icmp slt i32 %a, 7 @@ -234,9 +681,26 @@ define i32 @slti6(i32 %a) nounwind readnone { ret i32 %2 } -; O32-LABEL: slti6: -; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; O32-DAG: xori [[R1]], [[R1]], 1 -; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 -; O32-NOT: movn -; O32:.size slti6 +; ALL-LABEL: slti6: + +; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 32-CMOV-DAG: xori [[R1]], [[R1]], 1 +; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 +; 32-CMOV-NOT: movn + +; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 32-CMP-DAG: xori [[R1]], [[R1]], 1 +; 32-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 +; 32-CMP-NOT: seleqz +; 32-CMP-NOT: selnez + +; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 64-CMOV-DAG: xori [[R1]], [[R1]], 1 +; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 +; 64-CMOV-NOT: movn + +; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 +; 64-CMP-DAG: xori [[R1]], [[R1]], 1 +; 64-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 +; 64-CMP-NOT: seleqz +; 64-CMP-NOT: selnez diff --git a/test/CodeGen/Mips/countleading.ll b/test/CodeGen/Mips/countleading.ll new file mode 100644 index 000000000000..81fb2b44942b --- /dev/null +++ b/test/CodeGen/Mips/countleading.ll @@ -0,0 +1,90 @@ +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R1-R2 -check-prefix=MIPS32-GT-R1 %s +; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R1-R2 -check-prefix=MIPS32-GT-R1 %s +; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R6 -check-prefix=MIPS32-GT-R1 %s +; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS4 %s +; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s +; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s +; R!N: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s + +; Prefixes: +; ALL - All +; MIPS32-GT-R1 - MIPS64r1 and above (does not include MIPS64's) +; MIPS64-GT-R1 - MIPS64r1 and above + +define i32 @ctlz_i32(i32 %X) nounwind readnone { +entry: +; ALL-LABEL: ctlz_i32: + +; MIPS4-NOT: clz + +; MIPS32-GT-R1: clz $2, $4 + +; MIPS64-GT-R1: clz $2, $4 + + %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %X, i1 true) + ret i32 %tmp1 +} + +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone + +define i32 @ctlo_i32(i32 %X) nounwind readnone { +entry: +; ALL-LABEL: ctlo_i32: + +; MIPS4-NOT: clo + +; MIPS32-GT-R1: clo $2, $4 + +; MIPS64-GT-R1: clo $2, $4 + + %neg = xor i32 %X, -1 + %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %neg, i1 true) + ret i32 %tmp1 +} + +define i64 @ctlz_i64(i64 %X) nounwind readnone { +entry: +; ALL-LABEL: ctlz_i64: + +; MIPS4-NOT: dclz + +; MIPS32-GT-R1-DAG: clz $[[R0:[0-9]+]], $4 +; MIPS32-GT-R1-DAG: clz $[[R1:[0-9]+]], $5 +; MIPS32-GT-R1-DAG: addiu $[[R2:2+]], $[[R0]], 32 +; MIPS32-R1-R2-DAG: movn $[[R2]], $[[R1]], $5 +; MIPS32-R6-DAG: selnez $[[R5:[0-9]+]], $[[R2]], $5 +; MIPS32-R6-DAG: seleqz $[[R6:[0-9]+]], $[[R1]], $5 +; MIPS32-R6-DAG: or $2, $[[R5]], $[[R6]] +; MIPS32-GT-R1-DAG: addiu $3, $zero, 0 + +; MIPS64-GT-R1: dclz $2, $4 + + %tmp1 = tail call i64 
@llvm.ctlz.i64(i64 %X, i1 true) + ret i64 %tmp1 +} + +declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone + +define i64 @ctlo_i64(i64 %X) nounwind readnone { +entry: +; ALL-LABEL: ctlo_i64: + +; MIPS4-NOT: dclo + +; MIPS32-GT-R1-DAG: clo $[[R0:[0-9]+]], $4 +; MIPS32-GT-R1-DAG: clo $[[R1:[0-9]+]], $5 +; MIPS32-GT-R1-DAG: addiu $[[R2:2+]], $[[R0]], 32 +; MIPS32-GT-R1-DAG: addiu $[[R3:[0-9]+]], $zero, -1 +; MIPS32-GT-R1-DAG: xor $[[R4:[0-9]+]], $5, $[[R3]] +; MIPS32-R1-R2-DAG: movn $[[R2]], $[[R1]], $[[R4]] +; MIPS32-R6-DAG: selnez $[[R5:[0-9]+]], $[[R1]], $[[R4]] +; MIPS32-R6-DAG: seleqz $[[R6:[0-9]+]], $[[R2]], $[[R4]] +; MIPS32-R6-DAG: or $2, $[[R5]], $[[R6]] +; MIPS32-GT-R1-DAG: addiu $3, $zero, 0 + +; MIPS64-GT-R1: dclo $2, $4 + + %neg = xor i64 %X, -1 + %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true) + ret i64 %tmp1 +} diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll index b631c3b279f4..97f836044406 100644 --- a/test/CodeGen/Mips/divrem.ll +++ b/test/CodeGen/Mips/divrem.ll @@ -1,77 +1,223 @@ -; RUN: llc -march=mips -verify-machineinstrs < %s |\ -; RUN: FileCheck %s -check-prefix=TRAP -; RUN: llc -march=mips -mno-check-zero-division < %s |\ -; RUN: FileCheck %s -check-prefix=NOCHECK +; RUN: llc -march=mips -mcpu=mips32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=ACC32-TRAP +; RUN: llc -march=mips -mcpu=mips32r2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=ACC32-TRAP +; RUN: llc -march=mips -mcpu=mips32r6 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=GPR32-TRAP +; RUN: llc -march=mips64 -mcpu=mips64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=ACC64-TRAP +; RUN: llc -march=mips64 -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=ACC64-TRAP +; RUN: llc -march=mips64 -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=GPR64-TRAP -; TRAP-LABEL: sdiv1: -; TRAP: div $zero, ${{[0-9]+}}, $[[R0:[0-9]+]] -; TRAP: teq $[[R0]], $zero, 7 -; TRAP: mflo +; RUN: llc -march=mips -mcpu=mips32 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=NOCHECK +; RUN: llc -march=mips -mcpu=mips32r2 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=NOCHECK +; RUN: llc -march=mips -mcpu=mips32r6 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=NOCHECK +; RUN: llc -march=mips64 -mcpu=mips64 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=NOCHECK +; RUN: llc -march=mips64 -mcpu=mips64r2 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=NOCHECK +; RUN: llc -march=mips64 -mcpu=mips64r6 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=NOCHECK -; NOCHECK-LABEL: sdiv1: -; NOCHECK-NOT: teq -; NOCHECK: .end sdiv1 +; FileCheck Prefixes: +; ALL - All targets +; ACC32 - Accumulator based multiply/divide on 32-bit targets +; ACC64 - Same as ACC32 but only for 64-bit targets +; GPR32 - GPR based multiply/divide on 32-bit targets +; GPR64 - Same as GPR32 but only for 64-bit targets +; ACC32-TRAP - Same as TRAP and ACC32 combined +; ACC64-TRAP - Same as TRAP and ACC64 combined +; GPR32-TRAP - Same as TRAP and GPR32 
combined +; GPR64-TRAP - Same as TRAP and GPR64 combined +; NOCHECK - Division by zero will not be detected @g0 = common global i32 0, align 4 @g1 = common global i32 0, align 4 define i32 @sdiv1(i32 %a0, i32 %a1) nounwind readnone { entry: +; ALL-LABEL: sdiv1: + +; ACC32: div $zero, $4, $5 +; ACC32-TRAP: teq $5, $zero, 7 + +; ACC64: div $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR32: div $2, $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 + +; GPR64: div $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC32: mflo $2 +; ACC64: mflo $2 + +; ALL: .end sdiv1 + %div = sdiv i32 %a0, %a1 ret i32 %div } -; TRAP-LABEL: srem1: -; TRAP: div $zero, ${{[0-9]+}}, $[[R0:[0-9]+]] -; TRAP: teq $[[R0]], $zero, 7 -; TRAP: mfhi - define i32 @srem1(i32 %a0, i32 %a1) nounwind readnone { entry: +; ALL-LABEL: srem1: + +; ACC32: div $zero, $4, $5 +; ACC32-TRAP: teq $5, $zero, 7 + +; ACC64: div $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR32: mod $2, $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 + +; GPR64: mod $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC32: mfhi $2 +; ACC64: mfhi $2 + +; ALL: .end srem1 + %rem = srem i32 %a0, %a1 ret i32 %rem } -; TRAP-LABEL: udiv1: -; TRAP: divu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]] -; TRAP: teq $[[R0]], $zero, 7 -; TRAP: mflo - define i32 @udiv1(i32 %a0, i32 %a1) nounwind readnone { entry: +; ALL-LABEL: udiv1: + +; ACC32: divu $zero, $4, $5 +; ACC32-TRAP: teq $5, $zero, 7 + +; ACC64: divu $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR32: divu $2, $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 + +; GPR64: divu $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC32: mflo $2 +; ACC64: mflo $2 + +; ALL: .end udiv1 %div = udiv i32 %a0, %a1 ret i32 %div } -; TRAP-LABEL: urem1: -; TRAP: divu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]] -; TRAP: teq $[[R0]], $zero, 7 -; TRAP: mfhi - define i32 @urem1(i32 %a0, i32 %a1) nounwind readnone { entry: +; ALL-LABEL: urem1: + +; ACC32: divu $zero, $4, $5 +; ACC32-TRAP: teq $5, $zero, 7 + +; ACC64: divu $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR32: modu $2, $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 + +; GPR64: modu $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC32: mfhi $2 +; ACC64: mfhi $2 + +; ALL: .end urem1 + %rem = urem i32 %a0, %a1 ret i32 %rem } -; TRAP: div $zero, define i32 @sdivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind { entry: +; ALL-LABEL: sdivrem1: + +; ACC32: div $zero, $4, $5 +; ACC32-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; ACC32: mflo $2 +; ACC32: mfhi $[[R0:[0-9]+]] +; ACC32: sw $[[R0]], 0(${{[0-9]+}}) + +; ACC64: div $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; ACC64: mflo $2 +; ACC64: mfhi $[[R0:[0-9]+]] +; ACC64: sw $[[R0]], 0(${{[0-9]+}}) + +; GPR32: mod $[[R0:[0-9]+]], $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; GPR32: sw $[[R0]], 0(${{[0-9]+}}) +; GPR32-DAG: div $2, $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 + +; GPR64: mod $[[R0:[0-9]+]], $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; GPR64: sw $[[R0]], 0(${{[0-9]+}}) +; GPR64-DAG: div $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq + +; ALL: .end sdivrem1 + %rem = srem i32 %a0, %a1 store i32 %rem, i32* %r, align 4 %div = sdiv i32 %a0, %a1 ret i32 %div } -; TRAP: divu $zero, define i32 @udivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind { entry: +; ALL-LABEL: udivrem1: + +; ACC32: divu $zero, $4, $5 +; ACC32-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; ACC32: mflo $2 +; ACC32: mfhi 
$[[R0:[0-9]+]] +; ACC32: sw $[[R0]], 0(${{[0-9]+}}) + +; ACC64: divu $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; ACC64: mflo $2 +; ACC64: mfhi $[[R0:[0-9]+]] +; ACC64: sw $[[R0]], 0(${{[0-9]+}}) + +; GPR32: modu $[[R0:[0-9]+]], $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; GPR32: sw $[[R0]], 0(${{[0-9]+}}) +; GPR32-DAG: divu $2, $4, $5 +; GPR32-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq + +; GPR64: modu $[[R0:[0-9]+]], $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; GPR64: sw $[[R0]], 0(${{[0-9]+}}) +; GPR64-DAG: divu $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq + +; ALL: .end udivrem1 + %rem = urem i32 %a0, %a1 store i32 %rem, i32* %r, align 4 %div = udiv i32 %a0, %a1 ret i32 %div } +; FIXME: It's not clear what this is supposed to test. define i32 @killFlags() { entry: %0 = load i32* @g0, align 4 @@ -79,3 +225,164 @@ entry: %div = sdiv i32 %0, %1 ret i32 %div } + +define i64 @sdiv2(i64 %a0, i64 %a1) nounwind readnone { +entry: +; ALL-LABEL: sdiv2: + +; ACC32: lw $25, %call16(__divdi3)( +; ACC32: jalr $25 + +; ACC64: ddiv $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR64: ddiv $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC64: mflo $2 + +; ALL: .end sdiv2 + + %div = sdiv i64 %a0, %a1 + ret i64 %div +} + +define i64 @srem2(i64 %a0, i64 %a1) nounwind readnone { +entry: +; ALL-LABEL: srem2: + +; ACC32: lw $25, %call16(__moddi3)( +; ACC32: jalr $25 + +; ACC64: div $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR64: dmod $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC64: mfhi $2 + +; ALL: .end srem2 + + %rem = srem i64 %a0, %a1 + ret i64 %rem +} + +define i64 @udiv2(i64 %a0, i64 %a1) nounwind readnone { +entry: +; ALL-LABEL: udiv2: + +; ACC32: lw $25, %call16(__udivdi3)( +; ACC32: jalr $25 + +; ACC64: divu $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR64: ddivu $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC64: mflo $2 + +; ALL: .end udiv2 + %div = udiv i64 %a0, %a1 + ret i64 %div +} + +define i64 @urem2(i64 %a0, i64 %a1) nounwind readnone { +entry: +; ALL-LABEL: urem2: + +; ACC32: lw $25, %call16(__umoddi3)( +; ACC32: jalr $25 + +; ACC64: divu $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 + +; GPR64: dmodu $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 + +; NOCHECK-NOT: teq + +; ACC64: mfhi $2 + +; ALL: .end urem2 + + %rem = urem i64 %a0, %a1 + ret i64 %rem +} + +define i64 @sdivrem2(i64 %a0, i64 %a1, i64* nocapture %r) nounwind { +entry: +; ALL-LABEL: sdivrem2: + +; sdivrem2 is too complex to effectively check. We can at least check for the +; calls though. +; ACC32: lw $25, %call16(__moddi3)( +; ACC32: jalr $25 +; ACC32: lw $25, %call16(__divdi3)( +; ACC32: jalr $25 + +; ACC64: ddiv $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; ACC64: mflo $2 +; ACC64: mfhi $[[R0:[0-9]+]] +; ACC64: sd $[[R0]], 0(${{[0-9]+}}) + +; GPR64: dmod $[[R0:[0-9]+]], $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; GPR64: sd $[[R0]], 0(${{[0-9]+}}) + +; GPR64-DAG: ddiv $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq + +; ALL: .end sdivrem2 + + %rem = srem i64 %a0, %a1 + store i64 %rem, i64* %r, align 8 + %div = sdiv i64 %a0, %a1 + ret i64 %div +} + +define i64 @udivrem2(i64 %a0, i64 %a1, i64* nocapture %r) nounwind { +entry: +; ALL-LABEL: udivrem2: + +; udivrem2 is too complex to effectively check. We can at least check for the +; calls though. 
+; ACC32: lw $25, %call16(__umoddi3)( +; ACC32: jalr $25 +; ACC32: lw $25, %call16(__udivdi3)( +; ACC32: jalr $25 + +; ACC64: ddivu $zero, $4, $5 +; ACC64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; ACC64: mflo $2 +; ACC64: mfhi $[[R0:[0-9]+]] +; ACC64: sd $[[R0]], 0(${{[0-9]+}}) + +; GPR64: dmodu $[[R0:[0-9]+]], $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq +; GPR64: sd $[[R0]], 0(${{[0-9]+}}) + +; GPR64-DAG: ddivu $2, $4, $5 +; GPR64-TRAP: teq $5, $zero, 7 +; NOCHECK-NOT: teq + +; ALL: .end udivrem2 + + %rem = urem i64 %a0, %a1 + store i64 %rem, i64* %r, align 8 + %div = udiv i64 %a0, %a1 + ret i64 %div +} diff --git a/test/CodeGen/Mips/dsp-r1.ll b/test/CodeGen/Mips/dsp-r1.ll index acdd17d1afd4..fbd970399640 100644 --- a/test/CodeGen/Mips/dsp-r1.ll +++ b/test/CodeGen/Mips/dsp-r1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s +; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+dsp < %s | FileCheck %s define i32 @test__builtin_mips_extr_w1(i32 %i0, i32, i64 %a0) nounwind { entry: diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll new file mode 100644 index 000000000000..e78497a9521e --- /dev/null +++ b/test/CodeGen/Mips/ehframe-indirect.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck %s + +define i32 @main() { +; CHECK: .cfi_startproc +; CHECK: .cfi_personality 128, DW.ref.__gxx_personality_v0 + +entry: + invoke void @foo() to label %cont unwind label %lpad +; CHECK: foo +; CHECK: jalr + +lpad: + %0 = landingpad { i8*, i32 } personality i8* + bitcast (i32 (...)* @__gxx_personality_v0 to i8*) catch i8* null + ret i32 0 + +cont: + ret i32 0 +} +; CHECK: .cfi_endproc + +declare i32 @__gxx_personality_v0(...) 
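+; Note: the personality encoding 128 checked above is DW_EH_PE_indirect, i.e.
+; the personality routine is referenced through the DW.ref.__gxx_personality_v0
+; object whose emission is checked at the end of this file.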
+ +declare void @foo() + +; CHECK: .hidden DW.ref.__gxx_personality_v0 +; CHECK: .weak DW.ref.__gxx_personality_v0 +; CHECK: .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat +; CHECK: .align 2 +; CHECK: .type DW.ref.__gxx_personality_v0,@object +; CHECK: .size DW.ref.__gxx_personality_v0, 4 +; CHECK: DW.ref.__gxx_personality_v0: +; CHECK: .4byte __gxx_personality_v0 diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll new file mode 100644 index 000000000000..27246fe20bb2 --- /dev/null +++ b/test/CodeGen/Mips/fcmp.ll @@ -0,0 +1,727 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32-C +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32-C +; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32-CMP +; RUN: llc < %s -march=mips64el -mcpu=mips4 | FileCheck %s -check-prefix=ALL -check-prefix=64-C +; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=64-C +; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=64-C +; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMP + +define i32 @false_f32(float %a, float %b) nounwind { +; ALL-LABEL: false_f32: +; ALL: addiu $2, $zero, 0 + + %1 = fcmp false float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @oeq_f32(float %a, float %b) nounwind { +; ALL-LABEL: oeq_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.eq.s $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.eq.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp oeq float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ogt_f32(float %a, float %b) nounwind { +; ALL-LABEL: ogt_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ule.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ule.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.olt.s $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.olt.s $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ogt float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @oge_f32(float %a, float %b) nounwind { +; ALL-LABEL: oge_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ult.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ult.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ole.s $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ole.s $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp oge float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @olt_f32(float %a, float %b) nounwind { +; ALL-LABEL: olt_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.olt.s $f12, $f14 
+; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.olt.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.olt.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.olt.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp olt float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ole_f32(float %a, float %b) nounwind { +; ALL-LABEL: ole_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ole.s $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ole.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ole.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ole.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ole float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @one_f32(float %a, float %b) nounwind { +; ALL-LABEL: one_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ueq.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ueq.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 32-CMP-DAG: not $2, $[[T1]] + +; 64-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 64-CMP-DAG: not $2, $[[T1]] + + %1 = fcmp one float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ord_f32(float %a, float %b) nounwind { +; ALL-LABEL: ord_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.un.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.un.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 32-CMP-DAG: not $2, $[[T1]] + +; 64-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 64-CMP-DAG: not $2, $[[T1]] + + %1 = fcmp ord float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ueq_f32(float %a, float %b) nounwind { +; ALL-LABEL: ueq_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ueq.s $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ueq.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ueq float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ugt_f32(float %a, float %b) nounwind { +; ALL-LABEL: ugt_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ole.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ole.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ult.s 
$[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ugt float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @uge_f32(float %a, float %b) nounwind { +; ALL-LABEL: uge_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.olt.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.olt.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp uge float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ult_f32(float %a, float %b) nounwind { +; ALL-LABEL: ult_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ult.s $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ult.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ult float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ule_f32(float %a, float %b) nounwind { +; ALL-LABEL: ule_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ule.s $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ule.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ule float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @une_f32(float %a, float %b) nounwind { +; ALL-LABEL: une_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.eq.s $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.eq.s $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 32-CMP-DAG: not $2, $[[T1]] + +; 64-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 64-CMP-DAG: not $2, $[[T1]] + + %1 = fcmp une float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @uno_f32(float %a, float %b) nounwind { +; ALL-LABEL: uno_f32: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.un.s $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.un.s $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp uno float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @true_f32(float %a, float %b) nounwind { +; ALL-LABEL: true_f32: +; ALL: addiu $2, 
$zero, 1 + + %1 = fcmp true float %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @false_f64(double %a, double %b) nounwind { +; ALL-LABEL: false_f64: +; ALL: addiu $2, $zero, 0 + + %1 = fcmp false double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @oeq_f64(double %a, double %b) nounwind { +; ALL-LABEL: oeq_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.eq.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.eq.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp oeq double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ogt_f64(double %a, double %b) nounwind { +; ALL-LABEL: ogt_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ule.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ule.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.olt.d $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.olt.d $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ogt double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @oge_f64(double %a, double %b) nounwind { +; ALL-LABEL: oge_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ult.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ult.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ole.d $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ole.d $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp oge double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @olt_f64(double %a, double %b) nounwind { +; ALL-LABEL: olt_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.olt.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.olt.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.olt.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.olt.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp olt double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ole_f64(double %a, double %b) nounwind { +; ALL-LABEL: ole_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ole.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ole.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ole.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ole.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ole double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @one_f64(double %a, double %b) nounwind { +; ALL-LABEL: one_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ueq.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ueq.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 32-CMP-DAG: not $2, $[[T1]] + +; 64-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 64-CMP-DAG: not $2, $[[T1]] + + %1 = fcmp one double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ord_f64(double %a, double %b) nounwind { +; ALL-LABEL: ord_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.un.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.un.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 32-CMP-DAG: not $2, $[[T1]] + +; 64-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 64-CMP-DAG: not $2, $[[T1]] + + %1 = fcmp ord double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ueq_f64(double %a, double %b) nounwind { +; ALL-LABEL: ueq_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ueq.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ueq.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ueq double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ugt_f64(double %a, double %b) nounwind { +; ALL-LABEL: ugt_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ole.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ole.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ugt double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @uge_f64(double %a, double %b) nounwind { +; ALL-LABEL: uge_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.olt.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.olt.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f14, $f12 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f13, $f12 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp uge double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ult_f64(double %a, double %b) nounwind { +; ALL-LABEL: ult_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ult.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: 
c.ult.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ult double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @ule_f64(double %a, double %b) nounwind { +; ALL-LABEL: ule_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.ule.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.ule.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp ule double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @une_f64(double %a, double %b) nounwind { +; ALL-LABEL: une_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.eq.d $f12, $f14 +; 32-C-DAG: movf $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.eq.d $f12, $f13 +; 64-C-DAG: movf $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 32-CMP-DAG: not $2, $[[T1]] + +; 64-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]] +; 64-CMP-DAG: not $2, $[[T1]] + + %1 = fcmp une double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @uno_f64(double %a, double %b) nounwind { +; ALL-LABEL: uno_f64: + +; 32-C-DAG: addiu $[[T0:2]], $zero, 0 +; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 32-C-DAG: c.un.d $f12, $f14 +; 32-C-DAG: movt $[[T0]], $1, $fcc0 + +; 64-C-DAG: addiu $[[T0:2]], $zero, 0 +; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1 +; 64-C-DAG: c.un.d $f12, $f13 +; 64-C-DAG: movt $[[T0]], $1, $fcc0 + +; 32-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f14 +; 32-CMP-DAG: mfc1 $2, $[[T0]] + +; 64-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f13 +; 64-CMP-DAG: mfc1 $2, $[[T0]] + + %1 = fcmp uno double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @true_f64(double %a, double %b) nounwind { +; ALL-LABEL: true_f64: +; ALL: addiu $2, $zero, 1 + + %1 = fcmp true double %a, %b + %2 = zext i1 %1 to i32 + ret i32 %2 +} diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll index 44c4117510bf..3a9d9c73b279 100644 --- a/test/CodeGen/Mips/fcopysign.ll +++ b/test/CodeGen/Mips/fcopysign.ll @@ -17,7 +17,7 @@ entry: ; 32R2: ext $[[EXT:[0-9]+]], ${{[0-9]+}}, 31, 1 ; 32R2: ins $[[INS:[0-9]+]], $[[EXT]], 31, 1 -; 32R2: mtc1 $[[INS]], $f1 +; 32R2: mthc1 $[[INS]], $f0 ; 64: daddiu $[[T0:[0-9]+]], $zero, 1 ; 64: dsll $[[MSK1:[0-9]+]], $[[T0]], 63 diff --git a/test/CodeGen/Mips/fmadd1.ll b/test/CodeGen/Mips/fmadd1.ll index a9a8e212e491..271631efb40a 100644 --- a/test/CodeGen/Mips/fmadd1.ll +++ b/test/CodeGen/Mips/fmadd1.ll @@ -5,15 +5,54 @@ ; IEEE 754 (1985) and IEEE 754 (2008). These instructions are therefore only ; available when -enable-no-nans-fp-math is given. 
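+; Check-prefix scheme used below: one base prefix per ISA (32, 32R2, 32R6, 64,
+; 64R2, 64R6), with -NONAN/-NAN variants for runs with and without
+; -enable-no-nans-fp-math; the variants only differ where the negated forms
+; (nmadd/nmsub) may be formed.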
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=32R2 -check-prefix=CHECK -; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=64R2 -check-prefix=CHECK -; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2NAN -check-prefix=CHECK -; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2NAN -check-prefix=CHECK +; RUN: llc < %s -march=mipsel -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NONAN +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NONAN +; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NONAN +; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64 -check-prefix=64-NONAN +; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NONAN +; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NONAN +; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NAN +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NAN +; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NAN +; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64 -check-prefix=64-NAN +; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NAN +; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NAN define float @FOO0float(float %a, float %b, float %c) nounwind readnone { entry: -; CHECK-LABEL: FOO0float: -; CHECK: madd.s +; ALL-LABEL: FOO0float: + +; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: add.s $f0, $[[T1]], $[[T2]] + +; 32R2: mtc1 $6, $[[T0:f[0-9]+]] +; 32R2: madd.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2: add.s $f0, $[[T1]], $[[T2]] + +; 32R6-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: add.s $f0, $[[T1]], $[[T2]] + +; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: add.s $f0, $[[T1]], $[[T2]] + +; 64R2: madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2: add.s $f0, $[[T0]], $[[T1]] + +; 64R6-DAG: mul.s $[[T0:f[0-9]+]], $f12, $f13 +; 64R6-DAG: add.s $[[T1:f[0-9]+]], $[[T0]], $f14 +; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: add.s $f0, $[[T1]], $[[T2]] + %mul = fmul float %a, %b %add = fadd float %mul, %c %add1 = fadd float %add, 0.000000e+00 @@ -22,8 +61,39 @@ entry: define float @FOO1float(float %a, float %b, float %c) nounwind 
readnone { entry: -; CHECK-LABEL: FOO1float: -; CHECK: msub.s +; ALL-LABEL: FOO1float: + +; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: add.s $f0, $[[T1]], $[[T2]] + +; 32R2: mtc1 $6, $[[T0:f[0-9]+]] +; 32R2: msub.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2: add.s $f0, $[[T1]], $[[T2]] + +; 32R6-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: add.s $f0, $[[T1]], $[[T2]] + +; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: add.s $f0, $[[T1]], $[[T2]] + +; 64R2: msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2: add.s $f0, $[[T0]], $[[T1]] + +; 64R6-DAG: mul.s $[[T0:f[0-9]+]], $f12, $f13 +; 64R6-DAG: sub.s $[[T1:f[0-9]+]], $[[T0]], $f14 +; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: add.s $f0, $[[T1]], $[[T2]] + %mul = fmul float %a, %b %sub = fsub float %mul, %c %add = fadd float %sub, 0.000000e+00 @@ -32,11 +102,44 @@ entry: define float @FOO2float(float %a, float %b, float %c) nounwind readnone { entry: -; CHECK-LABEL: FOO2float: -; 32R2: nmadd.s -; 64R2: nmadd.s -; 32R2NAN: madd.s -; 64R2NAN: madd.s +; ALL-LABEL: FOO2float: + +; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: sub.s $f0, $[[T2]], $[[T1]] + +; 32R2-NONAN: mtc1 $6, $[[T0:f[0-9]+]] +; 32R2-NONAN: nmadd.s $f0, $[[T0]], $f12, $f14 + +; 32R2-NAN: mtc1 $6, $[[T0:f[0-9]+]] +; 32R2-NAN: madd.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2-NAN: sub.s $f0, $[[T2]], $[[T1]] + +; 32R6-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: sub.s $f0, $[[T2]], $[[T1]] + +; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: sub.s $f0, $[[T2]], $[[T1]] + +; 64R2-NONAN: nmadd.s $f0, $f14, $f12, $f13 + +; 64R2-NAN: madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2-NAN: sub.s $f0, $[[T1]], $[[T0]] + +; 64R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13 +; 64R6-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: sub.s $f0, $[[T2]], $[[T1]] + %mul = fmul float %a, %b %add = fadd float %mul, %c %sub = fsub float 0.000000e+00, %add @@ -45,11 +148,36 @@ entry: define float @FOO3float(float %a, float %b, float %c) nounwind readnone { entry: -; CHECK-LABEL: FOO3float: -; 32R2: nmsub.s -; 64R2: nmsub.s -; 32R2NAN: msub.s -; 64R2NAN: msub.s +; ALL-LABEL: FOO3float: + +; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]] +; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: sub.s $f0, $[[T2]], $[[T1]] + +; 32R2-NONAN: mtc1 $6, $[[T0:f[0-9]+]] +; 32R2-NONAN: nmsub.s $f0, $[[T0]], $f12, $f14 + +; 32R2-NAN: mtc1 $6, $[[T0:f[0-9]+]] +; 32R2-NAN: msub.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2-NAN: sub.s $f0, $[[T2]], $[[T1]] + +; 64-DAG: mul.s 
$[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: sub.s $f0, $[[T2]], $[[T1]] + +; 64R2-NAN: msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2-NAN: sub.s $f0, $[[T1]], $[[T0]] + +; 64R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13 +; 64R6-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: sub.s $f0, $[[T2]], $[[T1]] + %mul = fmul float %a, %b %sub = fsub float %mul, %c %sub1 = fsub float 0.000000e+00, %sub @@ -58,8 +186,40 @@ entry: define double @FOO10double(double %a, double %b, double %c) nounwind readnone { entry: -; CHECK-LABEL: FOO10double: -; CHECK: madd.d +; ALL-LABEL: FOO10double: + +; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: add.d $f0, $[[T1]], $[[T2]] + +; 32R2: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R2: madd.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2: mthc1 $zero, $[[T2]] +; 32R2: add.d $f0, $[[T1]], $[[T2]] + +; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: add.d $f0, $[[T1]], $[[T2]] + +; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: add.d $f0, $[[T1]], $[[T2]] + +; 64R2: madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2: add.d $f0, $[[T0]], $[[T1]] + +; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: add.d $f0, $[[T1]], $[[T2]] + %mul = fmul double %a, %b %add = fadd double %mul, %c %add1 = fadd double %add, 0.000000e+00 @@ -68,8 +228,40 @@ entry: define double @FOO11double(double %a, double %b, double %c) nounwind readnone { entry: -; CHECK-LABEL: FOO11double: -; CHECK: msub.d +; ALL-LABEL: FOO11double: + +; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: add.d $f0, $[[T1]], $[[T2]] + +; 32R2: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R2: msub.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2: mthc1 $zero, $[[T2]] +; 32R2: add.d $f0, $[[T1]], $[[T2]] + +; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: add.d $f0, $[[T1]], $[[T2]] + +; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: add.d $f0, $[[T1]], $[[T2]] + +; 64R2: msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2: add.d $f0, $[[T0]], $[[T1]] + +; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: add.d $f0, $[[T1]], $[[T2]] + %mul = fmul double %a, %b %sub = fsub double %mul, %c %add = fadd double %sub, 0.000000e+00 @@ -78,11 +270,45 @@ entry: define double @FOO12double(double %a, double %b, double %c) nounwind readnone { entry: -; CHECK-LABEL: FOO12double: -; 32R2: nmadd.d -; 64R2: nmadd.d 
-; 32R2NAN: madd.d -; 64R2NAN: madd.d +; ALL-LABEL: FOO12double: + +; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: sub.d $f0, $[[T2]], $[[T1]] + +; 32R2-NONAN: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R2-NONAN: nmadd.d $f0, $[[T0]], $f12, $f14 + +; 32R2-NAN: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R2-NAN: madd.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2-NAN: mthc1 $zero, $[[T2]] +; 32R2-NAN: sub.d $f0, $[[T2]], $[[T1]] + +; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: sub.d $f0, $[[T2]], $[[T1]] + +; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: sub.d $f0, $[[T2]], $[[T1]] + +; 64R2-NONAN: nmadd.d $f0, $f14, $f12, $f13 + +; 64R2-NAN: madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2-NAN: sub.d $f0, $[[T1]], $[[T0]] + +; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: sub.d $f0, $[[T2]], $[[T1]] + %mul = fmul double %a, %b %add = fadd double %mul, %c %sub = fsub double 0.000000e+00, %add @@ -91,11 +317,45 @@ entry: define double @FOO13double(double %a, double %b, double %c) nounwind readnone { entry: -; CHECK-LABEL: FOO13double: -; 32R2: nmsub.d -; 64R2: nmsub.d -; 32R2NAN: msub.d -; 64R2NAN: msub.d +; ALL-LABEL: FOO13double: + +; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32-DAG: sub.d $f0, $[[T2]], $[[T1]] + +; 32R2-NONAN: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R2-NONAN: nmsub.d $f0, $[[T0]], $f12, $f14 + +; 32R2-NAN: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R2-NAN: msub.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14 +; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R2-NAN: mthc1 $zero, $[[T2]] +; 32R2-NAN: sub.d $f0, $[[T2]], $[[T1]] + +; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp) +; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14 +; 32R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]] +; 32R6-DAG: sub.d $f0, $[[T2]], $[[T1]] + +; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64-DAG: sub.d $f0, $[[T2]], $[[T1]] + +; 64R2-NONAN: nmsub.d $f0, $f14, $f12, $f13 + +; 64R2-NAN: msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13 +; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]] +; 64R2-NAN: sub.d $f0, $[[T1]], $[[T0]] + +; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13 +; 64R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14 +; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]] +; 64R6-DAG: sub.d $f0, $[[T2]], $[[T1]] + %mul = fmul double %a, %b %sub = fsub double %mul, %c %sub1 = fsub double 0.000000e+00, %sub diff --git a/test/CodeGen/Mips/fp-indexed-ls.ll b/test/CodeGen/Mips/fp-indexed-ls.ll index d8c37e7d90fc..787e131f6ec5 100644 --- a/test/CodeGen/Mips/fp-indexed-ls.ll +++ b/test/CodeGen/Mips/fp-indexed-ls.ll @@ -1,6 +1,13 @@ -; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s \ -; RUN: | FileCheck %s -check-prefix=CHECK-NACL +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s 
-check-prefix=ALL -check-prefix=MIPS32R1 +; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R2 +; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R6 +; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4 +; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4 +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4 +; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64R6 + +; Check that [ls][dwu]xc1 are not emitted for nacl. +; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=CHECK-NACL %struct.S = type <{ [4 x float] }> %struct.S2 = type <{ [4 x double] }> @@ -14,8 +21,30 @@ define float @foo0(float* nocapture %b, i32 %o) nounwind readonly { entry: -; CHECK: lwxc1 +; ALL-LABEL: foo0: + +; MIPS32R1: sll $[[T1:[0-9]+]], $5, 2 +; MIPS32R1: addu $[[T3:[0-9]+]], $4, $[[T1]] +; MIPS32R1: lwc1 $f0, 0($[[T3]]) + +; MIPS32R2: sll $[[T1:[0-9]+]], $5, 2 +; MIPS32R2: lwxc1 $f0, $[[T1]]($4) + +; MIPS32R6: sll $[[T1:[0-9]+]], $5, 2 +; MIPS32R6: addu $[[T3:[0-9]+]], $4, $[[T1]] +; MIPS32R6: lwc1 $f0, 0($[[T3]]) + +; MIPS4: sll $[[T0:[0-9]+]], $5, 0 +; MIPS4: dsll $[[T1:[0-9]+]], $[[T0]], 2 +; MIPS4: lwxc1 $f0, $[[T1]]($4) + +; MIPS64R6: sll $[[T0:[0-9]+]], $5, 0 +; MIPS64R6: dsll $[[T1:[0-9]+]], $[[T0]], 2 +; MIPS64R6: daddu $[[T3:[0-9]+]], $4, $[[T1]] +; MIPS64R6: lwc1 $f0, 0($[[T3]]) + ; CHECK-NACL-NOT: lwxc1 + %arrayidx = getelementptr inbounds float* %b, i32 %o %0 = load float* %arrayidx, align 4 ret float %0 @@ -23,8 +52,30 @@ entry: define double @foo1(double* nocapture %b, i32 %o) nounwind readonly { entry: -; CHECK: ldxc1 +; ALL-LABEL: foo1: + +; MIPS32R1: sll $[[T1:[0-9]+]], $5, 3 +; MIPS32R1: addu $[[T3:[0-9]+]], $4, $[[T1]] +; MIPS32R1: ldc1 $f0, 0($[[T3]]) + +; MIPS32R2: sll $[[T1:[0-9]+]], $5, 3 +; MIPS32R2: ldxc1 $f0, $[[T1]]($4) + +; MIPS32R6: sll $[[T1:[0-9]+]], $5, 3 +; MIPS32R6: addu $[[T3:[0-9]+]], $4, $[[T1]] +; MIPS32R6: ldc1 $f0, 0($[[T3]]) + +; MIPS4: sll $[[T0:[0-9]+]], $5, 0 +; MIPS4: dsll $[[T1:[0-9]+]], $[[T0]], 3 +; MIPS4: ldxc1 $f0, $[[T1]]($4) + +; MIPS64R6: sll $[[T0:[0-9]+]], $5, 0 +; MIPS64R6: dsll $[[T1:[0-9]+]], $[[T0]], 3 +; MIPS64R6: daddu $[[T3:[0-9]+]], $4, $[[T1]] +; MIPS64R6: ldc1 $f0, 0($[[T3]]) + ; CHECK-NACL-NOT: ldxc1 + %arrayidx = getelementptr inbounds double* %b, i32 %o %0 = load double* %arrayidx, align 8 ret double %0 @@ -32,7 +83,23 @@ entry: define float @foo2(i32 %b, i32 %c) nounwind readonly { entry: -; CHECK-NOT: luxc1 +; ALL-LABEL: foo2: + +; luxc1 did not exist in MIPS32r1 +; MIPS32R1-NOT: luxc1 + +; luxc1 is a misnomer since it aligns the given pointer downwards and performs +; an aligned load. We mustn't use it to handle unaligned loads. 
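+; (In other words, for a pointer that is not doubleword-aligned, luxc1 would
+; read the preceding aligned doubleword instead of the bytes actually
+; requested.)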
+; MIPS32R2-NOT: luxc1 + +; luxc1 was removed in MIPS32r6 +; MIPS32R6-NOT: luxc1 + +; MIPS4-NOT: luxc1 + +; luxc1 was removed in MIPS64r6 +; MIPS64R6-NOT: luxc1 + %arrayidx1 = getelementptr inbounds [4 x %struct.S]* @s, i32 0, i32 %b, i32 0, i32 %c %0 = load float* %arrayidx1, align 1 ret float %0 @@ -40,8 +107,28 @@ entry: define void @foo3(float* nocapture %b, i32 %o) nounwind { entry: -; CHECK: swxc1 +; ALL-LABEL: foo3: + +; MIPS32R1-DAG: lwc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS32R1-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}} +; MIPS32R1-DAG: swc1 $[[T0]], 0($[[T1]]) + +; MIPS32R2: lwc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS32R2: swxc1 $[[T0]], ${{[0-9]+}}($4) + +; MIPS32R6-DAG: lwc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS32R6-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}} +; MIPS32R6-DAG: swc1 $[[T0]], 0($[[T1]]) + +; MIPS4: lwc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS4: swxc1 $[[T0]], ${{[0-9]+}}($4) + +; MIPS64R6-DAG: lwc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS64R6-DAG: daddu $[[T1:[0-9]+]], $4, ${{[0-9]+}} +; MIPS64R6-DAG: swc1 $[[T0]], 0($[[T1]]) + ; CHECK-NACL-NOT: swxc1 + %0 = load float* @gf, align 4 %arrayidx = getelementptr inbounds float* %b, i32 %o store float %0, float* %arrayidx, align 4 @@ -50,8 +137,28 @@ entry: define void @foo4(double* nocapture %b, i32 %o) nounwind { entry: -; CHECK: sdxc1 +; ALL-LABEL: foo4: + +; MIPS32R1-DAG: ldc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS32R1-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}} +; MIPS32R1-DAG: sdc1 $[[T0]], 0($[[T1]]) + +; MIPS32R2: ldc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS32R2: sdxc1 $[[T0]], ${{[0-9]+}}($4) + +; MIPS32R6-DAG: ldc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS32R6-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}} +; MIPS32R6-DAG: sdc1 $[[T0]], 0($[[T1]]) + +; MIPS4: ldc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS4: sdxc1 $[[T0]], ${{[0-9]+}}($4) + +; MIPS64R6-DAG: ldc1 $[[T0:f0]], 0(${{[0-9]+}}) +; MIPS64R6-DAG: daddu $[[T1:[0-9]+]], $4, ${{[0-9]+}} +; MIPS64R6-DAG: sdc1 $[[T0]], 0($[[T1]]) + ; CHECK-NACL-NOT: sdxc1 + %0 = load double* @gd, align 8 %arrayidx = getelementptr inbounds double* %b, i32 %o store double %0, double* %arrayidx, align 8 @@ -60,7 +167,18 @@ entry: define void @foo5(i32 %b, i32 %c) nounwind { entry: -; CHECK-NOT: suxc1 +; ALL-LABEL: foo5: + +; MIPS32R1-NOT: suxc1 + +; MIPS32R2-NOT: suxc1 + +; MIPS32R6-NOT: suxc1 + +; MIPS4-NOT: suxc1 + +; MIPS64R6-NOT: suxc1 + %0 = load float* @gf, align 4 %arrayidx1 = getelementptr inbounds [4 x %struct.S]* @s, i32 0, i32 %b, i32 0, i32 %c store float %0, float* %arrayidx1, align 1 @@ -69,8 +187,18 @@ entry: define double @foo6(i32 %b, i32 %c) nounwind readonly { entry: -; CHECK: foo6 -; CHECK-NOT: luxc1 +; ALL-LABEL: foo6: + +; MIPS32R1-NOT: luxc1 + +; MIPS32R2-NOT: luxc1 + +; MIPS32R6-NOT: luxc1 + +; MIPS4-NOT: luxc1 + +; MIPS64R6-NOT: luxc1 + %arrayidx1 = getelementptr inbounds [4 x %struct.S2]* @s2, i32 0, i32 %b, i32 0, i32 %c %0 = load double* %arrayidx1, align 1 ret double %0 @@ -78,8 +206,18 @@ entry: define void @foo7(i32 %b, i32 %c) nounwind { entry: -; CHECK: foo7 -; CHECK-NOT: suxc1 +; ALL-LABEL: foo7: + +; MIPS32R1-NOT: suxc1 + +; MIPS32R2-NOT: suxc1 + +; MIPS32R6-NOT: suxc1 + +; MIPS4-NOT: suxc1 + +; MIPS64R6-NOT: suxc1 + %0 = load double* @gd, align 8 %arrayidx1 = getelementptr inbounds [4 x %struct.S2]* @s2, i32 0, i32 %b, i32 0, i32 %c store double %0, double* %arrayidx1, align 1 @@ -88,16 +226,36 @@ entry: define float @foo8() nounwind readonly { entry: -; CHECK: foo8 -; CHECK-NOT: luxc1 +; ALL-LABEL: foo8: + +; MIPS32R1-NOT: luxc1 + +; MIPS32R2-NOT: luxc1 + +; MIPS32R6-NOT: luxc1 + +; 
MIPS4-NOT: luxc1 + +; MIPS64R6-NOT: luxc1 + %0 = load float* getelementptr inbounds (%struct.S3* @s3, i32 0, i32 1), align 1 ret float %0 } define void @foo9(float %f) nounwind { entry: -; CHECK: foo9 -; CHECK-NOT: suxc1 +; ALL-LABEL: foo9: + +; MIPS32R1-NOT: suxc1 + +; MIPS32R2-NOT: suxc1 + +; MIPS32R6-NOT: suxc1 + +; MIPS4-NOT: suxc1 + +; MIPS64R6-NOT: suxc1 + store float %f, float* getelementptr inbounds (%struct.S3* @s3, i32 0, i32 1), align 1 ret void } diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll index a136557cc4a3..311b83015a56 100644 --- a/test/CodeGen/Mips/fpbr.ll +++ b/test/CodeGen/Mips/fpbr.ll @@ -1,9 +1,25 @@ -; RUN: llc < %s -march=mipsel | FileCheck %s +; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=32-FCC +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=32-FCC +; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=32-GPR +; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=64-FCC +; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=64-FCC +; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=64-GPR define void @func0(float %f2, float %f3) nounwind { entry: -; CHECK: c.eq.s -; CHECK: bc1f +; ALL-LABEL: func0: + +; 32-FCC: c.eq.s $f12, $f14 +; 64-FCC: c.eq.s $f12, $f13 +; FCC: bc1f $BB0_2 + +; 32-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f14 +; 64-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f13 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] +; FIXME: We ought to be able to transform not+bnez -> beqz +; GPR: not $[[GPRCC]], $[[GPRCC]] +; GPR: bnez $[[GPRCC]], $BB0_2 + %cmp = fcmp oeq float %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -25,8 +41,18 @@ declare void @g1(...) 
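+; As the func0 checks above show, the FCC configurations (pre-r6 FPUs) compare
+; into $fcc0 and branch with bc1f/bc1t, while the GPR configurations (r6)
+; compare into an FPR with cmp.*.s/cmp.*.d, move the result to a GPR with mfc1,
+; and branch with beqz/bnez.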
define void @func1(float %f2, float %f3) nounwind { entry: -; CHECK: c.olt.s -; CHECK: bc1f +; ALL-LABEL: func1: + +; 32-FCC: c.olt.s $f12, $f14 +; 64-FCC: c.olt.s $f12, $f13 +; FCC: bc1f $BB1_2 + +; 32-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f14, $f12 +; 64-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f13, $f12 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] +; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] +; GPR: bnez $[[GPRCC]], $BB1_2 + %cmp = fcmp olt float %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -44,8 +70,18 @@ if.end: ; preds = %if.else, %if.then define void @func2(float %f2, float %f3) nounwind { entry: -; CHECK: c.ole.s -; CHECK: bc1t +; ALL-LABEL: func2: + +; 32-FCC: c.ole.s $f12, $f14 +; 64-FCC: c.ole.s $f12, $f13 +; FCC: bc1t $BB2_2 + +; 32-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f14, $f12 +; 64-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f13, $f12 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] +; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] +; GPR: beqz $[[GPRCC]], $BB2_2 + %cmp = fcmp ugt float %f2, %f3 br i1 %cmp, label %if.else, label %if.then @@ -63,8 +99,19 @@ if.end: ; preds = %if.else, %if.then define void @func3(double %f2, double %f3) nounwind { entry: -; CHECK: c.eq.d -; CHECK: bc1f +; ALL-LABEL: func3: + +; 32-FCC: c.eq.d $f12, $f14 +; 64-FCC: c.eq.d $f12, $f13 +; FCC: bc1f $BB3_2 + +; 32-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f14 +; 64-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f13 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] +; FIXME: We ought to be able to transform not+bnez -> beqz +; GPR: not $[[GPRCC]], $[[GPRCC]] +; GPR: bnez $[[GPRCC]], $BB3_2 + %cmp = fcmp oeq double %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -82,8 +129,18 @@ if.end: ; preds = %if.else, %if.then define void @func4(double %f2, double %f3) nounwind { entry: -; CHECK: c.olt.d -; CHECK: bc1f +; ALL-LABEL: func4: + +; 32-FCC: c.olt.d $f12, $f14 +; 64-FCC: c.olt.d $f12, $f13 +; FCC: bc1f $BB4_2 + +; 32-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f14, $f12 +; 64-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f13, $f12 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] +; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] +; GPR: bnez $[[GPRCC]], $BB4_2 + %cmp = fcmp olt double %f2, %f3 br i1 %cmp, label %if.then, label %if.else @@ -101,8 +158,18 @@ if.end: ; preds = %if.else, %if.then define void @func5(double %f2, double %f3) nounwind { entry: -; CHECK: c.ole.d -; CHECK: bc1t +; ALL-LABEL: func5: + +; 32-FCC: c.ole.d $f12, $f14 +; 64-FCC: c.ole.d $f12, $f13 +; FCC: bc1t $BB5_2 + +; 32-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f14, $f12 +; 64-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f13, $f12 +; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]] +; GPR-NOT: not $[[GPRCC]], $[[GPRCC]] +; GPR: beqz $[[GPRCC]], $BB5_2 + %cmp = fcmp ugt double %f2, %f3 br i1 %cmp, label %if.else, label %if.then diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll index f9e53cbb07a4..c09108dc0744 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll @@ -9,7 +9,7 @@ define i32 @main() nounwind { entry: ;CHECK-ERRORS: error: invalid operand for inline asm constraint 'I' - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i32 7, i32 1048576) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,I"(i32 7, i32 1048576) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll index 1fdf672fe197..2b24b0f82c57 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll +++ 
b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll @@ -10,7 +10,7 @@ entry: ;CHECK-ERRORS: error: invalid operand for inline asm constraint 'J' - tail call i32 asm "addi $0,$1,$2", "=r,r,J"(i32 1024, i32 3) nounwind + tail call i32 asm "addiu $0,$1,$2", "=r,r,J"(i32 1024, i32 3) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll index 49dcc8745857..5edb3e24674e 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll @@ -10,7 +10,7 @@ entry: ;CHECK-ERRORS: error: invalid operand for inline asm constraint 'L' - tail call i32 asm "addi $0,$1,$2", "=r,r,L"(i32 7, i32 1048579) nounwind + tail call i32 asm "addiu $0,$1,$2", "=r,r,L"(i32 7, i32 1048579) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll index 770669d913e8..eaa540acdafa 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll @@ -11,7 +11,7 @@ entry: ;CHECK-ERRORS: error: invalid operand for inline asm constraint 'N' - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,N"(i32 7, i32 3) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,N"(i32 7, i32 3) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll index cd4431ac5265..56afbaaa9cd6 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll @@ -11,6 +11,6 @@ entry: ;CHECK-ERRORS: error: invalid operand for inline asm constraint 'O' - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,O"(i32 undef, i32 16384) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,O"(i32 undef, i32 16384) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll index 0a4739ebb96b..0a55cb55e5f2 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll @@ -11,6 +11,6 @@ entry: ;CHECK-ERRORS: error: invalid operand for inline asm constraint 'P' - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,P"(i32 undef, i32 655536) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,P"(i32 undef, i32 655536) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll index 94ded307fda9..a67ddce222ae 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll @@ -1,33 +1,34 @@ ; Positive test for inline register constraints ; -; RUN: llc -march=mipsel < %s | FileCheck %s +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s +; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s define i32 @main() nounwind { entry: ; r with char ;CHECK: #APP -;CHECK: addi ${{[0-9]+}},${{[0-9]+}},23 +;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},23 ;CHECK: #NO_APP - tail call i8 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i8 27, i8 23) nounwind + tail call i8 asm sideeffect "addiu $0,$1,$2", "=r,r,n"(i8 27, i8 23) nounwind ; r with short ;CHECK: #APP -;CHECK: addi ${{[0-9]+}},${{[0-9]+}},13 +;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},13 ;CHECK: #NO_APP - tail call i16 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i16 17, i16 13) nounwind + tail call i16 asm sideeffect "addiu $0,$1,$2", "=r,r,n"(i16 17, i16 13) nounwind ; r with int ;CHECK: #APP -;CHECK: addi ${{[0-9]+}},${{[0-9]+}},3 +;CHECK: addiu 
${{[0-9]+}},${{[0-9]+}},3 ;CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,n"(i32 7, i32 3) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,n"(i32 7, i32 3) nounwind ; Now c with 1024: make sure register $25 is picked ; CHECK: #APP -; CHECK: addi $25,${{[0-9]+}},1024 +; CHECK: addiu $25,${{[0-9]+}},1024 ; CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2", "=c,c,I"(i32 4194304, i32 1024) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=c,c,I"(i32 4194304, i32 1024) nounwind ; Now l with 1024: make sure register lo is picked. We do this by checking the instruction ; after the inline expression for a mflo to pull the value out of lo. diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll index 787066602575..a7ba762b1064 100644 --- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll +++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll @@ -12,9 +12,9 @@ entry: ; r with long long ;CHECK: #APP -;CHECK: addi ${{[0-9]+}},${{[0-9]+}},3 +;CHECK: addiu ${{[0-9]+}},${{[0-9]+}},3 ;CHECK: #NO_APP - tail call i64 asm sideeffect "addi $0,$1,$2", "=r,r,i"(i64 7, i64 3) nounwind + tail call i64 asm sideeffect "addiu $0,$1,$2", "=r,r,i"(i64 7, i64 3) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm-operand-code.ll b/test/CodeGen/Mips/inlineasm-operand-code.ll index 7bb4adc31bd8..6512851a11be 100644 --- a/test/CodeGen/Mips/inlineasm-operand-code.ll +++ b/test/CodeGen/Mips/inlineasm-operand-code.ll @@ -12,9 +12,9 @@ define i32 @constraint_X() nounwind { entry: ;CHECK_LITTLE_32-LABEL: constraint_X: ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},0xfffffffffffffffd +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},0xfffffffffffffffd ;CHECK_LITTLE_32: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,${2:X}", "=r,r,I"(i32 7, i32 -3) ; + tail call i32 asm sideeffect "addiu $0,$1,${2:X}", "=r,r,I"(i32 7, i32 -3) ; ret i32 0 } @@ -23,9 +23,9 @@ define i32 @constraint_x() nounwind { entry: ;CHECK_LITTLE_32-LABEL: constraint_x: ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},0xfffd +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},0xfffd ;CHECK_LITTLE_32: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,${2:x}", "=r,r,I"(i32 7, i32 -3) ; + tail call i32 asm sideeffect "addiu $0,$1,${2:x}", "=r,r,I"(i32 7, i32 -3) ; ret i32 0 } @@ -34,9 +34,9 @@ define i32 @constraint_d() nounwind { entry: ;CHECK_LITTLE_32-LABEL: constraint_d: ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-3 +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},-3 ;CHECK_LITTLE_32: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,${2:d}", "=r,r,I"(i32 7, i32 -3) ; + tail call i32 asm sideeffect "addiu $0,$1,${2:d}", "=r,r,I"(i32 7, i32 -3) ; ret i32 0 } @@ -45,9 +45,9 @@ define i32 @constraint_m() nounwind { entry: ;CHECK_LITTLE_32-LABEL: constraint_m: ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-4 +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},-4 ;CHECK_LITTLE_32: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,${2:m}", "=r,r,I"(i32 7, i32 -3) ; + tail call i32 asm sideeffect "addiu $0,$1,${2:m}", "=r,r,I"(i32 7, i32 -3) ; ret i32 0 } @@ -56,15 +56,15 @@ define i32 @constraint_z() nounwind { entry: ;CHECK_LITTLE_32-LABEL: constraint_z: ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},-3 +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},-3 ;CHECK_LITTLE_32: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,${2:z}", 
"=r,r,I"(i32 7, i32 -3) ; + tail call i32 asm sideeffect "addiu $0,$1,${2:z}", "=r,r,I"(i32 7, i32 -3) ; ; z with 0 ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},$0 +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},$0 ;CHECK_LITTLE_32: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind + tail call i32 asm sideeffect "addiu $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind ret i32 0 } @@ -73,9 +73,9 @@ define i32 @constraint_longlong() nounwind { entry: ;CHECK_LITTLE_32-LABEL: constraint_longlong: ;CHECK_LITTLE_32: #APP -;CHECK_LITTLE_32: addi ${{[0-9]+}},${{[0-9]+}},3 +;CHECK_LITTLE_32: addiu ${{[0-9]+}},${{[0-9]+}},3 ;CHECK_LITTLE_32: #NO_APP - tail call i64 asm sideeffect "addi $0,$1,$2 \0A\09", "=r,r,X"(i64 1229801703532086340, i64 3) nounwind + tail call i64 asm sideeffect "addiu $0,$1,$2 \0A\09", "=r,r,X"(i64 1229801703532086340, i64 3) nounwind ret i32 0 } diff --git a/test/CodeGen/Mips/inlineasm_constraint.ll b/test/CodeGen/Mips/inlineasm_constraint.ll index 8d30f45d84e3..76b73dc276ae 100644 --- a/test/CodeGen/Mips/inlineasm_constraint.ll +++ b/test/CodeGen/Mips/inlineasm_constraint.ll @@ -5,21 +5,21 @@ entry: ; First I with short ; CHECK: #APP -; CHECK: addi ${{[0-9]+}},${{[0-9]+}},4096 +; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},4096 ; CHECK: #NO_APP - tail call i16 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i16 7, i16 4096) nounwind + tail call i16 asm sideeffect "addiu $0,$1,$2", "=r,r,I"(i16 7, i16 4096) nounwind ; Then I with int ; CHECK: #APP -; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3 +; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},-3 ; CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,I"(i32 7, i32 -3) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,I"(i32 7, i32 -3) nounwind ; Now J with 0 ; CHECK: #APP -; CHECK: addi ${{[0-9]+}},${{[0-9]+}},0 +; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},0 ; CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2\0A\09 ", "=r,r,J"(i32 7, i16 0) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2\0A\09 ", "=r,r,J"(i32 7, i16 0) nounwind ; Now K with 64 ; CHECK: #APP @@ -35,29 +35,29 @@ entry: ; Now N with -3 ; CHECK: #APP -; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3 +; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},-3 ; CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,N"(i32 7, i32 -3) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,N"(i32 7, i32 -3) nounwind ; Now O with -3 ; CHECK: #APP -; CHECK: addi ${{[0-9]+}},${{[0-9]+}},-3 +; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},-3 ; CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,O"(i32 7, i16 -3) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,O"(i32 7, i16 -3) nounwind ; Now P with 65535 ; CHECK: #APP -; CHECK: addi ${{[0-9]+}},${{[0-9]+}},65535 +; CHECK: addiu ${{[0-9]+}},${{[0-9]+}},65535 ; CHECK: #NO_APP - tail call i32 asm sideeffect "addi $0,$1,$2", "=r,r,P"(i32 7, i32 65535) nounwind + tail call i32 asm sideeffect "addiu $0,$1,$2", "=r,r,P"(i32 7, i32 65535) nounwind ; Now R Which takes the address of c %c = alloca i32, align 4 store i32 -4469539, i32* %c, align 4 - %8 = call i32 asm sideeffect "lwl $0, 1 + $1\0A\09lwr $0, 2 + $1\0A\09", "=r,*R"(i32* %c) #1 + %8 = call i32 asm sideeffect "lw $0, 1 + $1\0A\09lw $0, 2 + $1\0A\09", "=r,*R"(i32* %c) #1 ; CHECK: #APP -; CHECK: lwl ${{[0-9]+}}, 1 + 0(${{[0-9]+}}) -; CHECK: lwr ${{[0-9]+}}, 2 + 0(${{[0-9]+}}) +; CHECK: lw ${{[0-9]+}}, 1 + 0(${{[0-9]+}}) +; CHECK: lw ${{[0-9]+}}, 2 + 0(${{[0-9]+}}) ; CHECK: 
#NO_APP ret i32 0 diff --git a/test/CodeGen/Mips/lit.local.cfg b/test/CodeGen/Mips/lit.local.cfg index 1fa54b428cd9..a3183a25afaa 100644 --- a/test/CodeGen/Mips/lit.local.cfg +++ b/test/CodeGen/Mips/lit.local.cfg @@ -1,4 +1,3 @@ -targets = set(config.root.targets_to_build.split()) -if not 'Mips' in targets: +if not 'Mips' in config.root.targets: config.unsupported = True diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll index d0928ee26613..a3f5ebfb5460 100644 --- a/test/CodeGen/Mips/load-store-left-right.ll +++ b/test/CodeGen/Mips/load-store-left-right.ll @@ -1,29 +1,439 @@ -; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=EL %s -; RUN: llc -march=mips < %s | FileCheck -check-prefix=EB %s +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EL %s +; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s +; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EL %s +; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s +; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EL %s +; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EB %s +; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s +; RUN: llc -march=mips64 -mcpu=mips4 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s +; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s +; RUN: llc -march=mips64 -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s +; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s +; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EL %s +; RUN: llc -march=mips64 -mcpu=mips64r6 -mattr=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EB %s +%struct.SLL = type { i64 } %struct.SI = type { i32 } +%struct.SUI = type { i32 } +@sll = common global %struct.SLL zeroinitializer, align 1 @si = common global %struct.SI zeroinitializer, align 1 +@sui = common global %struct.SUI zeroinitializer, align 1 -define i32 @foo_load_i() nounwind readonly { +define i32 @load_SI() nounwind readonly { entry: -; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) -; EL: lwr $[[R0]], 0($[[R1]]) -; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) -; EB: lwr $[[R0]], 3($[[R1]]) +; ALL-LABEL: load_SI: + +; MIPS32-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS32-EL: lwr $[[R0]], 0($[[R1]]) + +; MIPS32-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS32-EB: lwr $[[R0]], 3($[[R1]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)( +; MIPS32R6: lw $2, 0($[[PTR]]) + +; MIPS64-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS64-EL: lwr $[[R0]], 0($[[R1]]) + +; MIPS64-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: lwr $[[R0]], 
3($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)( +; MIPS64R6: lw $2, 0($[[PTR]]) %0 = load i32* getelementptr inbounds (%struct.SI* @si, i32 0, i32 0), align 1 ret i32 %0 } -define void @foo_store_i(i32 %a) nounwind { +define void @store_SI(i32 %a) nounwind { entry: -; EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) -; EL: swr $[[R0]], 0($[[R1]]) -; EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) -; EB: swr $[[R0]], 3($[[R1]]) +; ALL-LABEL: store_SI: + +; MIPS32-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS32-EL: swr $[[R0]], 0($[[R1]]) + +; MIPS32-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS32-EB: swr $[[R0]], 3($[[R1]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)( +; MIPS32R6: sw $4, 0($[[PTR]]) + +; MIPS64-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS64-EL: swr $[[R0]], 0($[[R1]]) + +; MIPS64-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: swr $[[R0]], 3($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)( +; MIPS64R6: sw $4, 0($[[PTR]]) store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i32 0, i32 0), align 1 ret void } +define i64 @load_SLL() nounwind readonly { +entry: +; ALL-LABEL: load_SLL: + +; MIPS32-EL: lwl $2, 3($[[R1:[0-9]+]]) +; MIPS32-EL: lwr $2, 0($[[R1]]) +; MIPS32-EL: lwl $3, 7($[[R1:[0-9]+]]) +; MIPS32-EL: lwr $3, 4($[[R1]]) + +; MIPS32-EB: lwl $2, 0($[[R1:[0-9]+]]) +; MIPS32-EB: lwr $2, 3($[[R1]]) +; MIPS32-EB: lwl $3, 4($[[R1:[0-9]+]]) +; MIPS32-EB: lwr $3, 7($[[R1]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(sll)( +; MIPS32R6-DAG: lw $2, 0($[[PTR]]) +; MIPS32R6-DAG: lw $3, 4($[[PTR]]) + +; MIPS64-EL: ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]]) +; MIPS64-EL: ldr $[[R0]], 0($[[R1]]) + +; MIPS64-EB: ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: ldr $[[R0]], 7($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(sll)( +; MIPS64R6: ld $2, 0($[[PTR]]) + + %0 = load i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1 + ret i64 %0 +} + +define i64 @load_SI_sext_to_i64() nounwind readonly { +entry: +; ALL-LABEL: load_SI_sext_to_i64: + +; MIPS32-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS32-EL: lwr $[[R0]], 0($[[R1]]) + +; MIPS32-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS32-EB: lwr $[[R0]], 3($[[R1]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)( +; MIPS32R6-EL: lw $2, 0($[[PTR]]) +; MIPS32R6-EL: sra $3, $2, 31 +; MIPS32R6-EB: lw $3, 0($[[PTR]]) +; MIPS32R6-EB: sra $2, $3, 31 + +; MIPS64-EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS64-EL: lwr $[[R0]], 0($[[R1]]) + +; MIPS64-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: lwr $[[R0]], 3($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)( +; MIPS64R6: lw $2, 0($[[PTR]]) + + %0 = load i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1 + %conv = sext i32 %0 to i64 + ret i64 %conv +} + +define i64 @load_UI() nounwind readonly { +entry: +; ALL-LABEL: load_UI: + +; MIPS32-EL-DAG: lwl $[[R2:2]], 3($[[R1:[0-9]+]]) +; MIPS32-EL-DAG: lwr $[[R2]], 0($[[R1]]) +; MIPS32-EL-DAG: addiu $3, $zero, 0 + +; MIPS32-EB-DAG: lwl $[[R2:3]], 0($[[R1:[0-9]+]]) +; MIPS32-EB-DAG: lwr $[[R2]], 3($[[R1]]) +; MIPS32-EB-DAG: addiu $2, $zero, 0 + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(sui)( +; MIPS32R6-EL-DAG: lw $2, 0($[[PTR]]) +; MIPS32R6-EL-DAG: addiu $3, $zero, 0 +; MIPS32R6-EB-DAG: lw $3, 0($[[PTR]]) +; MIPS32R6-EB-DAG: addiu $2, $zero, 0 + +; MIPS64-EL-DAG: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS64-EL-DAG: lwr $[[R0]], 0($[[R1]]) +; MIPS64-EL-DAG: daddiu $[[R2:[0-9]+]], $zero, 1 +; MIPS64-EL-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 32 +; MIPS64-EL-DAG: 
daddiu $[[R4:[0-9]+]], $[[R3]], -1 +; MIPS64-EL-DAG: and ${{[0-9]+}}, $[[R0]], $[[R4]] + +; MIPS64-EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: lwr $[[R0]], 3($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(sui)( +; MIPS64R6: lwu $2, 0($[[PTR]]) + + %0 = load i32* getelementptr inbounds (%struct.SUI* @sui, i64 0, i32 0), align 1 + %conv = zext i32 %0 to i64 + ret i64 %conv +} + +define void @store_SLL(i64 %a) nounwind { +entry: +; ALL-LABEL: store_SLL: + +; MIPS32-EL-DAG: swl $[[A1:4]], 3($[[R1:[0-9]+]]) +; MIPS32-EL-DAG: swr $[[A1]], 0($[[R1]]) +; MIPS32-EL-DAG: swl $[[A2:5]], 7($[[R1:[0-9]+]]) +; MIPS32-EL-DAG: swr $[[A2]], 4($[[R1]]) + +; MIPS32-EB-DAG: swl $[[A1:4]], 0($[[R1:[0-9]+]]) +; MIPS32-EB-DAG: swr $[[A1]], 3($[[R1]]) +; MIPS32-EB-DAG: swl $[[A1:5]], 4($[[R1:[0-9]+]]) +; MIPS32-EB-DAG: swr $[[A1]], 7($[[R1]]) + +; MIPS32R6-DAG: lw $[[PTR:[0-9]+]], %got(sll)( +; MIPS32R6-DAG: sw $4, 0($[[PTR]]) +; MIPS32R6-DAG: sw $5, 4($[[PTR]]) + +; MIPS64-EL: sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]]) +; MIPS64-EL: sdr $[[R0]], 0($[[R1]]) + +; MIPS64-EB: sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: sdr $[[R0]], 7($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(sll)( +; MIPS64R6: sd $4, 0($[[PTR]]) + + store i64 %a, i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1 + ret void +} + +define void @store_SI_trunc_from_i64(i32 %a) nounwind { +entry: +; ALL-LABEL: store_SI_trunc_from_i64: + +; MIPS32-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS32-EL: swr $[[R0]], 0($[[R1]]) + +; MIPS32-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS32-EB: swr $[[R0]], 3($[[R1]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(si)( +; MIPS32R6: sw $4, 0($[[PTR]]) + +; MIPS64-EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]]) +; MIPS64-EL: swr $[[R0]], 0($[[R1]]) + +; MIPS64-EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]]) +; MIPS64-EB: swr $[[R0]], 3($[[R1]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(si)( +; MIPS64R6: sw $4, 0($[[PTR]]) + + store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1 + ret void +} + +; +; Structures are simply concatenations of the members. 
They are unaffected by +; endianness +; + +%struct.S0 = type { i8, i8 } +@struct_s0 = common global %struct.S0 zeroinitializer, align 1 +%struct.S1 = type { i16, i16 } +@struct_s1 = common global %struct.S1 zeroinitializer, align 1 +%struct.S2 = type { i32, i32 } +@struct_s2 = common global %struct.S2 zeroinitializer, align 1 + +define void @copy_struct_S0() nounwind { +entry: +; ALL-LABEL: copy_struct_S0: + +; MIPS32-EL: lw $[[PTR:[0-9]+]], %got(struct_s0)( +; MIPS32-EB: lw $[[PTR:[0-9]+]], %got(struct_s0)( +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(struct_s0)( +; MIPS64-EL: ld $[[PTR:[0-9]+]], %got_disp(struct_s0)( +; MIPS64-EB: ld $[[PTR:[0-9]+]], %got_disp(struct_s0)( +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(struct_s0)( + +; FIXME: We should be able to do better than this on MIPS32r6/MIPS64r6 since +; we have unaligned halfword load/store available +; ALL-DAG: lbu $[[R1:[0-9]+]], 0($[[PTR]]) +; ALL-DAG: sb $[[R1]], 2($[[PTR]]) +; ALL-DAG: lbu $[[R1:[0-9]+]], 1($[[PTR]]) +; ALL-DAG: sb $[[R1]], 3($[[PTR]]) + + %0 = load %struct.S0* getelementptr inbounds (%struct.S0* @struct_s0, i32 0), align 1 + store %struct.S0 %0, %struct.S0* getelementptr inbounds (%struct.S0* @struct_s0, i32 1), align 1 + ret void +} + +define void @copy_struct_S1() nounwind { +entry: +; ALL-LABEL: copy_struct_S1: + +; MIPS32-EL: lw $[[PTR:[0-9]+]], %got(struct_s1)( +; MIPS32-EB: lw $[[PTR:[0-9]+]], %got(struct_s1)( +; MIPS32-DAG: lbu $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS32-DAG: sb $[[R1]], 4($[[PTR]]) +; MIPS32-DAG: lbu $[[R1:[0-9]+]], 1($[[PTR]]) +; MIPS32-DAG: sb $[[R1]], 5($[[PTR]]) +; MIPS32-DAG: lbu $[[R1:[0-9]+]], 2($[[PTR]]) +; MIPS32-DAG: sb $[[R1]], 6($[[PTR]]) +; MIPS32-DAG: lbu $[[R1:[0-9]+]], 3($[[PTR]]) +; MIPS32-DAG: sb $[[R1]], 7($[[PTR]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(struct_s1)( +; MIPS32R6-DAG: lhu $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS32R6-DAG: sh $[[R1]], 4($[[PTR]]) +; MIPS32R6-DAG: lhu $[[R1:[0-9]+]], 2($[[PTR]]) +; MIPS32R6-DAG: sh $[[R1]], 6($[[PTR]]) + +; MIPS64-EL: ld $[[PTR:[0-9]+]], %got_disp(struct_s1)( +; MIPS64-EB: ld $[[PTR:[0-9]+]], %got_disp(struct_s1)( +; MIPS64-DAG: lbu $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS64-DAG: sb $[[R1]], 4($[[PTR]]) +; MIPS64-DAG: lbu $[[R1:[0-9]+]], 1($[[PTR]]) +; MIPS64-DAG: sb $[[R1]], 5($[[PTR]]) +; MIPS64-DAG: lbu $[[R1:[0-9]+]], 2($[[PTR]]) +; MIPS64-DAG: sb $[[R1]], 6($[[PTR]]) +; MIPS64-DAG: lbu $[[R1:[0-9]+]], 3($[[PTR]]) +; MIPS64-DAG: sb $[[R1]], 7($[[PTR]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(struct_s1)( +; MIPS64R6-DAG: lhu $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS64R6-DAG: sh $[[R1]], 4($[[PTR]]) +; MIPS64R6-DAG: lhu $[[R1:[0-9]+]], 2($[[PTR]]) +; MIPS64R6-DAG: sh $[[R1]], 6($[[PTR]]) + + %0 = load %struct.S1* getelementptr inbounds (%struct.S1* @struct_s1, i32 0), align 1 + store %struct.S1 %0, %struct.S1* getelementptr inbounds (%struct.S1* @struct_s1, i32 1), align 1 + ret void +} + +define void @copy_struct_S2() nounwind { +entry: +; ALL-LABEL: copy_struct_S2: + +; MIPS32-EL: lw $[[PTR:[0-9]+]], %got(struct_s2)( +; MIPS32-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]]) +; MIPS32-EL-DAG: lwr $[[R1]], 0($[[PTR]]) +; MIPS32-EL-DAG: swl $[[R1]], 11($[[PTR]]) +; MIPS32-EL-DAG: swr $[[R1]], 8($[[PTR]]) +; MIPS32-EL-DAG: lwl $[[R1:[0-9]+]], 7($[[PTR]]) +; MIPS32-EL-DAG: lwr $[[R1]], 4($[[PTR]]) +; MIPS32-EL-DAG: swl $[[R1]], 15($[[PTR]]) +; MIPS32-EL-DAG: swr $[[R1]], 12($[[PTR]]) + +; MIPS32-EB: lw $[[PTR:[0-9]+]], %got(struct_s2)( +; MIPS32-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS32-EB-DAG: lwr $[[R1]], 3($[[PTR]]) +; MIPS32-EB-DAG: swl 
$[[R1]], 8($[[PTR]]) +; MIPS32-EB-DAG: swr $[[R1]], 11($[[PTR]]) +; MIPS32-EB-DAG: lwl $[[R1:[0-9]+]], 4($[[PTR]]) +; MIPS32-EB-DAG: lwr $[[R1]], 7($[[PTR]]) +; MIPS32-EB-DAG: swl $[[R1]], 12($[[PTR]]) +; MIPS32-EB-DAG: swr $[[R1]], 15($[[PTR]]) + +; MIPS32R6: lw $[[PTR:[0-9]+]], %got(struct_s2)( +; MIPS32R6-DAG: lw $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS32R6-DAG: sw $[[R1]], 8($[[PTR]]) +; MIPS32R6-DAG: lw $[[R1:[0-9]+]], 4($[[PTR]]) +; MIPS32R6-DAG: sw $[[R1]], 12($[[PTR]]) + +; MIPS64-EL: ld $[[PTR:[0-9]+]], %got_disp(struct_s2)( +; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]]) +; MIPS64-EL-DAG: lwr $[[R1]], 0($[[PTR]]) +; MIPS64-EL-DAG: swl $[[R1]], 11($[[PTR]]) +; MIPS64-EL-DAG: swr $[[R1]], 8($[[PTR]]) +; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 7($[[PTR]]) +; MIPS64-EL-DAG: lwr $[[R1]], 4($[[PTR]]) +; MIPS64-EL-DAG: swl $[[R1]], 15($[[PTR]]) +; MIPS64-EL-DAG: swr $[[R1]], 12($[[PTR]]) + +; MIPS64-EB: ld $[[PTR:[0-9]+]], %got_disp(struct_s2)( +; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS64-EB-DAG: lwr $[[R1]], 3($[[PTR]]) +; MIPS64-EB-DAG: swl $[[R1]], 8($[[PTR]]) +; MIPS64-EB-DAG: swr $[[R1]], 11($[[PTR]]) +; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 4($[[PTR]]) +; MIPS64-EB-DAG: lwr $[[R1]], 7($[[PTR]]) +; MIPS64-EB-DAG: swl $[[R1]], 12($[[PTR]]) +; MIPS64-EB-DAG: swr $[[R1]], 15($[[PTR]]) + +; MIPS64R6: ld $[[PTR:[0-9]+]], %got_disp(struct_s2)( +; MIPS64R6-DAG: lw $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS64R6-DAG: sw $[[R1]], 8($[[PTR]]) +; MIPS64R6-DAG: lw $[[R1:[0-9]+]], 4($[[PTR]]) +; MIPS64R6-DAG: sw $[[R1]], 12($[[PTR]]) + + %0 = load %struct.S2* getelementptr inbounds (%struct.S2* @struct_s2, i32 0), align 1 + store %struct.S2 %0, %struct.S2* getelementptr inbounds (%struct.S2* @struct_s2, i32 1), align 1 + ret void +} + +; +; Arrays are simply concatenations of the members. 
They are unaffected by +; endianness +; + +@arr = common global [7 x i8] zeroinitializer, align 1 + +define void @pass_array_byval() nounwind { +entry: +; ALL-LABEL: pass_array_byval: + +; MIPS32-EL: lw $[[SPTR:[0-9]+]], %got(arr)( +; MIPS32-EL-DAG: lwl $[[R1:4]], 3($[[PTR]]) +; MIPS32-EL-DAG: lwr $[[R1]], 0($[[PTR]]) +; MIPS32-EL-DAG: lbu $[[R2:[0-9]+]], 4($[[PTR]]) +; MIPS32-EL-DAG: lbu $[[R3:[0-9]+]], 5($[[PTR]]) +; MIPS32-EL-DAG: sll $[[T0:[0-9]+]], $[[R3]], 8 +; MIPS32-EL-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[R2]] +; MIPS32-EL-DAG: lbu $[[R4:[0-9]+]], 6($[[PTR]]) +; MIPS32-EL-DAG: sll $[[T2:[0-9]+]], $[[R4]], 16 +; MIPS32-EL-DAG: or $5, $[[T1]], $[[T2]] + +; MIPS32-EB: lw $[[SPTR:[0-9]+]], %got(arr)( +; MIPS32-EB-DAG: lwl $[[R1:4]], 0($[[PTR]]) +; MIPS32-EB-DAG: lwr $[[R1]], 3($[[PTR]]) +; MIPS32-EB-DAG: lbu $[[R2:[0-9]+]], 5($[[PTR]]) +; MIPS32-EB-DAG: lbu $[[R3:[0-9]+]], 4($[[PTR]]) +; MIPS32-EB-DAG: sll $[[T0:[0-9]+]], $[[R3]], 8 +; MIPS32-EB-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[R2]] +; MIPS32-EB-DAG: sll $[[T1]], $[[T1]], 16 +; MIPS32-EB-DAG: lbu $[[R4:[0-9]+]], 6($[[PTR]]) +; MIPS32-EB-DAG: sll $[[T2:[0-9]+]], $[[R4]], 8 +; MIPS32-EB-DAG: or $5, $[[T1]], $[[T2]] + +; MIPS32R6: lw $[[SPTR:[0-9]+]], %got(arr)( +; MIPS32R6-DAG: lw $4, 0($[[PTR]]) +; MIPS32R6-EL-DAG: lhu $[[R2:[0-9]+]], 4($[[PTR]]) +; MIPS32R6-EL-DAG: lbu $[[R3:[0-9]+]], 6($[[PTR]]) +; MIPS32R6-EL-DAG: sll $[[T0:[0-9]+]], $[[R3]], 16 +; MIPS32R6-EL-DAG: or $5, $[[R2]], $[[T0]] + +; MIPS32R6-EB-DAG: lhu $[[R2:[0-9]+]], 4($[[PTR]]) +; MIPS32R6-EB-DAG: lbu $[[R3:[0-9]+]], 6($[[PTR]]) +; MIPS32R6-EB-DAG: sll $[[T0:[0-9]+]], $[[R2]], 16 +; MIPS32R6-EB-DAG: or $5, $[[T0]], $[[R3]] + +; MIPS64-EL: ld $[[SPTR:[0-9]+]], %got_disp(arr)( +; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]]) +; MIPS64-EL-DAG: lwr $[[R1]], 0($[[PTR]]) + +; MIPS64-EB: ld $[[SPTR:[0-9]+]], %got_disp(arr)( +; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]]) +; MIPS64-EB-DAG: lwr $[[R1]], 3($[[PTR]]) +; MIPS64-EB-DAG: dsll $[[R1]], $[[R1]], 32 +; MIPS64-EB-DAG: lbu $[[R2:[0-9]+]], 5($[[PTR]]) +; MIPS64-EB-DAG: lbu $[[R3:[0-9]+]], 4($[[PTR]]) +; MIPS64-EB-DAG: dsll $[[T0:[0-9]+]], $[[R3]], 8 +; MIPS64-EB-DAG: or $[[T1:[0-9]+]], $[[T0]], $[[R2]] +; MIPS64-EB-DAG: dsll $[[T1]], $[[T1]], 16 +; MIPS64-EB-DAG: or $[[T3:[0-9]+]], $[[R1]], $[[T1]] +; MIPS64-EB-DAG: lbu $[[R4:[0-9]+]], 6($[[PTR]]) +; MIPS64-EB-DAG: dsll $[[T4:[0-9]+]], $[[R4]], 8 +; MIPS64-EB-DAG: or $4, $[[T3]], $[[T4]] + +; MIPS64R6: ld $[[SPTR:[0-9]+]], %got_disp(arr)( + + tail call void @extern_func([7 x i8]* byval @arr) nounwind + ret void +} + +declare void @extern_func([7 x i8]* byval) diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll index f9980940643d..a403744c8fd5 100644 --- a/test/CodeGen/Mips/longbranch.ll +++ b/test/CodeGen/Mips/longbranch.ll @@ -7,6 +7,8 @@ ; RUN: < %s | FileCheck %s -check-prefix=N64 ; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=micromips \ ; RUN: -force-mips-long-branch -O3 < %s | FileCheck %s -check-prefix=MICROMIPS +; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 < %s \ +; RUN: | FileCheck %s -check-prefix=NACL @x = external global i32 @@ -80,10 +82,7 @@ end: ; Check for long branch expansion: ; N64: daddiu $sp, $sp, -16 ; N64-NEXT: sd $ra, 0($sp) -; N64-NEXT: lui $1, %highest(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]])) -; N64-NEXT: daddiu $1, $1, %higher(($[[BB2]])-($[[BB1]])) -; N64-NEXT: dsll $1, $1, 16 -; N64-NEXT: daddiu $1, $1, %hi(($[[BB2]])-($[[BB1]])) +; N64-NEXT: daddiu $1, $zero, 
%hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]])) ; N64-NEXT: dsll $1, $1, 16 ; N64-NEXT: bal $[[BB1]] ; N64-NEXT: daddiu $1, $1, %lo(($[[BB2]])-($[[BB1]])) @@ -129,4 +128,36 @@ end: ; MICROMIPS: $[[BB2]]: ; MICROMIPS: jr $ra ; MICROMIPS: nop + + +; Check the NaCl version. Check that sp change is not in the branch delay slot +; of "jr $1" instruction. Check that target of indirect branch "jr $1" is +; bundle aligned. + +; NACL: lui $[[R0:[0-9]+]], %hi(_gp_disp) +; NACL: addiu $[[R0]], $[[R0]], %lo(_gp_disp) +; NACL: bnez $4, $[[BB0:BB[0-9_]+]] +; NACL: addu $[[GP:[0-9]+]], $[[R0]], $25 + +; Check for long branch expansion: +; NACL: addiu $sp, $sp, -8 +; NACL-NEXT: sw $ra, 0($sp) +; NACL-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]])) +; NACL-NEXT: bal $[[BB1]] +; NACL-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]])) +; NACL-NEXT: $[[BB1]]: +; NACL-NEXT: addu $1, $ra, $1 +; NACL-NEXT: lw $ra, 0($sp) +; NACL-NEXT: addiu $sp, $sp, 8 +; NACL-NEXT: jr $1 +; NACL-NEXT: nop + +; NACL: $[[BB0]]: +; NACL: lw $[[R1:[0-9]+]], %got(x)($[[GP]]) +; NACL: addiu $[[R2:[0-9]+]], $zero, 1 +; NACL: sw $[[R2]], 0($[[R1]]) +; NACL: .align 4 +; NACL-NEXT: $[[BB2]]: +; NACL: jr $ra +; NACL: nop } diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll index 0dbb2c27b8f9..82229677ff11 100644 --- a/test/CodeGen/Mips/madd-msub.ll +++ b/test/CodeGen/Mips/madd-msub.ll @@ -1,9 +1,49 @@ -; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=32 -; RUN: llc -march=mips -mattr=dsp < %s | FileCheck %s -check-prefix=DSP +; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32 +; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32 +; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R6 +; RUN: llc -march=mips -mcpu=mips32 -mattr=dsp < %s | FileCheck %s -check-prefix=DSP +; RUN: llc -march=mips -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64 +; RUN: llc -march=mips -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64 +; RUN: llc -march=mips -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64R6 + +; FIXME: The MIPS16 test should check its output ; RUN: llc -march=mips -mcpu=mips16 < %s -; 32: madd ${{[0-9]+}} -; DSP: madd $ac +; ALL-LABEL: madd1: + +; 32-DAG: sra $[[T0:[0-9]+]], $6, 31 +; 32-DAG: mtlo $6 +; 32-DAG: [[m:m]]add ${{[45]}}, ${{[45]}} +; 32-DAG: [[m]]fhi $2 +; 32-DAG: [[m]]flo $3 + +; DSP-DAG: sra $[[T0:[0-9]+]], $6, 31 +; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6 +; DSP-DAG: madd $[[AC]], ${{[45]}}, ${{[45]}} +; DSP-DAG: mfhi $2, $[[AC]] +; DSP-DAG: mflo $3, $[[AC]] + +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 +; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 +; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] +; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $2, $[[T5]], $[[T4]] + +; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]] +; 64-DAG: [[m]]flo $[[T2:[0-9]+]] +; 64-DAG: sll $[[T3:[0-9]+]], $6, 0 +; 64-DAG: daddu $2, $[[T2]], $[[T3]] + +; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64R6-DAG: sll $[[T3:[0-9]+]], $6, 0 +; 64R6-DAG: daddu $2, $[[T2]], $[[T3]] + define i64 @madd1(i32 %a, i32 %b, i32 %c) nounwind readnone { entry: %conv = sext i32 %a to i64 @@ -14,8 
+54,47 @@ entry: ret i64 %add } -; 32: maddu ${{[0-9]+}} -; DSP: maddu $ac +; ALL-LABEL: madd2: + +; FIXME: We don't really need this instruction +; 32-DAG: addiu $[[T0:[0-9]+]], $zero, 0 +; 32-DAG: mtlo $6 +; 32-DAG: [[m:m]]addu ${{[45]}}, ${{[45]}} +; 32-DAG: [[m]]fhi $2 +; 32-DAG: [[m]]flo $3 + +; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0 +; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6 +; DSP-DAG: maddu $[[AC]], ${{[45]}}, ${{[45]}} +; DSP-DAG: mfhi $2, $[[AC]] +; DSP-DAG: mflo $3, $[[AC]] + +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 +; FIXME: There's a redundant move here. We should remove it +; 32R6-DAG: muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $2, $[[T3]], $[[T2]] + +; 64-DAG: dsll $[[T0:[0-9]+]], $4, 32 +; 64-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32 +; 64-DAG: dsll $[[T2:[0-9]+]], $5, 32 +; 64-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; 64-DAG: d[[m:m]]ult $[[T3]], $[[T1]] +; 64-DAG: [[m]]flo $[[T4:[0-9]+]] +; 64-DAG: dsll $[[T5:[0-9]+]], $6, 32 +; 64-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32 +; 64-DAG: daddu $2, $[[T4]], $[[T6]] + +; 64R6-DAG: dsll $[[T0:[0-9]+]], $4, 32 +; 64R6-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32 +; 64R6-DAG: dsll $[[T2:[0-9]+]], $5, 32 +; 64R6-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; 64R6-DAG: dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]] +; 64R6-DAG: dsll $[[T5:[0-9]+]], $6, 32 +; 64R6-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32 +; 64R6-DAG: daddu $2, $[[T4]], $[[T6]] + define i64 @madd2(i32 %a, i32 %b, i32 %c) nounwind readnone { entry: %conv = zext i32 %a to i64 @@ -26,8 +105,38 @@ entry: ret i64 %add } -; 32: madd ${{[0-9]+}} -; DSP: madd $ac +; ALL-LABEL: madd3: + +; 32-DAG: mthi $6 +; 32-DAG: mtlo $7 +; 32-DAG: [[m:m]]add ${{[45]}}, ${{[45]}} +; 32-DAG: [[m]]fhi $2 +; 32-DAG: [[m]]flo $3 + +; DSP-DAG: mthi $[[AC:ac[0-3]+]], $6 +; DSP-DAG: mtlo $[[AC]], $7 +; DSP-DAG: madd $[[AC]], ${{[45]}}, ${{[45]}} +; DSP-DAG: mfhi $2, $[[AC]] +; DSP-DAG: mflo $3, $[[AC]] + +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $7 +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $7 +; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $6 +; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $2, $[[T5]], $[[T4]] + +; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]] +; 64-DAG: [[m]]flo $[[T2:[0-9]+]] +; 64-DAG: daddu $2, $[[T2]], $6 + +; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64R6-DAG: daddu $2, $[[T2]], $6 + define i64 @madd3(i32 %a, i32 %b, i64 %c) nounwind readnone { entry: %conv = sext i32 %a to i64 @@ -37,8 +146,41 @@ entry: ret i64 %add } -; 32: msub ${{[0-9]+}} -; DSP: msub $ac +; ALL-LABEL: msub1: + +; 32-DAG: sra $[[T0:[0-9]+]], $6, 31 +; 32-DAG: mtlo $6 +; 32-DAG: [[m:m]]sub ${{[45]}}, ${{[45]}} +; 32-DAG: [[m]]fhi $2 +; 32-DAG: [[m]]flo $3 + +; DSP-DAG: sra $[[T0:[0-9]+]], $6, 31 +; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6 +; DSP-DAG: msub $[[AC]], ${{[45]}}, ${{[45]}} +; DSP-DAG: mfhi $2, $[[AC]] +; DSP-DAG: mflo $3, $[[AC]] + +; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T3:[0-9]+]], $6, $[[T1]] +; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $[[T0]] +; 32R6-DAG: sra $[[T5:[0-9]+]], $6, 31 +; 32R6-DAG: subu $2, $[[T5]], $[[T4]] +; 32R6-DAG: subu $3, $6, $[[T1]] + +; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64-DAG: 
sll $[[T1:[0-9]+]], $5, 0 +; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]] +; 64-DAG: [[m]]flo $[[T2:[0-9]+]] +; 64-DAG: sll $[[T3:[0-9]+]], $6, 0 +; 64-DAG: dsubu $2, $[[T3]], $[[T2]] + +; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64R6-DAG: sll $[[T3:[0-9]+]], $6, 0 +; 64R6-DAG: dsubu $2, $[[T3]], $[[T2]] + define i64 @msub1(i32 %a, i32 %b, i32 %c) nounwind readnone { entry: %conv = sext i32 %c to i64 @@ -49,8 +191,48 @@ entry: ret i64 %sub } -; 32: msubu ${{[0-9]+}} -; DSP: msubu $ac +; ALL-LABEL: msub2: + +; FIXME: We don't really need this instruction +; 32-DAG: addiu $[[T0:[0-9]+]], $zero, 0 +; 32-DAG: mtlo $6 +; 32-DAG: [[m:m]]subu ${{[45]}}, ${{[45]}} +; 32-DAG: [[m]]fhi $2 +; 32-DAG: [[m]]flo $3 + +; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0 +; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6 +; DSP-DAG: msubu $[[AC]], ${{[45]}}, ${{[45]}} +; DSP-DAG: mfhi $2, $[[AC]] +; DSP-DAG: mflo $3, $[[AC]] + +; 32R6-DAG: muhu $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} + +; 32R6-DAG: sltu $[[T2:[0-9]+]], $6, $[[T1]] +; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] +; 32R6-DAG: negu $2, $[[T3]] +; 32R6-DAG: subu $3, $6, $[[T1]] + +; 64-DAG: dsll $[[T0:[0-9]+]], $4, 32 +; 64-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32 +; 64-DAG: dsll $[[T2:[0-9]+]], $5, 32 +; 64-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; 64-DAG: d[[m:m]]ult $[[T3]], $[[T1]] +; 64-DAG: [[m]]flo $[[T4:[0-9]+]] +; 64-DAG: dsll $[[T5:[0-9]+]], $6, 32 +; 64-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32 +; 64-DAG: dsubu $2, $[[T6]], $[[T4]] + +; 64R6-DAG: dsll $[[T0:[0-9]+]], $4, 32 +; 64R6-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32 +; 64R6-DAG: dsll $[[T2:[0-9]+]], $5, 32 +; 64R6-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; 64R6-DAG: dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]] +; 64R6-DAG: dsll $[[T5:[0-9]+]], $6, 32 +; 64R6-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32 +; 64R6-DAG: dsubu $2, $[[T6]], $[[T4]] + define i64 @msub2(i32 %a, i32 %b, i32 %c) nounwind readnone { entry: %conv = zext i32 %c to i64 @@ -61,8 +243,39 @@ entry: ret i64 %sub } -; 32: msub ${{[0-9]+}} -; DSP: msub $ac +; ALL-LABEL: msub3: + +; FIXME: We don't really need this instruction +; 32-DAG: mthi $6 +; 32-DAG: mtlo $7 +; 32-DAG: [[m:m]]sub ${{[45]}}, ${{[45]}} +; 32-DAG: [[m]]fhi $2 +; 32-DAG: [[m]]flo $3 + +; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0 +; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6 +; DSP-DAG: msub $[[AC]], ${{[45]}}, ${{[45]}} +; DSP-DAG: mfhi $2, $[[AC]] +; DSP-DAG: mflo $3, $[[AC]] + +; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T2:[0-9]+]], $7, $[[T1]] +; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] +; 32R6-DAG: subu $2, $6, $[[T3]] +; 32R6-DAG: subu $3, $7, $[[T1]] + +; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]] +; 64-DAG: [[m]]flo $[[T2:[0-9]+]] +; 64-DAG: dsubu $2, $6, $[[T2]] + +; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0 +; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0 +; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 64R6-DAG: dsubu $2, $6, $[[T2]] + define i64 @msub3(i32 %a, i32 %b, i64 %c) nounwind readnone { entry: %conv = sext i32 %a to i64 diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll index 4d590b641f4c..d523f253bf39 100644 --- a/test/CodeGen/Mips/mips64-f128.ll +++ b/test/CodeGen/Mips/mips64-f128.ll @@ -1,7 +1,11 @@ ; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips4 -soft-float -O1 \ 
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s +; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT ; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64 -soft-float -O1 \ -; RUN: -disable-mips-delay-filler < %s | FileCheck %s +; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT +; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r2 -soft-float -O1 \ +; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT +; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r6 -soft-float -O1 \ +; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=CMP_CC_FMT @gld0 = external global fp128 @gld1 = external global fp128 @@ -9,8 +13,8 @@ @gf1 = external global float @gd1 = external global double -; CHECK-LABEL: addLD: -; CHECK: ld $25, %call16(__addtf3) +; ALL-LABEL: addLD: +; ALL: ld $25, %call16(__addtf3) define fp128 @addLD() { entry: @@ -20,8 +24,8 @@ entry: ret fp128 %add } -; CHECK-LABEL: subLD: -; CHECK: ld $25, %call16(__subtf3) +; ALL-LABEL: subLD: +; ALL: ld $25, %call16(__subtf3) define fp128 @subLD() { entry: @@ -31,8 +35,8 @@ entry: ret fp128 %sub } -; CHECK-LABEL: mulLD: -; CHECK: ld $25, %call16(__multf3) +; ALL-LABEL: mulLD: +; ALL: ld $25, %call16(__multf3) define fp128 @mulLD() { entry: @@ -42,8 +46,8 @@ entry: ret fp128 %mul } -; CHECK-LABEL: divLD: -; CHECK: ld $25, %call16(__divtf3) +; ALL-LABEL: divLD: +; ALL: ld $25, %call16(__divtf3) define fp128 @divLD() { entry: @@ -53,8 +57,8 @@ entry: ret fp128 %div } -; CHECK-LABEL: conv_LD_char: -; CHECK: ld $25, %call16(__floatsitf) +; ALL-LABEL: conv_LD_char: +; ALL: ld $25, %call16(__floatsitf) define fp128 @conv_LD_char(i8 signext %a) { entry: @@ -62,8 +66,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_short: -; CHECK: ld $25, %call16(__floatsitf) +; ALL-LABEL: conv_LD_short: +; ALL: ld $25, %call16(__floatsitf) define fp128 @conv_LD_short(i16 signext %a) { entry: @@ -71,8 +75,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_int: -; CHECK: ld $25, %call16(__floatsitf) +; ALL-LABEL: conv_LD_int: +; ALL: ld $25, %call16(__floatsitf) define fp128 @conv_LD_int(i32 %a) { entry: @@ -80,8 +84,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_LL: -; CHECK: ld $25, %call16(__floatditf) +; ALL-LABEL: conv_LD_LL: +; ALL: ld $25, %call16(__floatditf) define fp128 @conv_LD_LL(i64 %a) { entry: @@ -89,8 +93,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_UChar: -; CHECK: ld $25, %call16(__floatunsitf) +; ALL-LABEL: conv_LD_UChar: +; ALL: ld $25, %call16(__floatunsitf) define fp128 @conv_LD_UChar(i8 zeroext %a) { entry: @@ -98,8 +102,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_UShort: -; CHECK: ld $25, %call16(__floatunsitf) +; ALL-LABEL: conv_LD_UShort: +; ALL: ld $25, %call16(__floatunsitf) define fp128 @conv_LD_UShort(i16 zeroext %a) { entry: @@ -107,8 +111,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_UInt: -; CHECK: ld $25, %call16(__floatunsitf) +; ALL-LABEL: conv_LD_UInt: +; ALL: ld $25, %call16(__floatunsitf) define fp128 @conv_LD_UInt(i32 %a) { entry: @@ -116,8 +120,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_ULL: -; CHECK: ld $25, %call16(__floatunditf) +; ALL-LABEL: conv_LD_ULL: +; ALL: ld $25, %call16(__floatunditf) define fp128 @conv_LD_ULL(i64 %a) { entry: @@ -125,8 +129,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_char_LD: -; CHECK: ld $25, %call16(__fixtfsi) +; ALL-LABEL: conv_char_LD: +; ALL: ld $25, 
%call16(__fixtfsi) define signext i8 @conv_char_LD(fp128 %a) { entry: @@ -134,8 +138,8 @@ entry: ret i8 %conv } -; CHECK-LABEL: conv_short_LD: -; CHECK: ld $25, %call16(__fixtfsi) +; ALL-LABEL: conv_short_LD: +; ALL: ld $25, %call16(__fixtfsi) define signext i16 @conv_short_LD(fp128 %a) { entry: @@ -143,8 +147,8 @@ entry: ret i16 %conv } -; CHECK-LABEL: conv_int_LD: -; CHECK: ld $25, %call16(__fixtfsi) +; ALL-LABEL: conv_int_LD: +; ALL: ld $25, %call16(__fixtfsi) define i32 @conv_int_LD(fp128 %a) { entry: @@ -152,8 +156,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: conv_LL_LD: -; CHECK: ld $25, %call16(__fixtfdi) +; ALL-LABEL: conv_LL_LD: +; ALL: ld $25, %call16(__fixtfdi) define i64 @conv_LL_LD(fp128 %a) { entry: @@ -161,8 +165,8 @@ entry: ret i64 %conv } -; CHECK-LABEL: conv_UChar_LD: -; CHECK: ld $25, %call16(__fixtfsi) +; ALL-LABEL: conv_UChar_LD: +; ALL: ld $25, %call16(__fixtfsi) define zeroext i8 @conv_UChar_LD(fp128 %a) { entry: @@ -170,8 +174,8 @@ entry: ret i8 %conv } -; CHECK-LABEL: conv_UShort_LD: -; CHECK: ld $25, %call16(__fixtfsi) +; ALL-LABEL: conv_UShort_LD: +; ALL: ld $25, %call16(__fixtfsi) define zeroext i16 @conv_UShort_LD(fp128 %a) { entry: @@ -179,8 +183,8 @@ entry: ret i16 %conv } -; CHECK-LABEL: conv_UInt_LD: -; CHECK: ld $25, %call16(__fixunstfsi) +; ALL-LABEL: conv_UInt_LD: +; ALL: ld $25, %call16(__fixunstfsi) define i32 @conv_UInt_LD(fp128 %a) { entry: @@ -188,8 +192,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: conv_ULL_LD: -; CHECK: ld $25, %call16(__fixunstfdi) +; ALL-LABEL: conv_ULL_LD: +; ALL: ld $25, %call16(__fixunstfdi) define i64 @conv_ULL_LD(fp128 %a) { entry: @@ -197,8 +201,8 @@ entry: ret i64 %conv } -; CHECK-LABEL: conv_LD_float: -; CHECK: ld $25, %call16(__extendsftf2) +; ALL-LABEL: conv_LD_float: +; ALL: ld $25, %call16(__extendsftf2) define fp128 @conv_LD_float(float %a) { entry: @@ -206,8 +210,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_LD_double: -; CHECK: ld $25, %call16(__extenddftf2) +; ALL-LABEL: conv_LD_double: +; ALL: ld $25, %call16(__extenddftf2) define fp128 @conv_LD_double(double %a) { entry: @@ -215,8 +219,8 @@ entry: ret fp128 %conv } -; CHECK-LABEL: conv_float_LD: -; CHECK: ld $25, %call16(__trunctfsf2) +; ALL-LABEL: conv_float_LD: +; ALL: ld $25, %call16(__trunctfsf2) define float @conv_float_LD(fp128 %a) { entry: @@ -224,8 +228,8 @@ entry: ret float %conv } -; CHECK-LABEL: conv_double_LD: -; CHECK: ld $25, %call16(__trunctfdf2) +; ALL-LABEL: conv_double_LD: +; ALL: ld $25, %call16(__trunctfdf2) define double @conv_double_LD(fp128 %a) { entry: @@ -233,13 +237,13 @@ entry: ret double %conv } -; CHECK-LABEL: libcall1_fabsl: -; CHECK-DAG: ld $[[R0:[0-9]+]], 8($[[R4:[0-9]+]]) -; CHECK-DAG: daddiu $[[R1:[0-9]+]], $zero, 1 -; CHECK-DAG: dsll $[[R2:[0-9]+]], $[[R1]], 63 -; CHECK-DAG: daddiu $[[R3:[0-9]+]], $[[R2]], -1 -; CHECK-DAG: and $4, $[[R0]], $[[R3]] -; CHECK-DAG: ld $2, 0($[[R4]]) +; ALL-LABEL: libcall1_fabsl: +; ALL-DAG: ld $[[R0:[0-9]+]], 8($[[R4:[0-9]+]]) +; ALL-DAG: daddiu $[[R1:[0-9]+]], $zero, 1 +; ALL-DAG: dsll $[[R2:[0-9]+]], $[[R1]], 63 +; ALL-DAG: daddiu $[[R3:[0-9]+]], $[[R2]], -1 +; ALL-DAG: and $4, $[[R0]], $[[R3]] +; ALL-DAG: ld $2, 0($[[R4]]) define fp128 @libcall1_fabsl() { entry: @@ -250,8 +254,8 @@ entry: declare fp128 @fabsl(fp128) #1 -; CHECK-LABEL: libcall1_ceill: -; CHECK: ld $25, %call16(ceill) +; ALL-LABEL: libcall1_ceill: +; ALL: ld $25, %call16(ceill) define fp128 @libcall1_ceill() { entry: @@ -262,8 +266,8 @@ entry: declare fp128 @ceill(fp128) #1 -; CHECK-LABEL: libcall1_sinl: -; CHECK: ld $25, 
%call16(sinl) +; ALL-LABEL: libcall1_sinl: +; ALL: ld $25, %call16(sinl) define fp128 @libcall1_sinl() { entry: @@ -274,8 +278,8 @@ entry: declare fp128 @sinl(fp128) #2 -; CHECK-LABEL: libcall1_cosl: -; CHECK: ld $25, %call16(cosl) +; ALL-LABEL: libcall1_cosl: +; ALL: ld $25, %call16(cosl) define fp128 @libcall1_cosl() { entry: @@ -286,8 +290,8 @@ entry: declare fp128 @cosl(fp128) #2 -; CHECK-LABEL: libcall1_expl: -; CHECK: ld $25, %call16(expl) +; ALL-LABEL: libcall1_expl: +; ALL: ld $25, %call16(expl) define fp128 @libcall1_expl() { entry: @@ -298,8 +302,8 @@ entry: declare fp128 @expl(fp128) #2 -; CHECK-LABEL: libcall1_exp2l: -; CHECK: ld $25, %call16(exp2l) +; ALL-LABEL: libcall1_exp2l: +; ALL: ld $25, %call16(exp2l) define fp128 @libcall1_exp2l() { entry: @@ -310,8 +314,8 @@ entry: declare fp128 @exp2l(fp128) #2 -; CHECK-LABEL: libcall1_logl: -; CHECK: ld $25, %call16(logl) +; ALL-LABEL: libcall1_logl: +; ALL: ld $25, %call16(logl) define fp128 @libcall1_logl() { entry: @@ -322,8 +326,8 @@ entry: declare fp128 @logl(fp128) #2 -; CHECK-LABEL: libcall1_log2l: -; CHECK: ld $25, %call16(log2l) +; ALL-LABEL: libcall1_log2l: +; ALL: ld $25, %call16(log2l) define fp128 @libcall1_log2l() { entry: @@ -334,8 +338,8 @@ entry: declare fp128 @log2l(fp128) #2 -; CHECK-LABEL: libcall1_log10l: -; CHECK: ld $25, %call16(log10l) +; ALL-LABEL: libcall1_log10l: +; ALL: ld $25, %call16(log10l) define fp128 @libcall1_log10l() { entry: @@ -346,8 +350,8 @@ entry: declare fp128 @log10l(fp128) #2 -; CHECK-LABEL: libcall1_nearbyintl: -; CHECK: ld $25, %call16(nearbyintl) +; ALL-LABEL: libcall1_nearbyintl: +; ALL: ld $25, %call16(nearbyintl) define fp128 @libcall1_nearbyintl() { entry: @@ -358,8 +362,8 @@ entry: declare fp128 @nearbyintl(fp128) #1 -; CHECK-LABEL: libcall1_floorl: -; CHECK: ld $25, %call16(floorl) +; ALL-LABEL: libcall1_floorl: +; ALL: ld $25, %call16(floorl) define fp128 @libcall1_floorl() { entry: @@ -370,8 +374,8 @@ entry: declare fp128 @floorl(fp128) #1 -; CHECK-LABEL: libcall1_sqrtl: -; CHECK: ld $25, %call16(sqrtl) +; ALL-LABEL: libcall1_sqrtl: +; ALL: ld $25, %call16(sqrtl) define fp128 @libcall1_sqrtl() { entry: @@ -382,8 +386,8 @@ entry: declare fp128 @sqrtl(fp128) #2 -; CHECK-LABEL: libcall1_rintl: -; CHECK: ld $25, %call16(rintl) +; ALL-LABEL: libcall1_rintl: +; ALL: ld $25, %call16(rintl) define fp128 @libcall1_rintl() { entry: @@ -394,8 +398,8 @@ entry: declare fp128 @rintl(fp128) #1 -; CHECK-LABEL: libcall_powil: -; CHECK: ld $25, %call16(__powitf2) +; ALL-LABEL: libcall_powil: +; ALL: ld $25, %call16(__powitf2) define fp128 @libcall_powil(fp128 %a, i32 %b) { entry: @@ -405,18 +409,18 @@ entry: declare fp128 @llvm.powi.f128(fp128, i32) #3 -; CHECK-LABEL: libcall2_copysignl: -; CHECK-DAG: daddiu $[[R2:[0-9]+]], $zero, 1 -; CHECK-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 63 -; CHECK-DAG: ld $[[R0:[0-9]+]], %got_disp(gld1) -; CHECK-DAG: ld $[[R1:[0-9]+]], 8($[[R0]]) -; CHECK-DAG: and $[[R4:[0-9]+]], $[[R1]], $[[R3]] -; CHECK-DAG: ld $[[R5:[0-9]+]], %got_disp(gld0) -; CHECK-DAG: ld $[[R6:[0-9]+]], 8($[[R5]]) -; CHECK-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1 -; CHECK-DAG: and $[[R8:[0-9]+]], $[[R6]], $[[R7]] -; CHECK-DAG: or $4, $[[R8]], $[[R4]] -; CHECK-DAG: ld $2, 0($[[R5]]) +; ALL-LABEL: libcall2_copysignl: +; ALL-DAG: daddiu $[[R2:[0-9]+]], $zero, 1 +; ALL-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 63 +; ALL-DAG: ld $[[R0:[0-9]+]], %got_disp(gld1) +; ALL-DAG: ld $[[R1:[0-9]+]], 8($[[R0]]) +; ALL-DAG: and $[[R4:[0-9]+]], $[[R1]], $[[R3]] +; ALL-DAG: ld $[[R5:[0-9]+]], %got_disp(gld0) +; ALL-DAG: ld 
$[[R6:[0-9]+]], 8($[[R5]]) +; ALL-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1 +; ALL-DAG: and $[[R8:[0-9]+]], $[[R6]], $[[R7]] +; ALL-DAG: or $4, $[[R8]], $[[R4]] +; ALL-DAG: ld $2, 0($[[R5]]) define fp128 @libcall2_copysignl() { entry: @@ -428,8 +432,8 @@ entry: declare fp128 @copysignl(fp128, fp128) #1 -; CHECK-LABEL: libcall2_powl: -; CHECK: ld $25, %call16(powl) +; ALL-LABEL: libcall2_powl: +; ALL: ld $25, %call16(powl) define fp128 @libcall2_powl() { entry: @@ -441,8 +445,8 @@ entry: declare fp128 @powl(fp128, fp128) #2 -; CHECK-LABEL: libcall2_fmodl: -; CHECK: ld $25, %call16(fmodl) +; ALL-LABEL: libcall2_fmodl: +; ALL: ld $25, %call16(fmodl) define fp128 @libcall2_fmodl() { entry: @@ -454,8 +458,8 @@ entry: declare fp128 @fmodl(fp128, fp128) #2 -; CHECK-LABEL: libcall3_fmal: -; CHECK: ld $25, %call16(fmal) +; ALL-LABEL: libcall3_fmal: +; ALL: ld $25, %call16(fmal) define fp128 @libcall3_fmal() { entry: @@ -468,8 +472,8 @@ entry: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) #4 -; CHECK-LABEL: cmp_lt: -; CHECK: ld $25, %call16(__lttf2) +; ALL-LABEL: cmp_lt: +; ALL: ld $25, %call16(__lttf2) define i32 @cmp_lt(fp128 %a, fp128 %b) { entry: @@ -478,8 +482,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: cmp_le: -; CHECK: ld $25, %call16(__letf2) +; ALL-LABEL: cmp_le: +; ALL: ld $25, %call16(__letf2) define i32 @cmp_le(fp128 %a, fp128 %b) { entry: @@ -488,8 +492,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: cmp_gt: -; CHECK: ld $25, %call16(__gttf2) +; ALL-LABEL: cmp_gt: +; ALL: ld $25, %call16(__gttf2) define i32 @cmp_gt(fp128 %a, fp128 %b) { entry: @@ -498,8 +502,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: cmp_ge: -; CHECK: ld $25, %call16(__getf2) +; ALL-LABEL: cmp_ge: +; ALL: ld $25, %call16(__getf2) define i32 @cmp_ge(fp128 %a, fp128 %b) { entry: @@ -508,8 +512,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: cmp_eq: -; CHECK: ld $25, %call16(__eqtf2) +; ALL-LABEL: cmp_eq: +; ALL: ld $25, %call16(__eqtf2) define i32 @cmp_eq(fp128 %a, fp128 %b) { entry: @@ -518,8 +522,8 @@ entry: ret i32 %conv } -; CHECK-LABEL: cmp_ne: -; CHECK: ld $25, %call16(__netf2) +; ALL-LABEL: cmp_ne: +; ALL: ld $25, %call16(__netf2) define i32 @cmp_ne(fp128 %a, fp128 %b) { entry: @@ -528,10 +532,10 @@ entry: ret i32 %conv } -; CHECK-LABEL: load_LD_LD: -; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1) -; CHECK: ld $2, 0($[[R0]]) -; CHECK: ld $4, 8($[[R0]]) +; ALL-LABEL: load_LD_LD: +; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1) +; ALL: ld $2, 0($[[R0]]) +; ALL: ld $4, 8($[[R0]]) define fp128 @load_LD_LD() { entry: @@ -539,11 +543,11 @@ entry: ret fp128 %0 } -; CHECK-LABEL: load_LD_float: -; CHECK: ld $[[R0:[0-9]+]], %got_disp(gf1) -; CHECK: lw $4, 0($[[R0]]) -; CHECK: ld $25, %call16(__extendsftf2) -; CHECK: jalr $25 +; ALL-LABEL: load_LD_float: +; ALL: ld $[[R0:[0-9]+]], %got_disp(gf1) +; ALL: lw $4, 0($[[R0]]) +; ALL: ld $25, %call16(__extendsftf2) +; ALL: jalr $25 define fp128 @load_LD_float() { entry: @@ -552,11 +556,11 @@ entry: ret fp128 %conv } -; CHECK-LABEL: load_LD_double: -; CHECK: ld $[[R0:[0-9]+]], %got_disp(gd1) -; CHECK: ld $4, 0($[[R0]]) -; CHECK: ld $25, %call16(__extenddftf2) -; CHECK: jalr $25 +; ALL-LABEL: load_LD_double: +; ALL: ld $[[R0:[0-9]+]], %got_disp(gd1) +; ALL: ld $4, 0($[[R0]]) +; ALL: ld $25, %call16(__extenddftf2) +; ALL: jalr $25 define fp128 @load_LD_double() { entry: @@ -565,13 +569,13 @@ entry: ret fp128 %conv } -; CHECK-LABEL: store_LD_LD: -; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1) -; CHECK: ld $[[R1:[0-9]+]], 0($[[R0]]) -; CHECK: ld $[[R2:[0-9]+]], 8($[[R0]]) -; CHECK: ld $[[R3:[0-9]+]], 
%got_disp(gld0) -; CHECK: sd $[[R2]], 8($[[R3]]) -; CHECK: sd $[[R1]], 0($[[R3]]) +; ALL-LABEL: store_LD_LD: +; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1) +; ALL: ld $[[R1:[0-9]+]], 0($[[R0]]) +; ALL: ld $[[R2:[0-9]+]], 8($[[R0]]) +; ALL: ld $[[R3:[0-9]+]], %got_disp(gld0) +; ALL: sd $[[R2]], 8($[[R3]]) +; ALL: sd $[[R1]], 0($[[R3]]) define void @store_LD_LD() { entry: @@ -580,14 +584,14 @@ entry: ret void } -; CHECK-LABEL: store_LD_float: -; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1) -; CHECK: ld $4, 0($[[R0]]) -; CHECK: ld $5, 8($[[R0]]) -; CHECK: ld $25, %call16(__trunctfsf2) -; CHECK: jalr $25 -; CHECK: ld $[[R1:[0-9]+]], %got_disp(gf1) -; CHECK: sw $2, 0($[[R1]]) +; ALL-LABEL: store_LD_float: +; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1) +; ALL: ld $4, 0($[[R0]]) +; ALL: ld $5, 8($[[R0]]) +; ALL: ld $25, %call16(__trunctfsf2) +; ALL: jalr $25 +; ALL: ld $[[R1:[0-9]+]], %got_disp(gf1) +; ALL: sw $2, 0($[[R1]]) define void @store_LD_float() { entry: @@ -597,14 +601,14 @@ entry: ret void } -; CHECK-LABEL: store_LD_double: -; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1) -; CHECK: ld $4, 0($[[R0]]) -; CHECK: ld $5, 8($[[R0]]) -; CHECK: ld $25, %call16(__trunctfdf2) -; CHECK: jalr $25 -; CHECK: ld $[[R1:[0-9]+]], %got_disp(gd1) -; CHECK: sd $2, 0($[[R1]]) +; ALL-LABEL: store_LD_double: +; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1) +; ALL: ld $4, 0($[[R0]]) +; ALL: ld $5, 8($[[R0]]) +; ALL: ld $25, %call16(__trunctfdf2) +; ALL: jalr $25 +; ALL: ld $[[R1:[0-9]+]], %got_disp(gd1) +; ALL: sd $2, 0($[[R1]]) define void @store_LD_double() { entry: @@ -614,11 +618,22 @@ entry: ret void } -; CHECK-LABEL: select_LD: -; CHECK: movn $8, $6, $4 -; CHECK: movn $9, $7, $4 -; CHECK: move $2, $8 -; CHECK: move $4, $9 +; ALL-LABEL: select_LD: +; C_CC_FMT: movn $8, $6, $4 +; C_CC_FMT: movn $9, $7, $4 +; C_CC_FMT: move $2, $8 +; C_CC_FMT: move $4, $9 + +; FIXME: This sll works around an implementation detail in the code generator +; (setcc's result is i32 so bits 32-63 are undefined). It's not really +; needed. 
+; CMP_CC_FMT-DAG: sll $[[CC:[0-9]+]], $4, 0 +; CMP_CC_FMT-DAG: selnez $[[EQ1:[0-9]+]], $8, $[[CC]] +; CMP_CC_FMT-DAG: seleqz $[[NE1:[0-9]+]], $6, $[[CC]] +; CMP_CC_FMT-DAG: or $2, $[[EQ1]], $[[NE1]] +; CMP_CC_FMT-DAG: selnez $[[EQ2:[0-9]+]], $9, $[[CC]] +; CMP_CC_FMT-DAG: seleqz $[[NE2:[0-9]+]], $7, $[[CC]] +; CMP_CC_FMT-DAG: or $4, $[[EQ2]], $[[NE2]] define fp128 @select_LD(i32 %a, i64, fp128 %b, fp128 %c) { entry: @@ -627,18 +642,27 @@ entry: ret fp128 %cond } -; CHECK-LABEL: selectCC_LD: -; CHECK: move $[[R0:[0-9]+]], $11 -; CHECK: move $[[R1:[0-9]+]], $10 -; CHECK: move $[[R2:[0-9]+]], $9 -; CHECK: move $[[R3:[0-9]+]], $8 -; CHECK: ld $25, %call16(__gttf2)($gp) -; CHECK: jalr $25 -; CHECK: slti $1, $2, 1 -; CHECK: movz $[[R1]], $[[R3]], $1 -; CHECK: movz $[[R0]], $[[R2]], $1 -; CHECK: move $2, $[[R1]] -; CHECK: move $4, $[[R0]] +; ALL-LABEL: selectCC_LD: +; ALL: move $[[R0:[0-9]+]], $11 +; ALL: move $[[R1:[0-9]+]], $10 +; ALL: move $[[R2:[0-9]+]], $9 +; ALL: move $[[R3:[0-9]+]], $8 +; ALL: ld $25, %call16(__gttf2)($gp) +; ALL: jalr $25 + +; C_CC_FMT: slti $[[CC:[0-9]+]], $2, 1 +; C_CC_FMT: movz $[[R1]], $[[R3]], $[[CC]] +; C_CC_FMT: movz $[[R0]], $[[R2]], $[[CC]] +; C_CC_FMT: move $2, $[[R1]] +; C_CC_FMT: move $4, $[[R0]] + +; CMP_CC_FMT: slt $[[CC:[0-9]+]], $zero, $2 +; CMP_CC_FMT: seleqz $[[EQ1:[0-9]+]], $[[R1]], $[[CC]] +; CMP_CC_FMT: selnez $[[NE1:[0-9]+]], $[[R3]], $[[CC]] +; CMP_CC_FMT: or $2, $[[NE1]], $[[EQ1]] +; CMP_CC_FMT: seleqz $[[EQ2:[0-9]+]], $[[R0]], $[[CC]] +; CMP_CC_FMT: selnez $[[NE2:[0-9]+]], $[[R2]], $[[CC]] +; CMP_CC_FMT: or $4, $[[NE2]], $[[EQ2]] define fp128 @selectCC_LD(fp128 %a, fp128 %b, fp128 %c, fp128 %d) { entry: diff --git a/test/CodeGen/Mips/mips64-fp-indexed-ls.ll b/test/CodeGen/Mips/mips64-fp-indexed-ls.ll deleted file mode 100644 index bbdc05cd2d8f..000000000000 --- a/test/CodeGen/Mips/mips64-fp-indexed-ls.ll +++ /dev/null @@ -1,110 +0,0 @@ -; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s - -%struct.S = type <{ [4 x float] }> -%struct.S2 = type <{ [4 x double] }> -%struct.S3 = type <{ i8, float }> - -@s = external global [4 x %struct.S] -@gf = external global float -@gd = external global double -@s2 = external global [4 x %struct.S2] -@s3 = external global %struct.S3 - -define float @foo0(float* nocapture %b, i32 %o) nounwind readonly { -entry: -; CHECK: lwxc1 - %idxprom = zext i32 %o to i64 - %arrayidx = getelementptr inbounds float* %b, i64 %idxprom - %0 = load float* %arrayidx, align 4 - ret float %0 -} - -define double @foo1(double* nocapture %b, i32 %o) nounwind readonly { -entry: -; CHECK: ldxc1 - %idxprom = zext i32 %o to i64 - %arrayidx = getelementptr inbounds double* %b, i64 %idxprom - %0 = load double* %arrayidx, align 8 - ret double %0 -} - -define float @foo2(i32 %b, i32 %c) nounwind readonly { -entry: -; CHECK-NOT: luxc1 - %idxprom = zext i32 %c to i64 - %idxprom1 = zext i32 %b to i64 - %arrayidx2 = getelementptr inbounds [4 x %struct.S]* @s, i64 0, i64 %idxprom1, i32 0, i64 %idxprom - %0 = load float* %arrayidx2, align 1 - ret float %0 -} - -define void @foo3(float* nocapture %b, i32 %o) nounwind { -entry: -; CHECK: swxc1 - %0 = load float* @gf, align 4 - %idxprom = zext i32 %o to i64 - %arrayidx = getelementptr inbounds float* %b, i64 %idxprom - store float %0, float* %arrayidx, align 4 - ret void -} - -define void @foo4(double* nocapture %b, i32 %o) nounwind { -entry: -; CHECK: sdxc1 - %0 = load double* @gd, align 8 - %idxprom = zext i32 %o to i64 - %arrayidx = getelementptr inbounds double* %b, i64 %idxprom - 
store double %0, double* %arrayidx, align 8 - ret void -} - -define void @foo5(i32 %b, i32 %c) nounwind { -entry: -; CHECK-NOT: suxc1 - %0 = load float* @gf, align 4 - %idxprom = zext i32 %c to i64 - %idxprom1 = zext i32 %b to i64 - %arrayidx2 = getelementptr inbounds [4 x %struct.S]* @s, i64 0, i64 %idxprom1, i32 0, i64 %idxprom - store float %0, float* %arrayidx2, align 1 - ret void -} - -define double @foo6(i32 %b, i32 %c) nounwind readonly { -entry: -; CHECK: foo6 -; CHECK-NOT: luxc1 - %idxprom = zext i32 %c to i64 - %idxprom1 = zext i32 %b to i64 - %arrayidx2 = getelementptr inbounds [4 x %struct.S2]* @s2, i64 0, i64 %idxprom1, i32 0, i64 %idxprom - %0 = load double* %arrayidx2, align 1 - ret double %0 -} - -define void @foo7(i32 %b, i32 %c) nounwind { -entry: -; CHECK: foo7 -; CHECK-NOT: suxc1 - %0 = load double* @gd, align 8 - %idxprom = zext i32 %c to i64 - %idxprom1 = zext i32 %b to i64 - %arrayidx2 = getelementptr inbounds [4 x %struct.S2]* @s2, i64 0, i64 %idxprom1, i32 0, i64 %idxprom - store double %0, double* %arrayidx2, align 1 - ret void -} - -define float @foo8() nounwind readonly { -entry: -; CHECK: foo8 -; CHECK-NOT: luxc1 - %0 = load float* getelementptr inbounds (%struct.S3* @s3, i64 0, i32 1), align 1 - ret float %0 -} - -define void @foo9(float %f) nounwind { -entry: -; CHECK: foo9 -; CHECK-NOT: suxc1 - store float %f, float* getelementptr inbounds (%struct.S3* @s3, i64 0, i32 1), align 1 - ret void -} - diff --git a/test/CodeGen/Mips/mips64countleading.ll b/test/CodeGen/Mips/mips64countleading.ll deleted file mode 100644 index 252f323fe1a7..000000000000 --- a/test/CodeGen/Mips/mips64countleading.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS4 %s -; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS64 %s - -define i64 @t1(i64 %X) nounwind readnone { -entry: -; CHECK-LABEL: t1: -; MIPS4-NOT: dclz -; MIPS64: dclz - %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true) - ret i64 %tmp1 -} - -declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone - -define i64 @t3(i64 %X) nounwind readnone { -entry: -; CHECK-LABEL: t3: -; MIPS4-NOT: dclo -; MIPS64: dclo - %neg = xor i64 %X, -1 - %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true) - ret i64 %tmp1 -} - diff --git a/test/CodeGen/Mips/mips64instrs.ll b/test/CodeGen/Mips/mips64instrs.ll index 58f11f155dea..ed617be6532e 100644 --- a/test/CodeGen/Mips/mips64instrs.ll +++ b/test/CodeGen/Mips/mips64instrs.ll @@ -1,99 +1,128 @@ -; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS4 %s -; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS64 %s +; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS4 -check-prefix=ACCMULDIV %s +; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=ACCMULDIV %s +; RUN: llc -march=mips64el -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=ACCMULDIV %s +; RUN: llc -march=mips64el -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=GPRMULDIV %s @gll0 = common global i64 0, align 8 @gll1 = common global i64 0, align 8 define i64 @f0(i64 %a0, i64 %a1) nounwind readnone { entry: -; CHECK: daddu +; 
ALL-LABEL: f0:
+; ALL: daddu $2, ${{[45]}}, ${{[45]}}
 %add = add nsw i64 %a1, %a0
 ret i64 %add
 }

 define i64 @f1(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: dsubu
+; ALL-LABEL: f1:
+; ALL: dsubu $2, $4, $5
 %sub = sub nsw i64 %a0, %a1
 ret i64 %sub
 }

 define i64 @f4(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: and
+; ALL-LABEL: f4:
+; ALL: and $2, ${{[45]}}, ${{[45]}}
 %and = and i64 %a1, %a0
 ret i64 %and
 }

 define i64 @f5(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: or
+; ALL-LABEL: f5:
+; ALL: or $2, ${{[45]}}, ${{[45]}}
 %or = or i64 %a1, %a0
 ret i64 %or
 }

 define i64 @f6(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: xor
+; ALL-LABEL: f6:
+; ALL: xor $2, ${{[45]}}, ${{[45]}}
 %xor = xor i64 %a1, %a0
 ret i64 %xor
 }

 define i64 @f7(i64 %a0) nounwind readnone {
 entry:
-; CHECK: daddiu ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f7:
+; ALL: daddiu $2, $4, 20
 %add = add nsw i64 %a0, 20
 ret i64 %add
 }

 define i64 @f8(i64 %a0) nounwind readnone {
 entry:
-; CHECK: daddiu ${{[0-9]+}}, ${{[0-9]+}}, -20
+; ALL-LABEL: f8:
+; ALL: daddiu $2, $4, -20
 %sub = add nsw i64 %a0, -20
 ret i64 %sub
 }

 define i64 @f9(i64 %a0) nounwind readnone {
 entry:
-; CHECK: andi ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f9:
+; ALL: andi $2, $4, 20
 %and = and i64 %a0, 20
 ret i64 %and
 }

 define i64 @f10(i64 %a0) nounwind readnone {
 entry:
-; CHECK: ori ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f10:
+; ALL: ori $2, $4, 20
 %or = or i64 %a0, 20
 ret i64 %or
 }

 define i64 @f11(i64 %a0) nounwind readnone {
 entry:
-; CHECK: xori ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f11:
+; ALL: xori $2, $4, 20
 %xor = xor i64 %a0, 20
 ret i64 %xor
 }

 define i64 @f12(i64 %a, i64 %b) nounwind readnone {
 entry:
-; CHECK: mult
+; ALL-LABEL: f12:
+
+; ACCMULDIV: mult ${{[45]}}, ${{[45]}}
+; GPRMULDIV: dmul $2, ${{[45]}}, ${{[45]}}
+
 %mul = mul nsw i64 %b, %a
 ret i64 %mul
 }

 define i64 @f13(i64 %a, i64 %b) nounwind readnone {
 entry:
-; CHECK: mult
+; ALL-LABEL: f13:
+
+; ACCMULDIV: mult ${{[45]}}, ${{[45]}}
+; GPRMULDIV: dmul $2, ${{[45]}}, ${{[45]}}
+
 %mul = mul i64 %b, %a
 ret i64 %mul
 }

 define i64 @f14(i64 %a, i64 %b) nounwind readnone {
 entry:
-; CHECK-LABEL: f14:
-; CHECK: ddiv $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mflo
+; ALL-LABEL: f14:
+; ALL-DAG: ld $[[P0:[0-9]+]], %got_disp(gll0)(
+; ALL-DAG: ld $[[P1:[0-9]+]], %got_disp(gll1)(
+; ALL-DAG: ld $[[T0:[0-9]+]], 0($[[P0]])
+; ALL-DAG: ld $[[T1:[0-9]+]], 0($[[P1]])
+
+; ACCMULDIV: ddiv $zero, $[[T0]], $[[T1]]
+; ACCMULDIV: teq $[[T1]], $zero, 7
+; ACCMULDIV: mflo $2
+
+; GPRMULDIV: ddiv $2, $[[T0]], $[[T1]]
+; GPRMULDIV: teq $[[T1]], $zero, 7
+
 %0 = load i64* @gll0, align 8
 %1 = load i64* @gll1, align 8
 %div = sdiv i64 %0, %1
@@ -102,10 +131,19 @@ entry:
 define i64 @f15() nounwind readnone {
 entry:
-; CHECK-LABEL: f15:
-; CHECK: ddivu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mflo
+; ALL-LABEL: f15:
+; ALL-DAG: ld $[[P0:[0-9]+]], %got_disp(gll0)(
+; ALL-DAG: ld $[[P1:[0-9]+]], %got_disp(gll1)(
+; ALL-DAG: ld $[[T0:[0-9]+]], 0($[[P0]])
+; ALL-DAG: ld $[[T1:[0-9]+]], 0($[[P1]])
+
+; ACCMULDIV: ddivu $zero, $[[T0]], $[[T1]]
+; ACCMULDIV: teq $[[T1]], $zero, 7
+; ACCMULDIV: mflo $2
+
+; GPRMULDIV: ddivu $2, $[[T0]], $[[T1]]
+; GPRMULDIV: teq $[[T1]], $zero, 7
+
 %0 = load i64* @gll0, align 8
 %1 = load i64* @gll1, align 8
 %div = udiv i64 %0, %1
@@ -114,20 +152,30 @@ entry:
 define i64 @f16(i64 %a, i64 %b) nounwind readnone {
 entry:
-; CHECK-LABEL: f16:
-; CHECK: ddiv $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mfhi
+; ALL-LABEL: f16:
+
+; ACCMULDIV: ddiv $zero, $4, $5
+; ACCMULDIV: teq $5, $zero, 7
+; ACCMULDIV: mfhi $2
+
+; GPRMULDIV: dmod $2, $4, $5
+; GPRMULDIV: teq $5, $zero, 7
+
 %rem = srem i64 %a, %b
 ret i64 %rem
 }

 define i64 @f17(i64 %a, i64 %b) nounwind readnone {
 entry:
-; CHECK-LABEL: f17:
-; CHECK: ddivu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mfhi
+; ALL-LABEL: f17:
+
+; ACCMULDIV: ddivu $zero, $4, $5
+; ACCMULDIV: teq $5, $zero, 7
+; ACCMULDIV: mfhi $2
+
+; GPRMULDIV: dmodu $2, $4, $5
+; GPRMULDIV: teq $5, $zero, 7
+
 %rem = urem i64 %a, %b
 ret i64 %rem
 }
@@ -136,24 +184,26 @@ declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
 define i64 @f18(i64 %X) nounwind readnone {
 entry:
-; CHECK-LABEL: f18:
+; ALL-LABEL: f18:
 ; The MIPS4 version is too long to reasonably test. At least check we don't get dclz
-; MIPS4-NOT: dclz
+; MIPS4-NOT: dclz
+
+; HAS-DCLO: dclz $2, $4
-; MIPS64: dclz $2, $4
 %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
 ret i64 %tmp1
 }

 define i64 @f19(i64 %X) nounwind readnone {
 entry:
-; CHECK-LABEL: f19:
+; ALL-LABEL: f19:
 ; The MIPS4 version is too long to reasonably test. At least check we don't get dclo
-; MIPS4-NOT: dclo
+; MIPS4-NOT: dclo
+
+; HAS-DCLO: dclo $2, $4
-; MIPS64: dclo $2, $4
 %neg = xor i64 %X, -1
 %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
 ret i64 %tmp1
@@ -161,8 +211,8 @@ entry:
 define i64 @f20(i64 %a, i64 %b) nounwind readnone {
 entry:
-; CHECK-LABEL: f20:
-; CHECK: nor
+; ALL-LABEL: f20:
+; ALL: nor $2, ${{[45]}}, ${{[45]}}
 %or = or i64 %b, %a
 %neg = xor i64 %or, -1
 ret i64 %neg
diff --git a/test/CodeGen/Mips/mips64load-store-left-right.ll b/test/CodeGen/Mips/mips64load-store-left-right.ll
deleted file mode 100644
index c9ba467e6c1b..000000000000
--- a/test/CodeGen/Mips/mips64load-store-left-right.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck -check-prefix=EL %s
-; RUN: llc -march=mips64 -mcpu=mips4 -mattr=n64 < %s | FileCheck -check-prefix=EB %s
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=EL %s
-; RUN: llc -march=mips64 -mcpu=mips64 -mattr=n64 < %s | FileCheck -check-prefix=EB %s
-
-%struct.SLL = type { i64 }
-%struct.SI = type { i32 }
-%struct.SUI = type { i32 }
-
-@sll = common global %struct.SLL zeroinitializer, align 1
-@si = common global %struct.SI zeroinitializer, align 1
-@sui = common global %struct.SUI zeroinitializer, align 1
-
-define i64 @foo_load_ll() nounwind readonly {
-entry:
-; EL: ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
-; EL: ldr $[[R0]], 0($[[R1]])
-; EB: ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: ldr $[[R0]], 7($[[R1]])
-
-  %0 = load i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
-  ret i64 %0
-}
-
-define i64 @foo_load_i() nounwind readonly {
-entry:
-; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: lwr $[[R0]], 0($[[R1]])
-; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: lwr $[[R0]], 3($[[R1]])
-
-  %0 = load i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
-  %conv = sext i32 %0 to i64
-  ret i64 %conv
-}
-
-define i64 @foo_load_ui() nounwind readonly {
-entry:
-; EL: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: lwr $[[R0]], 0($[[R1]])
-; EL: daddiu $[[R2:[0-9]+]], $zero, 1
-; EL: dsll $[[R3:[0-9]+]], $[[R2]], 32
-; EL: daddiu $[[R4:[0-9]+]], $[[R3]], -1
-; EL: and ${{[0-9]+}}, $[[R0]], $[[R4]]
-; EB: lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: lwr $[[R0]], 3($[[R1]])
-
-
-  %0 = load i32* getelementptr inbounds (%struct.SUI* @sui, i64 0, i32 0), align 1
-  %conv = zext i32 %0 to i64
-  ret i64 %conv
-}
-
-define void @foo_store_ll(i64 %a) nounwind {
-entry:
-; EL: sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
-; EL: sdr $[[R0]], 0($[[R1]])
-; EB: sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: sdr $[[R0]], 7($[[R1]])
-
-  store i64 %a, i64* getelementptr inbounds (%struct.SLL* @sll, i64 0, i32 0), align 1
-  ret void
-}
-
-define void @foo_store_i(i32 %a) nounwind {
-entry:
-; EL: swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
-; EL: swr $[[R0]], 0($[[R1]])
-; EB: swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
-; EB: swr $[[R0]], 3($[[R1]])
-
-  store i32 %a, i32* getelementptr inbounds (%struct.SI* @si, i64 0, i32 0), align 1
-  ret void
-}
-
diff --git a/test/CodeGen/Mips/mips64muldiv.ll b/test/CodeGen/Mips/mips64muldiv.ll
index 39c73e953555..32d05a9da369 100644
--- a/test/CodeGen/Mips/mips64muldiv.ll
+++ b/test/CodeGen/Mips/mips64muldiv.ll
@@ -1,50 +1,79 @@
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
+; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR
+
+; FileCheck prefixes:
+; ALL - All targets
+; ACC - Targets with accumulator based mul/div (i.e. pre-MIPS32r6)
+; GPR - Targets with register based mul/div (i.e. MIPS32r6)

 define i64 @m0(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: dmult
-; CHECK: mflo
+; ALL-LABEL: m0:
+; ACC: dmult ${{[45]}}, ${{[45]}}
+; ACC: mflo $2
+; GPR: dmul $2, ${{[45]}}, ${{[45]}}
 %mul = mul i64 %a1, %a0
 ret i64 %mul
 }

 define i64 @m1(i64 %a) nounwind readnone {
 entry:
-; CHECK: dmult
-; CHECK: mfhi
+; ALL-LABEL: m1:
+; ALL: lui $[[T0:[0-9]+]], 21845
+; ALL: addiu $[[T0]], $[[T0]], 21845
+; ALL: dsll $[[T0]], $[[T0]], 16
+; ALL: addiu $[[T0]], $[[T0]], 21845
+; ALL: dsll $[[T0]], $[[T0]], 16
+; ALL: addiu $[[T0]], $[[T0]], 21846
+
+; ACC: dmult $4, $[[T0]]
+; ACC: mfhi $[[T1:[0-9]+]]
+; GPR: dmuh $[[T1:[0-9]+]], $4, $[[T0]]
+
+; ALL: dsrl $2, $[[T1]], 63
+; ALL: daddu $2, $[[T1]], $2
 %div = sdiv i64 %a, 3
 ret i64 %div
 }

 define i64 @d0(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: ddivu
-; CHECK: mflo
+; ALL-LABEL: d0:
+; ACC: ddivu $zero, $4, $5
+; ACC: mflo $2
+; GPR: ddivu $2, $4, $5
 %div = udiv i64 %a0, %a1
 ret i64 %div
 }

 define i64 @d1(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: ddiv
-; CHECK: mflo
+; ALL-LABEL: d1:
+; ACC: ddiv $zero, $4, $5
+; ACC: mflo $2
+; GPR: ddiv $2, $4, $5
 %div = sdiv i64 %a0, %a1
 ret i64 %div
 }

 define i64 @d2(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: ddivu
-; CHECK: mfhi
+; ALL-LABEL: d2:
+; ACC: ddivu $zero, $4, $5
+; ACC: mfhi $2
+; GPR: dmodu $2, $4, $5
 %rem = urem i64 %a0, %a1
 ret i64 %rem
 }

 define i64 @d3(i64 %a0, i64 %a1) nounwind readnone {
 entry:
-; CHECK: ddiv
-; CHECK: mfhi
+; ALL-LABEL: d3:
+; ACC: ddiv $zero, $4, $5
+; ACC: mfhi $2
+; GPR: dmod $2, $4, $5
 %rem = srem i64 %a0, %a1
 ret i64 %rem
 }
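The mips64muldiv.ll update above relies on FileCheck's support for multiple -check-prefix options: every RUN line enables the shared ALL prefix plus one ISA-specific prefix (ACC or GPR), so expectations common to every configuration are written once and only the mul/div lowering differs per prefix. The standalone sketch below illustrates that pattern; the file and the function name mul_example are hypothetical and not part of this patch, and its checks simply mirror the dmult/mflo versus dmul output already asserted by @m0 above.

; Hypothetical illustration only; not part of this patch.
; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR

define i64 @mul_example(i64 %a, i64 %b) nounwind readnone {
entry:
; Both RUN lines match the shared ALL checks; only one of ACC/GPR is enabled per run.
; ALL-LABEL: mul_example:

; Pre-r6 targets multiply into the HI/LO accumulator and read the result back.
; ACC: dmult ${{[45]}}, ${{[45]}}
; ACC: mflo $2

; MIPS64r6 uses the three-operand, GPR-based multiply instead.
; GPR: dmul $2, ${{[45]}}, ${{[45]}}
  %mul = mul i64 %a, %b
  ret i64 %mul
}

A prefix that is not enabled on a given RUN line is simply ignored by FileCheck, which is what lets one function body serve all four RUN configurations in the real test.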
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index f4854f880542..244b03d2b573 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -1,33 +1,113 @@
-; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r2 \
-; RUN: < %s | FileCheck %s -check-prefix=LE-PIC
-; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 < %s | \
-; RUN: FileCheck %s -check-prefix=LE-STATIC
-; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 < %s | \
-; RUN: FileCheck %s -check-prefix=BE-PIC
+; Check that [sl]dc1 are normally emitted. MIPS32r2 should have [sl]dxc1 too.
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1-LDC1
 ; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | \
-; RUN: FileCheck %s -check-prefix=CHECK-LDC1-SDC1
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2-LDXC1
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6-LDC1
+
+; Check that -mno-ldc1-sdc1 disables [sl]dc1
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
+; RUN: -check-prefix=32R1-LE -check-prefix=32R1-LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
+; RUN: -check-prefix=32R2-LE -check-prefix=32R2-LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
+; RUN: -check-prefix=32R6-LE -check-prefix=32R6-LE-PIC
+
+; Check again for big-endian
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
+; RUN: -check-prefix=32R1-BE -check-prefix=32R1-BE-PIC
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
+; RUN: -check-prefix=32R2-BE -check-prefix=32R2-BE-PIC
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
+; RUN: -check-prefix=32R6-BE -check-prefix=32R6-BE-PIC
+
+; Check again for the static relocation model
+; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
+; RUN: -check-prefix=32R1-LE -check-prefix=32R1-LE-STATIC
+; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
+; RUN: -check-prefix=32R2-LE -check-prefix=32R2-LE-STATIC
+; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
+; RUN: -check-prefix=32R6-LE -check-prefix=32R6-LE-STATIC

 @g0 = common global double 0.000000e+00, align 8

-; LE-PIC-LABEL: test_ldc1:
-; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
-; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
-; LE-PIC-DAG: mtc1 $[[R0]], $f0
-; LE-PIC-DAG: mtc1 $[[R1]], $f1
-; LE-STATIC-LABEL: test_ldc1:
-; LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
-; LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
-; LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
-; LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
-; LE-STATIC-DAG: mtc1 $[[R1]], $f0
-; LE-STATIC-DAG: mtc1 $[[R3]], $f1
-; BE-PIC-LABEL: test_ldc1:
-; BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
-; BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
-; BE-PIC-DAG: mtc1 $[[R1]], $f0
-; BE-PIC-DAG: mtc1 $[[R0]], $f1
-; CHECK-LDC1-SDC1-LABEL: test_ldc1:
-; CHECK-LDC1-SDC1: ldc1 $f{{[0-9]+}}
+; ALL-LABEL: test_ldc1:
+
+; 32R1-LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-LE-PIC-DAG: mtc1 $[[R0]], $f0
+; 32R1-LE-PIC-DAG: mtc1 $[[R1]], $f1
+
+; 32R2-LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-LE-PIC-DAG: mtc1 $[[R0]], $f0
+; 32R2-LE-PIC-DAG: mthc1 $[[R1]], $f0
+
+; 32R6-LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-LE-PIC-DAG: mtc1 $[[R0]], $f0
+; 32R6-LE-PIC-DAG: mthc1 $[[R1]], $f0
+
+; 32R1-LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; 32R1-LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; 32R1-LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; 32R1-LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; 32R1-LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; 32R1-LE-STATIC-DAG: mtc1 $[[R3]], $f1
+
+; 32R2-LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; 32R2-LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; 32R2-LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; 32R2-LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; 32R2-LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; 32R2-LE-STATIC-DAG: mthc1 $[[R3]], $f0
+
+; 32R6-LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; 32R6-LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; 32R6-LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; 32R6-LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; 32R6-LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; 32R6-LE-STATIC-DAG: mthc1 $[[R3]], $f0
+
+; 32R1-BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-BE-PIC-DAG: mtc1 $[[R1]], $f0
+; 32R1-BE-PIC-DAG: mtc1 $[[R0]], $f1
+
+; 32R2-BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-BE-PIC-DAG: mtc1 $[[R1]], $f0
+; 32R2-BE-PIC-DAG: mthc1 $[[R0]], $f0
+
+; 32R6-BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-BE-PIC-DAG: mtc1 $[[R1]], $f0
+; 32R6-BE-PIC-DAG: mthc1 $[[R0]], $f0
+
+; 32R1-LDC1: ldc1 $f0, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: ldc1 $f0, 0(${{[0-9]+}})
+
+; 32R6-LDC1: ldc1 $f0, 0(${{[0-9]+}})

 define double @test_ldc1() {
 entry:
@@ -35,25 +115,64 @@ entry:
 ret double %0
 }

-; LE-PIC-LABEL: test_sdc1:
-; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
-; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
-; LE-STATIC-LABEL: test_sdc1:
-; LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
-; LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
-; LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
-; LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
-; BE-PIC-LABEL: test_sdc1:
-; BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
-; BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
-; CHECK-LDC1-SDC1-LABEL: test_sdc1:
-; CHECK-LDC1-SDC1: sdc1 $f{{[0-9]+}}
+; ALL-LABEL: test_sdc1:
+
+; 32R1-LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R1-LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R2-LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R2-LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R6-LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-LE-PIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R6-LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R1-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; 32R1-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; 32R1-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; 32R1-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
+
+; 32R2-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; 32R2-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; 32R2-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; 32R2-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
+
+; 32R6-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-LE-STATIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; 32R6-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; 32R6-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; 32R6-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
+
+; 32R1-BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; 32R1-BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
+
+; 32R2-BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; 32R2-BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
+
+; 32R6-BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-BE-PIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; 32R6-BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
+
+; 32R1-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+
+; 32R6-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})

 define void @test_sdc1(double %a) {
 entry:
@@ -61,14 +180,35 @@ entry:
 ret void
 }

+; ALL-LABEL: test_ldxc1:
+
+; 32R1-LE-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-LE-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-BE-DAG: lw $[[R0:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-BE-DAG: lw $[[R1:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-DAG: mtc1 $[[R0]], $f0
+; 32R1-DAG: mtc1 $[[R1]], $f1
+
+; 32R2-LE-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-LE-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-BE-DAG: lw $[[R0:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-BE-DAG: lw $[[R1:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-DAG: mtc1 $[[R0]], $f0
+; 32R2-DAG: mthc1 $[[R1]], $f0
-; LE-PIC-LABEL: test_ldxc1:
-; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
-; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
-; LE-PIC-DAG: mtc1 $[[R0]], $f0
-; LE-PIC-DAG: mtc1 $[[R1]], $f1
-; CHECK-LDC1-SDC1-LABEL: test_ldxc1:
-; CHECK-LDC1-SDC1: ldxc1 $f{{[0-9]+}}
+; 32R6-LE-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-LE-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-BE-DAG: lw $[[R0:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-BE-DAG: lw $[[R1:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-DAG: mtc1 $[[R0]], $f0
+; 32R6-DAG: mthc1 $[[R1]], $f0
+
+; 32R1-LDC1: ldc1 $f0, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: sll $[[OFFSET:[0-9]+]], $5, 3
+; 32R2-LDXC1: ldxc1 $f0, $[[OFFSET]]($4)
+
+; 32R6-LDC1: ldc1 $f0, 0(${{[0-9]+}})

 define double @test_ldxc1(double* nocapture readonly %a, i32 %i) {
 entry:
@@ -77,13 +217,29 @@ entry:
 ret double %0
 }

-; LE-PIC-LABEL: test_sdxc1:
-; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
-; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
-; CHECK-LDC1-SDC1-LABEL: test_sdxc1:
-; CHECK-LDC1-SDC1: sdxc1 $f{{[0-9]+}}
+; ALL-LABEL: test_sdxc1:
+
+; 32R1-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R1-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R2-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R2-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R6-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R6-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R1-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: sll $[[OFFSET:[0-9]+]], $7, 3
+; 32R2-LDXC1: sdxc1 $f{{[0-9]+}}, $[[OFFSET]]($6)
+
+; 32R6-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})

 define void @test_sdxc1(double %b, double* nocapture %a, i32 %i) {
 entry:
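The expanded RUN lines above push one function body through three CPUs, two endiannesses, and two relocation models, distinguished only by check prefixes. As a reduced illustration of the behaviour the -mno-ldc1-sdc1 flag toggles, the sketch below (hypothetical, not part of this patch, and limited to little-endian MIPS32r2 with PIC) shows a double load staying a single ldc1 by default but being split into two word loads plus mtc1/mthc1 when the flag is given; its check lines deliberately mirror the 32R2-LDXC1 and 32R2-LE-PIC expectations above.

; Hypothetical illustration only; not part of this patch.
; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | \
; RUN: FileCheck %s -check-prefix=LDC1
; RUN: llc -march=mipsel -mcpu=mips32r2 -relocation-model=pic -mno-ldc1-sdc1 < %s | \
; RUN: FileCheck %s -check-prefix=NO-LDC1

@g1 = common global double 0.000000e+00, align 8

define double @read_g1() {
entry:
; By default the 64-bit FP load is a single instruction.
; LDC1: ldc1 $f0, 0(${{[0-9]+}})

; With -mno-ldc1-sdc1 the value is loaded as two words and moved into $f0.
; NO-LDC1-DAG: lw $[[LO:[0-9]+]], 0(${{[0-9]+}})
; NO-LDC1-DAG: lw $[[HI:[0-9]+]], 4(${{[0-9]+}})
; NO-LDC1-DAG: mtc1 $[[LO]], $f0
; NO-LDC1-DAG: mthc1 $[[HI]], $f0
  %0 = load double* @g1, align 8
  ret double %0
}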
diff --git a/test/CodeGen/Mips/prevent-hoisting.ll b/test/CodeGen/Mips/prevent-hoisting.ll
new file mode 100644
index 000000000000..da665c210909
--- /dev/null
+++ b/test/CodeGen/Mips/prevent-hoisting.ll
@@ -0,0 +1,144 @@
+; RUN: llc -march=mipsel -O3 < %s | FileCheck %s
+
+
+; MIPS direct branches implicitly define register $at. This test makes sure that
+; the code hoisting optimization (which moves identical instructions from the start
+; of two basic blocks to their common predecessor block) takes this into account and
+; doesn't move the definition of $at to the predecessor block (which would make $at
+; live-in at the start of the successor block).
+
+
+; CHECK-LABEL: readLumaCoeff8x8_CABAC
+
+; The check for the "addiu" instruction is added so that we can match the correct "b" instruction.
+; CHECK: addiu ${{[0-9]+}}, $zero, -1
+; CHECK: b $[[BB0:BB[0-9_]+]]
+
+; Check that the sll instruction that writes to $1 starts a basic block.
+; CHECK: {{BB[0-9_#]+}}:
+; CHECK-NEXT: sll $1, $[[R0:[0-9]+]], 4
+
+; Check that an identical sll instruction starts another basic block.
+; CHECK: [[BB0]]: +; CHECK-NEXT: sll $1, $[[R0]], 4 + + +%struct.img_par = type { i32, i32, i32, i32, i32*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [16 x [16 x i16]], [6 x [32 x i32]], [16 x [16 x i32]], [4 x [12 x [4 x [4 x i32]]]], [16 x i32], i8**, i32*, i32***, i32**, i32, i32, i32, i32, %struct.Slice*, %struct.macroblock*, i32, i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32***, i32***, i32****, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [3 x [2 x i32]], i32, i32, i32, i32, %struct.timeb, %struct.timeb, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } +%struct.Slice = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.datapartition*, %struct.MotionInfoContexts*, %struct.TextureInfoContexts*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (%struct.img_par*, %struct.inp_par*)*, i32, i32, i32, i32 } +%struct.datapartition = type { %struct.Bitstream*, %struct.DecodingEnvironment, i32 (%struct.syntaxelement*, %struct.img_par*, %struct.datapartition*)* } +%struct.Bitstream = type { i32, i32, i32, i32, i8*, i32 } +%struct.DecodingEnvironment = type { i32, i32, i32, i32, i32, i8*, i32* } +%struct.syntaxelement = type { i32, i32, i32, i32, i32, i32, i32, i32, void (i32, i32, i32*, i32*)*, void (%struct.syntaxelement*, %struct.img_par*, %struct.DecodingEnvironment*)* } +%struct.MotionInfoContexts = type { [4 x [11 x %struct.BiContextType]], [2 x [9 x %struct.BiContextType]], [2 x [10 x %struct.BiContextType]], [2 x [6 x %struct.BiContextType]], [4 x %struct.BiContextType], [4 x %struct.BiContextType], [3 x %struct.BiContextType] } +%struct.BiContextType = type { i16, i8 } +%struct.TextureInfoContexts = type { [2 x %struct.BiContextType], [4 x %struct.BiContextType], [3 x [4 x %struct.BiContextType]], [10 x [4 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]], [10 x [5 x %struct.BiContextType]], [10 x [5 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]] } +%struct.inp_par = type { [1000 x i8], [1000 x i8], [1000 x i8], i32, i32, i32, i32, i32, i32, i32, i32 } +%struct.macroblock = type { i32, [2 x i32], i32, i32, %struct.macroblock*, %struct.macroblock*, i32, [2 x [4 x [4 x [2 x i32]]]], i32, i64, i64, i32, i32, [4 x i8], [4 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } +%struct.DecRefPicMarking_s = type { i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s* } +%struct.timeb = type { i32, i16, i16, i16 } + +@assignSE2partition = external global [0 x [20 x i32]] +@FIELD_SCAN8x8 = external constant [64 x [2 x i8]] + + +define void @readLumaCoeff8x8_CABAC(%struct.img_par* %img, i32 %b8) { + + %1 = load i32* undef, align 4 + br i1 false, label %2, label %3 + +;